
/*--------------------------------------------------------------------*/
/*--- begin                                       guest_x86_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2015 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Translates x86 code to IR. */

/* TODO:

   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
   to ensure a 32-bit value is being written.

   FUCOMI(P): what happens to A and S flags?  Currently they are
      forced to zero.

   x87 FP Limitations:

   * all arithmetic done at 64 bits

   * no FP exceptions, except for handling stack over/underflow

   * FP rounding mode observed only for float->int conversions
     and int->float conversions which could lose accuracy, and
     for float-to-float rounding.  For all other operations,
     round-to-nearest is used, regardless.

   * some of the FCOM cases could do with testing -- not convinced
     that the args are the right way round.

   * FSAVE does not re-initialise the FPU; it should do

   * FINIT not only initialises the FPU environment, it also
     zeroes all the FP registers.  It should leave the registers
     unchanged.

   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
   only way to observe eflags[1], a proper fix would be to make that
   bit be set by PUSHF.

   The state of %eflags.AC (alignment check, bit 18) is recorded by
   the simulation (viz, if you set it with popf then a pushf produces
   the value you set it to), but it is otherwise ignored.  In
   particular, setting it to 1 does NOT cause alignment checking to
   happen.  Programs that set it to 1 and then rely on the resulting
   SIGBUSs to inform them of misaligned accesses will not work.

   Implementation of sysenter is necessarily partial.  sysenter is a
   kind of system call entry.  When doing a sysenter, the return
   address is not known -- that is something that is beyond Vex's
   knowledge.  So the generated IR forces a return to the scheduler,
   which can do what it likes to simulate the sysenter, but it MUST
   set this thread's guest_EIP field with the continuation address
   before resuming execution.  If that doesn't happen, the thread will
   jump to address zero, which is probably fatal.

   This module uses global variables and so is not MT-safe (if that
   should ever become relevant).

   The delta values are 32-bit ints, not 64-bit ints.  That means
   this module may not work right if run on a 64-bit host.  That should
   be fixed properly, really -- if anyone ever wants to use Vex to
   translate x86 code for execution on a 64-bit host.

   casLE (implementation of lock-prefixed insns) and rep-prefixed
   insns: the side-exit back to the start of the insn is done with
   Ijk_Boring.  This is quite wrong; it should be done with
   Ijk_NoRedir, since otherwise the side exit, which is intended to
   restart the instruction for whatever reason, could go somewhere
   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
   no-redir jumps performance critical, at least for rep-prefixed
   instructions, since all iterations thereof would involve such a
   jump.  It's not such a big deal with casLE since the side exit is
   only taken if the CAS fails, that is, the location is contended,
   which is relatively unlikely.

   XXXX: Nov 2009: handling of SWP on ARM suffers from the same
   problem.

   Note also, the test for CAS success vs failure is done using
   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
   shouldn't definedness-check these comparisons.  See
   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
   background/rationale.
*/

/* Performance holes:

   - fcom ; fstsw %ax ; sahf
     sahf does not update the O flag (sigh) and so O needs to
     be computed.  This is done expensively; it would be better
     to have a calculate_eflags_o helper.

   - emwarns; some FP codes can generate huge numbers of these
     if the fpucw is changed in an inner loop.  It would be
     better for the guest state to have an emwarn-enable reg
     which can be set zero or nonzero.  If it is zero, emwarns
     are not flagged, and instead control just flows all the
     way through bbs as usual.
*/

/* "Special" instructions.

   This instruction decoder can decode four special instructions
   which mean nothing natively (are no-ops as far as regs/mem are
   concerned) but have meaning for supporting Valgrind.  A special
   instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
   C1C713 (in the standard interpretation, that means: roll $3, %edi;
   roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
   one of the following 4 is allowed (standard interpretation in
   parentheses):

      87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
      87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
      87D2 (xchgl %edx,%edx)   call-noredir *%EAX
      87FF (xchgl %edi,%edi)   IR injection

   Any other bytes following the 12-byte preamble are illegal and
   constitute a failure in instruction decoding.  This all assumes
   that the preamble will never occur except in specific code
   fragments designed for Valgrind to catch.

   No prefixes may precede a "Special" instruction.
*/
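
/* For illustration only (not used by this decoder): client code, for
   example the macros in valgrind.h, emits the preamble plus a marker
   with inline assembly along these lines.  This is a sketch; the
   exact operand constraints are omitted and are an assumption here.

      __asm__ volatile(
         "roll $3,  %%edi ; roll $13, %%edi\n\t"
         "roll $29, %%edi ; roll $19, %%edi\n\t"   // 12-byte preamble
         "xchgl %%ebx,%%ebx"                       // 87DB: client request
         : ... );
*/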

/* LOCK prefixed instructions.  These are translated using IR-level
   CAS statements (IRCAS) and are believed to preserve atomicity, even
   from the point of view of some other process racing against a
   simulated one (presumably they communicate via a shared memory
   segment).

   Handlers which are aware of LOCK prefixes are:
      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
      dis_cmpxchg_G_E  (cmpxchg)
      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
      dis_Grp3         (not, neg)
      dis_Grp4         (inc, dec)
      dis_Grp5         (inc, dec)
      dis_Grp8_Imm     (bts, btc, btr)
      dis_bt_G_E       (bts, btc, btr)
      dis_xadd_G_E     (xadd)
*/


#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_x86.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_generic_x87.h"
#include "guest_x86_defs.h"


/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an insn, right
   down in disInstr_X86, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* We need to know this to do sub-register accesses correctly. */
static VexEndness host_endness;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static const UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr32 guest_EIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr32 guest_EIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* Whether or not we are in protected mode. */
static Bool protected_mode;

/* The address size of the instruction.
 * By default it is 4 in protected mode and 2 in real mode.
 * A 0x67 prefix toggles it.
 */
static Int current_sz_addr;

/* The data (operand) size of the instruction.
 * By default it is 4 in protected mode and 2 in real mode.
 * A 0x66 prefix toggles it.
 */
static Int current_sz_data;


/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

#ifndef _MSC_VER
#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)
#else
#define DIP(format, ...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, __VA_ARGS__)

#define DIS(buf, format, ...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, __VA_ARGS__)
#endif
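
/* Illustrative only: the trace macros above are used from the insn
   decoders roughly like this (a hypothetical fragment, not a real
   decode case):

      HChar dis_buf[50];
      DIS(dis_buf, "%s(%s)", sorbTxt(sorb), nameIReg(4, rm));
      DIP("mov%c %s,%s\n", nameISize(sz), dis_buf, nameIReg(sz, reg));

   Output appears only when VEX_TRACE_FE is set in vex_traceflags. */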



/*------------------------------------------------------------*/
/*--- Offsets of various parts of the x86 guest state.     ---*/
/*------------------------------------------------------------*/

#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)

#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)

#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)

#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)

#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)

#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)

#define OFFB_EMNOTE    offsetof(VexGuestX86State,guest_EMNOTE)

#define OFFB_CMSTART   offsetof(VexGuestX86State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestX86State,guest_CMLEN)
#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)

#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- x86 insn stream.                                     ---*/
/*------------------------------------------------------------*/

/* This is the Intel register encoding -- integer regs. */
#define R_EAX 0
#define R_ECX 1
#define R_EDX 2
#define R_EBX 3
#define R_ESP 4
#define R_EBP 5
#define R_ESI 6
#define R_EDI 7

#define R_AL (0+R_EAX)
#define R_AH (4+R_EAX)

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Various simple conversions */

static UInt extend_s_8to32( UInt x )
{
   return (UInt)((Int)(x << 24) >> 24);
}

static UInt extend_s_16to32 ( UInt x )
{
  return (UInt)((Int)(x << 16) >> 16);
}
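
/* Worked example (for clarity only, not used by the code): for the
   byte 0xFE (-2), extend_s_8to32(0xFE) first shifts it up to
   0xFE000000, and the arithmetic right shift by 24 then smears the
   sign bit, giving 0xFFFFFFFE.  A non-negative byte such as 0x7F
   comes back unchanged as 0x0000007F. */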

/* Fetch a byte from the guest insn stream. */
static UChar getIByte ( Int delta )
{
   return guest_code[delta];
}

/* Extract the reg field from a modRM byte. */
static Int gregOfRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register or memory.  If they refer to a register, the byte will
   have the form 11XXXYYY, where YYY is the register number. */
static Bool epartIsReg ( UChar mod_reg_rm )
{
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}

/* ... and extract the register number ... */
static Int eregOfRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}
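
/* For reference (illustrative only): a modRM byte is laid out as
   mod(2 bits) : reg(3 bits) : rm(3 bits).  For example, the byte
   0xD8 (11 011 000) has mod=3, so epartIsReg() is True;
   gregOfRM(0xD8) == 3 (%ebx as the reg operand) and
   eregOfRM(0xD8) == 0 (%eax as the r/m operand). */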

/* Get an 8/16/32-bit unsigned value out of the insn stream. */

static UChar getUChar ( Int delta )
{
   UChar v = guest_code[delta+0];
   return toUChar(v);
}

static UInt getUDisp16 ( Int delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}

static UInt getUDisp32 ( Int delta )
{
   UInt v = guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v;
}
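
/* Worked example (illustrative only): if the four guest bytes at
   'delta' are 78 56 34 12, getUDisp32(delta) assembles them
   least-significant-byte-first into 0x12345678, which is how x86
   encodes a 32-bit displacement or immediate. */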

static UInt getUDisp ( Int size, Int delta )
{
   switch (size) {
      case 4: return getUDisp32(delta);
      case 2: return getUDisp16(delta);
      case 1: return (UInt)getUChar(delta);
      default: vpanic("getUDisp(x86)");
   }
   return 0; /*notreached*/
}


/* Get a byte value out of the insn stream and sign-extend to 32
   bits. */
static UInt getSDisp8 ( Int delta )
{
   return extend_s_8to32( (UInt) (guest_code[delta]) );
}

static UInt getSDisp16 ( Int delta0 )
{
   const UChar* eip = &guest_code[delta0];
   UInt d = *eip++;
   d |= ((*eip++) << 8);
   return extend_s_16to32(d);
}

static UInt getSDisp ( Int size, Int delta )
{
   switch (size) {
      case 4: return getUDisp32(delta);
      case 2: return getSDisp16(delta);
      case 1: return getSDisp8(delta);
      default: vpanic("getSDisp(x86)");
  }
  return 0; /*notreached*/
}


/*------------------------------------------------------------*/
/*--- Helpers for constructing IR.                         ---*/
/*------------------------------------------------------------*/

/* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
   register references, we need to take the host endianness into
   account.  Supplied value is 0 .. 7 and in the Intel instruction
   encoding. */

static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      default: vpanic("szToITy(x86)");
   }
}

/* On a little-endian host, less significant bits of the guest
   registers are at lower addresses.  Therefore, a reference to the
   low half of a register has the same guest state offset as a
   reference to the full register.
*/
static Int integerGuestRegOffset ( Int sz, UInt archreg )
{
   vassert(archreg < 8);

   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);

   if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
      switch (archreg) {
         case R_EAX: return OFFB_EAX;
         case R_EBX: return OFFB_EBX;
         case R_ECX: return OFFB_ECX;
         case R_EDX: return OFFB_EDX;
         case R_ESI: return OFFB_ESI;
         case R_EDI: return OFFB_EDI;
         case R_ESP: return OFFB_ESP;
         case R_EBP: return OFFB_EBP;
         default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
      }
   }

   vassert(archreg >= 4 && archreg < 8 && sz == 1);
   switch (archreg-4) {
      case R_EAX: return 1+ OFFB_EAX;
      case R_EBX: return 1+ OFFB_EBX;
      case R_ECX: return 1+ OFFB_ECX;
      case R_EDX: return 1+ OFFB_EDX;
      default: vpanic("integerGuestRegOffset(x86,le)(1h)");
   }

   /* NOTREACHED */
   vpanic("integerGuestRegOffset(x86,le)");
}
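
/* Example of the layout this relies on (illustrative only): on a
   little-endian host, a 1-byte read of %al uses the same offset as
   %eax (OFFB_EAX), a 2-byte read of %ax likewise, and a 1-byte read
   of %ah (archreg 4 with sz == 1) uses OFFB_EAX + 1, i.e. the second
   byte of the 32-bit guest_EAX slot. */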

static Int segmentGuestRegOffset ( UInt sreg )
{
   switch (sreg) {
      case R_ES: return OFFB_ES;
      case R_CS: return OFFB_CS;
      case R_SS: return OFFB_SS;
      case R_DS: return OFFB_DS;
      case R_FS: return OFFB_FS;
      case R_GS: return OFFB_GS;
      default: vpanic("segmentGuestRegOffset(x86)");
   }
}

static Int xmmGuestRegOffset ( UInt xmmreg )
{
   switch (xmmreg) {
      case 0: return OFFB_XMM0;
      case 1: return OFFB_XMM1;
      case 2: return OFFB_XMM2;
      case 3: return OFFB_XMM3;
      case 4: return OFFB_XMM4;
      case 5: return OFFB_XMM5;
      case 6: return OFFB_XMM6;
      case 7: return OFFB_XMM7;
      default: vpanic("xmmGuestRegOffset");
   }
}

/* Lanes of vector registers are always numbered from zero being the
   least significant lane (rightmost in the register).  */

static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}

static IRExpr* getIReg ( Int sz, UInt archreg )
{
   vassert(sz == 1 || sz == 2 || sz == 4);
   vassert(archreg < 8);
   return IRExpr_Get( integerGuestRegOffset(sz,archreg),
                      szToITy(sz) );
}

/* Ditto, but write to a reg instead. */
static void putIReg ( Int sz, UInt archreg, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   switch (sz) {
      case 1: vassert(ty == Ity_I8); break;
      case 2: vassert(ty == Ity_I16); break;
      case 4: vassert(ty == Ity_I32); break;
      default: vpanic("putIReg(x86)");
   }
   vassert(archreg < 8);
   stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
}

static IRExpr* getSReg ( UInt sreg )
{
   return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
}

static void putSReg ( UInt sreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
}

static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
}

static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* mkU8 ( UInt i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkU16 ( UInt i )
{
   vassert(i < 65536);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

static IRExpr* mkU32 ( UInt i )
{
   return IRExpr_Const(IRConst_U32(i));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU ( IRType ty, UInt i )
{
   if (ty == Ity_I8)  return mkU8(i);
   if (ty == Ity_I16) return mkU16(i);
   if (ty == Ity_I32) return mkU32(i);
   /* If this panics, it usually means you passed a size (1,2,4)
      value as the IRType, rather than a real IRType. */
   vpanic("mkU(x86)");
}

static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}

static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   Int adj;
   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_ExpCmpNE8
           || op8 == Iop_Not8);
   adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   return adj + op8;
}
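
/* Illustrative note (not part of the code): mkSizedOp relies on the
   8/16/32-bit variants of each listed IROp being consecutive in the
   IROp enumeration, so e.g. mkSizedOp(Ity_I16, Iop_Add8) == Iop_Add16
   and mkSizedOp(Ity_I32, Iop_Xor8) == Iop_Xor32.  If that enumeration
   ordering ever changed, this trick would silently break. */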

static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
{
   if (szSmall == 1 && szBig == 4) {
      return signd ? Iop_8Sto32 : Iop_8Uto32;
   }
   if (szSmall == 1 && szBig == 2) {
      return signd ? Iop_8Sto16 : Iop_8Uto16;
   }
   if (szSmall == 2 && szBig == 4) {
      return signd ? Iop_16Sto32 : Iop_16Uto32;
   }
   vpanic("mkWidenOp(x86,guest)");
}

static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_32to1,
               binop(Iop_And32,
                     unop(Iop_1Uto32,x),
                     unop(Iop_1Uto32,y)));
}

/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, then transfer control (with a
   no-redir jump (XXX no -- see comment at top of this file)) to
   'restart_point', which is presumably the address of the guest
   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr32 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   vassert(tyE == tyN);
   vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U32( restart_point ),
            OFFB_EIP
         ));
}
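
/* Sketch of how the decoders use casLE for a LOCK-prefixed
   read-modify-write insn (illustrative pseudocode only, assuming a
   hypothetical 32-bit "lock addl %reg,(mem)" case; the real handling
   lives in dis_op2_G_E and friends):

      IRTemp addr = ...;   // effective address, from the amode decoder
      IRTemp regv = newTemp(Ity_I32);
      IRTemp oldv = newTemp(Ity_I32);
      IRTemp newv = newTemp(Ity_I32);
      assign(regv, getIReg(4, reg));
      assign(oldv, loadLE(Ity_I32, mkexpr(addr)));
      assign(newv, binop(Iop_Add32, mkexpr(oldv), mkexpr(regv)));
      casLE(mkexpr(addr), mkexpr(oldv), mkexpr(newv),
            guest_EIP_curr_instr);          // re-run the insn on failure
      setFlags_DEP1_DEP2(Iop_Add8, oldv, regv, Ity_I32);

   The IRCAS only commits the store if memory still holds 'oldv';
   otherwise the side exit restarts the instruction. */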


/*------------------------------------------------------------*/
/*--- Helpers for %eflags.                                 ---*/
/*------------------------------------------------------------*/

/* -------------- Evaluating the flags-thunk. -------------- */

/* Build IR to calculate all the eflags from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I32. */
static IRExpr* mk_x86g_calculate_eflags_all ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I32,
           0/*regparm*/,
           "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}

/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_Bit. */
static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
{
   IRExpr** args
      = mkIRExprVec_5( mkU32(cond),
                       IRExpr_Get(OFFB_CC_OP,  Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I32,
           0/*regparm*/,
           "x86g_calculate_condition", &x86g_calculate_condition,
           args
        );
   /* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   return unop(Iop_32to1, call);
}

/* Build IR to calculate just the carry flag from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
static IRExpr* mk_x86g_calculate_eflags_c ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I32,
           3/*regparm*/,
           "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}


/* -------------- Building the flags-thunk. -------------- */

/* The machinery in this section builds the flag-thunk following a
   flag-setting operation.  Hence the various setFlags_* functions.
*/
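
/* Worked example (illustrative only): after translating
   "addl %ebx,%eax", the thunk ends up holding CC_OP =
   X86G_CC_OP_ADDL, CC_DEP1 = the old %eax value, CC_DEP2 = the %ebx
   value, and CC_NDEP = 0.  A later conditional branch then evaluates
   mk_x86g_calculate_condition() on those four fields rather than on
   a materialised %eflags value. */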

static Bool isAddSub ( IROp op8 )
{
   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
}

static Bool isLogic ( IROp op8 )
{
   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
}

/* U-widen 8/16/32 bit int expr to 32. */
static IRExpr* widenUto32 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I32: return e;
      case Ity_I16: return unop(Iop_16Uto32,e);
      case Ity_I8:  return unop(Iop_8Uto32,e);
      default: vpanic("widenUto32");
   }
}

/* S-widen 8/16/32 bit int expr to 32. */
static IRExpr* widenSto32 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I32: return e;
      case Ity_I16: return unop(Iop_16Sto32,e);
      case Ity_I8:  return unop(Iop_8Sto32,e);
      default: vpanic("widenSto32");
   }
}

/* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
   of these combinations make sense. */
static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
{
   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   if (src_ty == dst_ty)
      return e;
   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
      return unop(Iop_32to16, e);
   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
      return unop(Iop_32to8, e);

   vex_printf("\nsrc, dst tys are: ");
   ppIRType(src_ty);
   vex_printf(", ");
   ppIRType(dst_ty);
   vex_printf("\n");
   vpanic("narrowTo(x86)");
}


/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   auto-sized up to the real op. */

static
void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
{
   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);

   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);

   switch (op8) {
      case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
      case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
      default:       ppIROp(op8);
                     vpanic("setFlags_DEP1_DEP2(x86)");
   }
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
}


/* Set the OP and DEP1 fields only, and write zero to DEP2. */

static
void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
{
   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);

   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);

   switch (op8) {
      case Iop_Or8:
      case Iop_And8:
      case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
      default:       ppIROp(op8);
                     vpanic("setFlags_DEP1(x86)");
   }
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
}

/* For shift operations, we put in the result and the undershifted
   result.  Except that if the shift amount is zero, the thunk is
   left unchanged. */

static void setFlags_DEP1_DEP2_shift ( IROp    op32,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);

   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op32) {
      case Iop_Shr32:
      case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
      case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
      default:        ppIROp(op32);
                      vpanic("setFlags_DEP1_DEP2_shift(x86)");
   }

   /* guard :: Ity_I8.  We need to convert it to I1. */
   IRTemp guardB = newTemp(Ity_I1);
   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );

   /* DEP1 contains the result, DEP2 contains the undershifted value. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU32(ccOp),
                                 IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto32(mkexpr(res)),
                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto32(mkexpr(resUS)),
                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU32(0),
                                 IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
}


/* For the inc/dec case, we store in DEP1 the result value and in NDEP
   the former value of the carry flag, which unfortunately we have to
   compute. */

static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;

   ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
}


/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   two arguments. */

static
void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
{
   switch (ty) {
      case Ity_I8:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
         break;
      case Ity_I16:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
         break;
      case Ity_I32:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
         break;
      default:
         vpanic("setFlags_MUL(x86)");
   }
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
}


/* -------------- Condition codes. -------------- */

/* Condition codes, using the Intel encoding.  */

static const HChar* name_X86Condcode ( X86Condcode cond )
{
   switch (cond) {
      case X86CondO:      return "o";
      case X86CondNO:     return "no";
      case X86CondB:      return "b";
      case X86CondNB:     return "nb";
      case X86CondZ:      return "z";
      case X86CondNZ:     return "nz";
      case X86CondBE:     return "be";
      case X86CondNBE:    return "nbe";
      case X86CondS:      return "s";
      case X86CondNS:     return "ns";
      case X86CondP:      return "p";
      case X86CondNP:     return "np";
      case X86CondL:      return "l";
      case X86CondNL:     return "nl";
      case X86CondLE:     return "le";
      case X86CondNLE:    return "nle";
      case X86CondAlways: return "ALWAYS";
      default: vpanic("name_X86Condcode");
   }
}

static
X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
                                      Bool*        needInvert )
{
   vassert(cond >= X86CondO && cond <= X86CondNLE);
   if (cond & 1) {
      *needInvert = True;
      return cond-1;
   } else {
      *needInvert = False;
      return cond;
   }
}
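
/* Illustrative note: this works because the Intel condition-code
   encoding pairs each condition with its negation, with the negated
   form at the odd number.  For example X86CondZ is 4 and X86CondNZ
   is 5, so positiveIse_X86Condcode(X86CondNZ, &inv) returns X86CondZ
   and sets inv to True. */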


/* -------------- Helpers for ADD/SUB with carry. -------------- */

/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   appropriately.

   Optionally, generate a store for the 'tres' value.  This can either
   be a normal store, or it can be a cas-with-possible-failure style
   store:

   if taddr is IRTemp_INVALID, then no store is generated.

   if taddr is not IRTemp_INVALID, then a store (using taddr as
   the address) is generated:

     if texpVal is IRTemp_INVALID then a normal store is
     generated, and restart_point must be zero (it is irrelevant).

     if texpVal is not IRTemp_INVALID then a cas-style store is
     generated.  texpVal is the expected value, restart_point
     is the restart point if the store fails, and texpVal must
     have the same type as tres.
*/
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I32);
   IRTemp  oldcn = newTemp(ty);
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   vassert(sz == 1 || sz == 2 || sz == 4);
   thunkOp = sz==4 ? X86G_CC_OP_ADCL
                   : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And32,
                        mk_x86g_calculate_eflags_c(),
                        mkU32(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}


/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   appropriately.  As with helper_ADC, possibly generate a store of
   the result -- see comments on helper_ADC for details.
*/
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I32);
   IRTemp  oldcn = newTemp(ty);
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   vassert(sz == 1 || sz == 2 || sz == 4);
   thunkOp = sz==4 ? X86G_CC_OP_SBBL
                   : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And32,
                       mk_x86g_calculate_eflags_c(),
                       mkU32(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}


/* -------------- Helpers for disassembly printing. -------------- */

static const HChar* nameGrp1 ( Int opc_aux )
{
   static const HChar* grp1_names[8]
     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   return grp1_names[opc_aux];
}

static const HChar* nameGrp2 ( Int opc_aux )
{
   static const HChar* grp2_names[8]
     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   return grp2_names[opc_aux];
}

static const HChar* nameGrp4 ( Int opc_aux )
{
   static const HChar* grp4_names[8]
     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   return grp4_names[opc_aux];
}

static const HChar* nameGrp5 ( Int opc_aux )
{
   static const HChar* grp5_names[8]
     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   return grp5_names[opc_aux];
}

static const HChar* nameGrp8 ( Int opc_aux )
{
   static const HChar* grp8_names[8]
     = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   return grp8_names[opc_aux];
}

static const HChar* nameIReg ( Int size, Int reg )
{
   static const HChar* ireg32_names[8]
     = { "%eax", "%ecx", "%edx", "%ebx",
         "%esp", "%ebp", "%esi", "%edi" };
   static const HChar* ireg16_names[8]
     = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   static const HChar* ireg8_names[8]
     = { "%al", "%cl", "%dl", "%bl",
         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   if (reg < 0 || reg > 7) goto bad;
   switch (size) {
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: return ireg8_names[reg];
   }
  bad:
   vpanic("nameIReg(X86)");
   return NULL; /*notreached*/
}

static const HChar* nameSReg ( UInt sreg )
{
   switch (sreg) {
      case R_ES: return "%es";
      case R_CS: return "%cs";
      case R_SS: return "%ss";
      case R_DS: return "%ds";
      case R_FS: return "%fs";
      case R_GS: return "%gs";
      default: vpanic("nameSReg(x86)");
   }
}

static const HChar* nameMMXReg ( Int mmxreg )
{
   static const HChar* mmx_names[8]
     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   return mmx_names[mmxreg];
}

static const HChar* nameXMMReg ( Int xmmreg )
{
   static const HChar* xmm_names[8]
     = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
         "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   return xmm_names[xmmreg];
}

static const HChar* nameMMXGran ( Int gran )
{
   switch (gran) {
      case 0: return "b";
      case 1: return "w";
      case 2: return "d";
      case 3: return "q";
      default: vpanic("nameMMXGran(x86,guest)");
   }
}

static HChar nameISize ( Int size )
{
   switch (size) {
      case 4: return 'l';
      case 2: return 'w';
      case 1: return 'b';
      default: vpanic("nameISize(x86)");
   }
}


/*------------------------------------------------------------*/
/*--- JMP helpers                                          ---*/
/*------------------------------------------------------------*/

static void jmp_lit( /*MOD*/DisResult* dres,
                     IRJumpKind kind, Addr32 d32 )
{
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
}

static void jmp_treg( /*MOD*/DisResult* dres,
                      IRJumpKind kind, IRTemp t )
{
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
}

static
void jcc_01( /*MOD*/DisResult* dres,
             X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
{
   Bool        invert;
   X86Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_X86Condcode ( cond, &invert );
   if (invert) {
      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U32(d32_false),
                         OFFB_EIP ) );
      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
   } else {
      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U32(d32_true),
                         OFFB_EIP ) );
      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
   }
}
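
/* Worked example (illustrative only): for a "jz label" at guest
   address A with a 2-byte encoding, the decoder would call
   jcc_01(dres, X86CondZ, A+2, label).  X86CondZ is even, so no
   inversion happens: the block side-exits to 'label' when the Z
   condition holds, and otherwise falls through by putting A+2 into
   %EIP. */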


/*------------------------------------------------------------*/
/*--- Disassembling addressing modes                       ---*/
/*------------------------------------------------------------*/

static
const HChar* sorbTxt ( UChar sorb )
{
   switch (sorb) {
      case 0:    return ""; /* no override */
      case 0x3E: return "%ds:";
      case 0x26: return "%es:";
      case 0x64: return "%fs:";
      case 0x65: return "%gs:";
      case 0x2e: return "%cs:";
      case 0x36: return "%ss:";
      default: vpanic("sorbTxt(x86,guest)");
   }
}


1445 static
handleSegOverrideAux(IRTemp seg_selector,IRExpr * virtual)1446 IRExpr* handleSegOverrideAux ( IRTemp seg_selector, IRExpr* virtual )
1447 {
1448    IRTemp ldt_ptr, gdt_ptr, r64;
1449 
1450    ldt_ptr      = newTemp(Ity_I64);
1451    gdt_ptr      = newTemp(Ity_I64);
1452    r64          = newTemp(Ity_I64);
1453 
1454    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, Ity_I64 ));
1455    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, Ity_I64 ));
1456 
1457    /*
1458    Call this to do the translation and limit checks:
1459    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
1460                                  UInt seg_selector, UInt virtual_addr )
1461    */
1462    assign(
1463       r64,
1464       mkIRExprCCall(
1465          Ity_I64,
1466          0/*regparms*/,
1467          "x86g_use_seg_selector",
1468          &x86g_use_seg_selector,
1469          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
1470                         mkexpr(seg_selector), virtual)
1471       )
1472    );
1473 
1474    /* If the high 32 of the result are non-zero, there was a
1475       failure in address translation.  In which case, make a
1476       quick exit.
1477    */
1478    stmt(
1479       IRStmt_Exit(
1480          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
1481          Ijk_MapFail,
1482          IRConst_U32( guest_EIP_curr_instr ),
1483          OFFB_EIP
1484       )
1485    );
1486 
1487    /* otherwise, here's the translated result. */
1488    return unop(Iop_64to32, mkexpr(r64));
1489 }
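
/* Worked example for the helper above (illustrative values only): if r64
   comes back as 0x0000000008049F00, its upper 32 bits are zero, so the
   Ijk_MapFail side-exit is not taken and the linear address 0x08049F00 is
   returned; any non-zero upper half instead forces the side-exit. */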
1490 
1491 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
1492    linear address by adding any required segment override as indicated
1493    by sorb. */
1494 static
1495 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
1496 {
1497    Int    sreg;
1498    IRTemp seg_selector;
1499 
1500    if (sorb == 0)
1501       /* the common case - no override */
1502       return virtual;
1503 
1504    switch (sorb) {
1505       case 0x3E: sreg = R_DS; break;
1506       case 0x26: sreg = R_ES; break;
1507       case 0x64: sreg = R_FS; break;
1508       case 0x65: sreg = R_GS; break;
1509       case 0x2E: sreg = R_CS; break;
1510       case 0x36: sreg = R_SS; break;
1511       default: vpanic("handleSegOverride(x86,guest)");
1512    }
1513 
1514 
1515    seg_selector = newTemp(Ity_I32);
1516    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
1517 
1518    return handleSegOverrideAux(seg_selector, virtual);
1519 }
1520 
1521 
1522 /* Generate IR to calculate an address indicated by a ModRM and
1523    following SIB bytes.  The expression, and the number of bytes in
1524    the address mode, are returned.  Note that this fn should not be
1525    called if the R/M part of the address denotes a register instead of
1526    memory.  If print_codegen is true, text of the addressing mode is
1527    placed in buf.
1528 
1529    The computed address is stored in a new tempreg, and the
1530    identity of the tempreg is returned.  */
1531 
1532 static IRTemp disAMode_copy2tmp ( IRExpr* addr )
1533 {
1534    IRTemp tmp = newTemp(Ity_I32);
1535    IRTemp halfsize_tmp = IRTemp_INVALID;
1536 
1537    if (current_sz_addr == 4) {
1538       assign( tmp, addr );
1539    } else {
1540       halfsize_tmp = newTemp(Ity_I16);
1541       assign(halfsize_tmp, addr);
1542       assign(tmp, unop(Iop_16Uto32, mkexpr(halfsize_tmp)));
1543    }
1544    return tmp;
1545 }
1546 
1547 static
1548 IRTemp disAMode32 ( Int* len, UChar sorb, Int delta, HChar* buf )
1549 {
1550    UChar mod_reg_rm = getIByte(delta);
1551    delta++;
1552 
1553    buf[0] = (UChar)0;
1554 
1555    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1556       jump table seems a bit excessive.
1557    */
1558    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
1559    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1560                                             /* is now XX0XXYYY */
1561    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
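   /* Worked example (illustrative): for modrm 0x44 (mod=01, reg=000,
      rm=100) the squeeze gives
         0x44 & 0xC7          == 0x44     XX000YYY
         0x44 | (0x44 >> 3)   == 0x4C     XX0XXYYY
         0x4C & 0x1F          == 0x0C     000XXYYY
      which selects the "SIB, with 8-bit displacement" case below. */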
1562    switch (mod_reg_rm) {
1563 
1564       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
1565          --> GET %reg, t
1566       */
1567       case 0x00: case 0x01: case 0x02: case 0x03:
1568       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1569          { UChar rm = mod_reg_rm;
1570            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
1571            *len = 1;
1572            return disAMode_copy2tmp(
1573                   handleSegOverride(sorb, getIReg(4,rm)));
1574          }
1575 
1576       /* d8(%eax) ... d8(%edi), not including d8(%esp)
1577          --> GET %reg, t ; ADDL d8, t
1578       */
1579       case 0x08: case 0x09: case 0x0A: case 0x0B:
1580       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1581          { UChar rm = toUChar(mod_reg_rm & 7);
1582            UInt  d  = getSDisp8(delta);
1583            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1584            *len = 2;
1585            return disAMode_copy2tmp(
1586                   handleSegOverride(sorb,
1587                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1588          }
1589 
1590       /* d32(%eax) ... d32(%edi), not including d32(%esp)
1591          --> GET %reg, t ; ADDL d32, t
1592       */
1593       case 0x10: case 0x11: case 0x12: case 0x13:
1594       /* ! 14 */ case 0x15: case 0x16: case 0x17:
1595          { UChar rm = toUChar(mod_reg_rm & 7);
1596            UInt  d  = getUDisp32(delta);
1597            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), d, nameIReg(4,rm));
1598            *len = 5;
1599            return disAMode_copy2tmp(
1600                   handleSegOverride(sorb,
1601                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1602          }
1603 
1604       /* a register, %eax .. %edi.  This shouldn't happen. */
1605       case 0x18: case 0x19: case 0x1A: case 0x1B:
1606       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1607          vpanic("disAMode(x86): not an addr!");
1608 
1609       /* a 32-bit literal address
1610          --> MOV d32, tmp
1611       */
1612       case 0x05:
1613          { UInt d = getUDisp32(delta);
1614            *len = 5;
1615            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
1616            return disAMode_copy2tmp(
1617                      handleSegOverride(sorb, mkU32(d)));
1618          }
1619 
1620       case 0x04: {
1621          /* SIB, with no displacement.  Special cases:
1622             -- %esp cannot act as an index value.
1623                If index_r indicates %esp, zero is used for the index.
1624             -- when mod is zero and base indicates EBP, base is instead
1625                a 32-bit literal.
1626             It's all madness, I tell you.  Extract %index, %base and
1627             scale from the SIB byte.  The value denoted is then:
1628                | %index == %ESP && %base == %EBP
1629                = d32 following SIB byte
1630                | %index == %ESP && %base != %EBP
1631                = %base
1632                | %index != %ESP && %base == %EBP
1633                = d32 following SIB byte + (%index << scale)
1634                | %index != %ESP && %base != %EBP
1635                = %base + (%index << scale)
1636 
1637             What happens to the souls of CPU architects who dream up such
1638             horrendous schemes, do you suppose?
1639          */
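         /* Worked example (illustrative): sib == 0x98 has scale=2,
            index=011 (%ebx) and base=000 (%eax), so the denoted address
            is %eax + (%ebx << 2), printed as "(%eax,%ebx,4)". */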
1640          UChar sib     = getIByte(delta);
1641          UChar scale   = toUChar((sib >> 6) & 3);
1642          UChar index_r = toUChar((sib >> 3) & 7);
1643          UChar base_r  = toUChar(sib & 7);
1644          delta++;
1645 
1646          if (index_r != R_ESP && base_r != R_EBP) {
1647             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
1648                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1649             *len = 2;
1650             return
1651                disAMode_copy2tmp(
1652                handleSegOverride(sorb,
1653                   binop(Iop_Add32,
1654                         getIReg(4,base_r),
1655                         binop(Iop_Shl32, getIReg(4,index_r),
1656                               mkU8(scale)))));
1657          }
1658 
1659          if (index_r != R_ESP && base_r == R_EBP) {
1660             UInt d = getUDisp32(delta);
1661             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
1662                       nameIReg(4,index_r), 1<<scale);
1663             *len = 6;
1664             return
1665                disAMode_copy2tmp(
1666                handleSegOverride(sorb,
1667                   binop(Iop_Add32,
1668                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
1669                         mkU32(d))));
1670          }
1671 
1672          if (index_r == R_ESP && base_r != R_EBP) {
1673             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
1674             *len = 2;
1675             return disAMode_copy2tmp(
1676                    handleSegOverride(sorb, getIReg(4,base_r)));
1677          }
1678 
1679          if (index_r == R_ESP && base_r == R_EBP) {
1680             UInt d = getUDisp32(delta);
1681             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
1682             *len = 6;
1683             return disAMode_copy2tmp(
1684                    handleSegOverride(sorb, mkU32(d)));
1685          }
1686          /*NOTREACHED*/
1687          vassert(0);
1688       }
1689 
1690       /* SIB, with 8-bit displacement.  Special cases:
1691          -- %esp cannot act as an index value.
1692             If index_r indicates %esp, zero is used for the index.
1693          Denoted value is:
1694             | %index == %ESP
1695             = d8 + %base
1696             | %index != %ESP
1697             = d8 + %base + (%index << scale)
1698       */
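      /* Worked example (illustrative): sib == 0x24 has index=100 (%esp),
         so no index is used; with d8 == 8 the denoted address is simply
         %esp + 8, printed as "8(%esp,,)". */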
1699       case 0x0C: {
1700          UChar sib     = getIByte(delta);
1701          UChar scale   = toUChar((sib >> 6) & 3);
1702          UChar index_r = toUChar((sib >> 3) & 7);
1703          UChar base_r  = toUChar(sib & 7);
1704          UInt  d       = getSDisp8(delta+1);
1705 
1706          if (index_r == R_ESP) {
1707             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1708                                    (Int)d, nameIReg(4,base_r));
1709             *len = 3;
1710             return disAMode_copy2tmp(
1711                    handleSegOverride(sorb,
1712                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1713          } else {
1714             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1715                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1716             *len = 3;
1717             return
1718                 disAMode_copy2tmp(
1719                 handleSegOverride(sorb,
1720                   binop(Iop_Add32,
1721                         binop(Iop_Add32,
1722                               getIReg(4,base_r),
1723                               binop(Iop_Shl32,
1724                                     getIReg(4,index_r), mkU8(scale))),
1725                         mkU32(d))));
1726          }
1727 	 /*NOTREACHED*/
1728          vassert(0);
1729       }
1730 
1731       /* SIB, with 32-bit displacement.  Special cases:
1732          -- %esp cannot act as an index value.
1733             If index_r indicates %esp, zero is used for the index.
1734          Denoted value is:
1735             | %index == %ESP
1736             = d32 + %base
1737             | %index != %ESP
1738             = d32 + %base + (%index << scale)
1739       */
1740       case 0x14: {
1741          UChar sib     = getIByte(delta);
1742          UChar scale   = toUChar((sib >> 6) & 3);
1743          UChar index_r = toUChar((sib >> 3) & 7);
1744          UChar base_r  = toUChar(sib & 7);
1745          UInt d        = getUDisp32(delta+1);
1746 
1747          if (index_r == R_ESP) {
1748             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1749                                    (Int)d, nameIReg(4,base_r));
1750             *len = 6;
1751             return disAMode_copy2tmp(
1752                    handleSegOverride(sorb,
1753                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1754          } else {
1755             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1756                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1757             *len = 6;
1758             return
1759                 disAMode_copy2tmp(
1760                 handleSegOverride(sorb,
1761                   binop(Iop_Add32,
1762                         binop(Iop_Add32,
1763                               getIReg(4,base_r),
1764                               binop(Iop_Shl32,
1765                                     getIReg(4,index_r), mkU8(scale))),
1766                         mkU32(d))));
1767          }
1768 	 /*NOTREACHED*/
1769          vassert(0);
1770       }
1771 
1772       default:
1773          vpanic("disAMode(x86)");
1774          return 0; /*notreached*/
1775    }
1776 }
1777 
1778 static
1779 IRTemp disAMode16 ( Int* len, UChar sorb, Int delta, HChar* buf )
1780 {
1781    UChar mod_reg_rm = getIByte(delta);
1782    delta++;
1783 
1784    buf[0] = (UChar)0;
1785 
1786    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1787       jump table seems a bit excessive.
1788    */
1789    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
1790    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1791                                             /* is now XX0XXYYY */
1792    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
1793    switch (mod_reg_rm) {
1794 
1795       case 0x00: case 0x01: case 0x02: case 0x03:
1796          vpanic("TODO disAMode16 1");
1797          break;
1798 
1799       case 0x04: case 0x05: case 0x07:
1800          { UChar rm = mod_reg_rm;
1801            *len = 1;
1802            return disAMode_copy2tmp(
1803                   handleSegOverride(sorb, getIReg(2,rm)));
1804          }
1805 
1806       case 0x08: case 0x09: case 0x0a: case 0x0b:
1807          vpanic("TODO disAMode16 2");
1808          break;
1809 
1810       case 0x0C: case 0x0D: case 0x0E: case 0x0F:
1811          { UChar rm = toUChar(mod_reg_rm & 7);
1812            UInt  d  = getSDisp8(delta);
1813            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(2,rm));
1814            *len = 2;
1815            return disAMode_copy2tmp(
1816                   handleSegOverride(sorb,
1817                      binop(Iop_Add16,getIReg(2,rm),mkU16(d))));
1818          }
1819 
1820       case 0x14: case 0x15: case 0x16: case 0x17:
1821          { UChar rm = toUChar(mod_reg_rm & 7);
1822            UInt  d  = getUDisp16(delta);
1823            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(2,rm));
1824            *len = 3;
1825            return disAMode_copy2tmp(
1826                   handleSegOverride(sorb,
1827                      binop(Iop_Add16,getIReg(2,rm),mkU16(d))));
1828          }
1829 
1830       /* This shouldn't happen. */
1831       case 0x18: case 0x19: case 0x1A: case 0x1B:
1832       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1833          vpanic("disAMode(x86): not an addr!");
1834 
1835       case 0x06:
1836          { UInt d = getUDisp16(delta);
1837            *len = 3;
1838            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
1839            return disAMode_copy2tmp(
1840                      handleSegOverride(sorb, mkU16(d)));
1841          }
1842 
1843 
1844       default:
1845          vpanic("disAMode(x86)");
1846          return 0; /*notreached*/
1847    }
1848 }
1849 
1850 static
1851 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf ) {
1852    if (current_sz_addr == 4) {
1853      return disAMode32(len, sorb, delta, buf);
1854    } else {
1855      return disAMode16(len, sorb, delta, buf);
1856    }
1857 }
1858 
1859 /* Figure out the number of (insn-stream) bytes constituting the amode
1860    beginning at delta.  Is useful for getting hold of literals beyond
1861    the end of the amode before it has been disassembled.  */
1862 
1863 static UInt lengthAMode32 ( Int delta )
1864 {
1865    UChar mod_reg_rm = getIByte(delta); delta++;
1866 
1867    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1868       jump table seems a bit excessive.
1869    */
1870    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1871    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1872                                      /* is now XX0XXYYY */
1873    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
1874    switch (mod_reg_rm) {
1875 
1876       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
1877       case 0x00: case 0x01: case 0x02: case 0x03:
1878       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1879          return 1;
1880 
1881       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
1882       case 0x08: case 0x09: case 0x0A: case 0x0B:
1883       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1884          return 2;
1885 
1886       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
1887       case 0x10: case 0x11: case 0x12: case 0x13:
1888       /* ! 14 */ case 0x15: case 0x16: case 0x17:
1889          return 5;
1890 
1891       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
1892       case 0x18: case 0x19: case 0x1A: case 0x1B:
1893       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1894          return 1;
1895 
1896       /* a 32-bit literal address. */
1897       case 0x05: return 5;
1898 
1899       /* SIB, no displacement.  */
1900       case 0x04: {
1901          UChar sib    = getIByte(delta);
1902          UChar base_r = toUChar(sib & 7);
1903          if (base_r == R_EBP) return 6; else return 2;
1904       }
1905       /* SIB, with 8-bit displacement.  */
1906       case 0x0C: return 3;
1907 
1908       /* SIB, with 32-bit displacement.  */
1909       case 0x14: return 6;
1910 
1911       default:
1912          vpanic("lengthAMode");
1913          return 0; /*notreached*/
1914    }
1915 }
1916 
1917 static UInt lengthAMode16 ( Int delta )
1918 {
1919    UChar mod_reg_rm = getIByte(delta); delta++;
1920 
1921    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1922       jump table seems a bit excessive.
1923    */
1924    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1925    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1926                                      /* is now XX0XXYYY */
1927    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
1928    switch (mod_reg_rm) {
1929 
1930       case 0x00: case 0x01: case 0x02: case 0x03:
1931       case 0x04: case 0x05: case 0x07:
1932       case 0x18: case 0x19: case 0x1A: case 0x1B:
1933       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1934          return 1;   /* no displacement */
1935       case 0x08: case 0x09: case 0x0A: case 0x0B:
1936       case 0x0C: case 0x0D: case 0x0E: case 0x0F:
1937          return 2;   /* d8 */
1938       case 0x06:
1939       case 0x10: case 0x11: case 0x12: case 0x13:
1940       case 0x14: case 0x15: case 0x16: case 0x17:
1941          return 3;   /* d16, including the mod=00, rm=110 direct case */
1942       default:
1943          vpanic("lengthAMode16");
1944          return 0; /*notreached*/
1945    }
1946 }
1947 
1948 static UInt lengthAMode ( Int delta )
1949 {
1950    if (protected_mode) {
1951       return lengthAMode32(delta);
1952    } else {
1953       return lengthAMode16(delta);
1954    }
1955 }
1956 
1957 /*------------------------------------------------------------*/
1958 /*--- Disassembling common idioms                          ---*/
1959 /*------------------------------------------------------------*/
1960 
1961 /* Handle binary integer instructions of the form
1962       op E, G  meaning
1963       op reg-or-mem, reg
1964    Is passed a ptr to the modRM byte, the actual operation, and the
1965    data size.  Returns the address advanced completely over this
1966    instruction.
1967 
1968    E(src) is reg-or-mem
1969    G(dst) is reg.
1970 
1971    If E is reg, -->    GET %G,  tmp
1972                        OP %E,   tmp
1973                        PUT tmp, %G
1974 
1975    If E is mem and OP is not reversible,
1976                 -->    (getAddr E) -> tmpa
1977                        LD (tmpa), tmpa
1978                        GET %G, tmp2
1979                        OP tmpa, tmp2
1980                        PUT tmp2, %G
1981 
1982    If E is mem and OP is reversible
1983                 -->    (getAddr E) -> tmpa
1984                        LD (tmpa), tmpa
1985                        OP %G, tmpa
1986                        PUT tmpa, %G
1987 */
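
/* Illustrative example (not tied to any particular call site): for "add"
   with opcode 0x03 and modrm 0xC8 (mod=11, reg=001, rm=000), G is %ecx
   and E is %eax, so this routine generates roughly
      GET %ecx -> dst0 ; GET %eax -> src ; dst1 = dst0 + src ;
      PUT dst1 -> %ecx
   and returns delta0+1. */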
1988 static
1989 UInt dis_op2_E_G ( UChar       sorb,
1990                    Bool        addSubCarry,
1991                    IROp        op8,
1992                    Bool        keep,
1993                    Int         size,
1994                    Int         delta0,
1995                    const HChar* t_x86opc )
1996 {
1997    HChar   dis_buf[50];
1998    Int     len;
1999    IRType  ty   = szToITy(size);
2000    IRTemp  dst1 = newTemp(ty);
2001    IRTemp  src  = newTemp(ty);
2002    IRTemp  dst0 = newTemp(ty);
2003    UChar   rm   = getUChar(delta0);
2004    IRTemp  addr = IRTemp_INVALID;
2005 
2006    /* addSubCarry == True indicates the intended operation is
2007       add-with-carry or subtract-with-borrow. */
2008    if (addSubCarry) {
2009       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
2010       vassert(keep);
2011    }
2012 
2013    if (epartIsReg(rm)) {
2014       /* Specially handle XOR reg,reg, because that doesn't really
2015          depend on reg, and doing the obvious thing potentially
2016          generates a spurious value check failure due to the bogus
2017          dependency.  Ditto SBB reg,reg. */
2018       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
2019           && gregOfRM(rm) == eregOfRM(rm)) {
2020          putIReg(size, gregOfRM(rm), mkU(ty,0));
2021       }
2022       assign( dst0, getIReg(size,gregOfRM(rm)) );
2023       assign( src,  getIReg(size,eregOfRM(rm)) );
2024 
2025       if (addSubCarry && op8 == Iop_Add8) {
2026          helper_ADC( size, dst1, dst0, src,
2027                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2028          putIReg(size, gregOfRM(rm), mkexpr(dst1));
2029       } else
2030       if (addSubCarry && op8 == Iop_Sub8) {
2031          helper_SBB( size, dst1, dst0, src,
2032                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2033          putIReg(size, gregOfRM(rm), mkexpr(dst1));
2034       } else {
2035          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2036          if (isAddSub(op8))
2037             setFlags_DEP1_DEP2(op8, dst0, src, ty);
2038          else
2039             setFlags_DEP1(op8, dst1, ty);
2040          if (keep)
2041             putIReg(size, gregOfRM(rm), mkexpr(dst1));
2042       }
2043 
2044       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2045                           nameIReg(size,eregOfRM(rm)),
2046                           nameIReg(size,gregOfRM(rm)));
2047       return 1+delta0;
2048    } else {
2049       /* E refers to memory */
2050       addr = disAMode ( &len, sorb, delta0, dis_buf);
2051       assign( dst0, getIReg(size,gregOfRM(rm)) );
2052       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
2053 
2054       if (addSubCarry && op8 == Iop_Add8) {
2055          helper_ADC( size, dst1, dst0, src,
2056                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2057          putIReg(size, gregOfRM(rm), mkexpr(dst1));
2058       } else
2059       if (addSubCarry && op8 == Iop_Sub8) {
2060          helper_SBB( size, dst1, dst0, src,
2061                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2062          putIReg(size, gregOfRM(rm), mkexpr(dst1));
2063       } else {
2064          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2065          if (isAddSub(op8))
2066             setFlags_DEP1_DEP2(op8, dst0, src, ty);
2067          else
2068             setFlags_DEP1(op8, dst1, ty);
2069          if (keep)
2070             putIReg(size, gregOfRM(rm), mkexpr(dst1));
2071       }
2072 
2073       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2074                           dis_buf,nameIReg(size,gregOfRM(rm)));
2075       return len+delta0;
2076    }
2077 }
2078 
2079 
2080 
2081 /* Handle binary integer instructions of the form
2082       op G, E  meaning
2083       op reg, reg-or-mem
2084    Is passed a ptr to the modRM byte, the actual operation, and the
2085    data size.  Returns the address advanced completely over this
2086    instruction.
2087 
2088    G(src) is reg.
2089    E(dst) is reg-or-mem
2090 
2091    If E is reg, -->    GET %E,  tmp
2092                        OP %G,   tmp
2093                        PUT tmp, %E
2094 
2095    If E is mem, -->    (getAddr E) -> tmpa
2096                        LD (tmpa), tmpv
2097                        OP %G, tmpv
2098                        ST tmpv, (tmpa)
2099 */
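
/* Illustrative example: for "add" with opcode 0x01 and modrm 0x03
   (mod=00, reg=000, rm=011), G is %eax and E is the memory operand
   (%ebx), so this generates a load from (%ebx), the add, and a store
   back to (%ebx); if a LOCK prefix was present, the store is instead
   expressed as a compare-and-swap via casLE. */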
2100 static
2101 UInt dis_op2_G_E ( UChar       sorb,
2102                    Bool        locked,
2103                    Bool        addSubCarry,
2104                    IROp        op8,
2105                    Bool        keep,
2106                    Int         size,
2107                    Int         delta0,
2108                    const HChar* t_x86opc )
2109 {
2110    HChar   dis_buf[50];
2111    Int     len;
2112    IRType  ty   = szToITy(size);
2113    IRTemp  dst1 = newTemp(ty);
2114    IRTemp  src  = newTemp(ty);
2115    IRTemp  dst0 = newTemp(ty);
2116    UChar   rm   = getIByte(delta0);
2117    IRTemp  addr = IRTemp_INVALID;
2118 
2119    /* addSubCarry == True indicates the intended operation is
2120       add-with-carry or subtract-with-borrow. */
2121    if (addSubCarry) {
2122       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
2123       vassert(keep);
2124    }
2125 
2126    if (epartIsReg(rm)) {
2127       /* Specially handle XOR reg,reg, because that doesn't really
2128          depend on reg, and doing the obvious thing potentially
2129          generates a spurious value check failure due to the bogus
2130          dependency.  Ditto SBB reg,reg.*/
2131       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
2132           && gregOfRM(rm) == eregOfRM(rm)) {
2133          putIReg(size, eregOfRM(rm), mkU(ty,0));
2134       }
2135       assign(dst0, getIReg(size,eregOfRM(rm)));
2136       assign(src,  getIReg(size,gregOfRM(rm)));
2137 
2138       if (addSubCarry && op8 == Iop_Add8) {
2139          helper_ADC( size, dst1, dst0, src,
2140                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2141          putIReg(size, eregOfRM(rm), mkexpr(dst1));
2142       } else
2143       if (addSubCarry && op8 == Iop_Sub8) {
2144          helper_SBB( size, dst1, dst0, src,
2145                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2146          putIReg(size, eregOfRM(rm), mkexpr(dst1));
2147       } else {
2148          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2149          if (isAddSub(op8))
2150             setFlags_DEP1_DEP2(op8, dst0, src, ty);
2151          else
2152             setFlags_DEP1(op8, dst1, ty);
2153          if (keep)
2154             putIReg(size, eregOfRM(rm), mkexpr(dst1));
2155       }
2156 
2157       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2158                           nameIReg(size,gregOfRM(rm)),
2159                           nameIReg(size,eregOfRM(rm)));
2160       return 1+delta0;
2161    }
2162 
2163    /* E refers to memory */
2164    {
2165       addr = disAMode ( &len, sorb, delta0, dis_buf);
2166       assign(dst0, loadLE(ty,mkexpr(addr)));
2167       assign(src,  getIReg(size,gregOfRM(rm)));
2168 
2169       if (addSubCarry && op8 == Iop_Add8) {
2170          if (locked) {
2171             /* cas-style store */
2172             helper_ADC( size, dst1, dst0, src,
2173                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2174          } else {
2175             /* normal store */
2176             helper_ADC( size, dst1, dst0, src,
2177                         /*store*/addr, IRTemp_INVALID, 0 );
2178          }
2179       } else
2180       if (addSubCarry && op8 == Iop_Sub8) {
2181          if (locked) {
2182             /* cas-style store */
2183             helper_SBB( size, dst1, dst0, src,
2184                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2185          } else {
2186             /* normal store */
2187             helper_SBB( size, dst1, dst0, src,
2188                         /*store*/addr, IRTemp_INVALID, 0 );
2189          }
2190       } else {
2191          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2192          if (keep) {
2193             if (locked) {
2194                if (0) vex_printf("locked case\n" );
2195                casLE( mkexpr(addr),
2196                       mkexpr(dst0)/*expval*/,
2197                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
2198             } else {
2199                if (0) vex_printf("nonlocked case\n");
2200                storeLE(mkexpr(addr), mkexpr(dst1));
2201             }
2202          }
2203          if (isAddSub(op8))
2204             setFlags_DEP1_DEP2(op8, dst0, src, ty);
2205          else
2206             setFlags_DEP1(op8, dst1, ty);
2207       }
2208 
2209       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2210                           nameIReg(size,gregOfRM(rm)), dis_buf);
2211       return len+delta0;
2212    }
2213 }
2214 
2215 
2216 /* Handle move instructions of the form
2217       mov E, G  meaning
2218       mov reg-or-mem, reg
2219    Is passed a ptr to the modRM byte, and the data size.  Returns
2220    the address advanced completely over this instruction.
2221 
2222    E(src) is reg-or-mem
2223    G(dst) is reg.
2224 
2225    If E is reg, -->    GET %E,  tmpv
2226                        PUT tmpv, %G
2227 
2228    If E is mem  -->    (getAddr E) -> tmpa
2229                        LD (tmpa), tmpb
2230                        PUT tmpb, %G
2231 */
2232 static
2233 UInt dis_mov_E_G ( UChar       sorb,
2234                    Int         size,
2235                    Int         delta0 )
2236 {
2237    Int len;
2238    UChar rm = getIByte(delta0);
2239    HChar dis_buf[50];
2240 
2241    if (epartIsReg(rm)) {
2242       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
2243       DIP("mov%c %s,%s\n", nameISize(size),
2244                            nameIReg(size,eregOfRM(rm)),
2245                            nameIReg(size,gregOfRM(rm)));
2246       return 1+delta0;
2247    }
2248 
2249    /* E refers to memory */
2250    {
2251       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
2252       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
2253       DIP("mov%c %s,%s\n", nameISize(size),
2254                            dis_buf,nameIReg(size,gregOfRM(rm)));
2255       return delta0+len;
2256    }
2257 }
2258 
2259 
2260 /* Handle move instructions of the form
2261       mov G, E  meaning
2262       mov reg, reg-or-mem
2263    Is passed a ptr to the modRM byte, and the data size.  Returns
2264    the address advanced completely over this instruction.
2265 
2266    G(src) is reg.
2267    E(dst) is reg-or-mem
2268 
2269    If E is reg, -->    GET %G,  tmp
2270                        PUT tmp, %E
2271 
2272    If E is mem, -->    (getAddr E) -> tmpa
2273                        GET %G, tmpv
2274                        ST tmpv, (tmpa)
2275 */
2276 static
2277 UInt dis_mov_G_E ( UChar       sorb,
2278                    Int         size,
2279                    Int         delta0 )
2280 {
2281    Int len;
2282    UChar rm = getIByte(delta0);
2283    HChar dis_buf[50];
2284 
2285    if (epartIsReg(rm)) {
2286       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
2287       DIP("mov%c %s,%s\n", nameISize(size),
2288                            nameIReg(size,gregOfRM(rm)),
2289                            nameIReg(size,eregOfRM(rm)));
2290       return 1+delta0;
2291    }
2292 
2293    /* E refers to memory */
2294    {
2295       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
2296       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
2297       DIP("mov%c %s,%s\n", nameISize(size),
2298                            nameIReg(size,gregOfRM(rm)), dis_buf);
2299       return len+delta0;
2300    }
2301 }
2302 
2303 
2304 /* op $immediate, AL/AX/EAX. */
2305 static
2306 UInt dis_op_imm_A ( Int    size,
2307                     Bool   carrying,
2308                     IROp   op8,
2309                     Bool   keep,
2310                     Int    delta,
2311                     const HChar* t_x86opc )
2312 {
2313    IRType ty   = szToITy(size);
2314    IRTemp dst0 = newTemp(ty);
2315    IRTemp src  = newTemp(ty);
2316    IRTemp dst1 = newTemp(ty);
2317    UInt lit    = getUDisp(size,delta);
2318    assign(dst0, getIReg(size,R_EAX));
2319    assign(src,  mkU(ty,lit));
2320 
2321    if (isAddSub(op8) && !carrying) {
2322       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2323       setFlags_DEP1_DEP2(op8, dst0, src, ty);
2324    }
2325    else
2326    if (isLogic(op8)) {
2327       vassert(!carrying);
2328       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2329       setFlags_DEP1(op8, dst1, ty);
2330    }
2331    else
2332    if (op8 == Iop_Add8 && carrying) {
2333       helper_ADC( size, dst1, dst0, src,
2334                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2335    }
2336    else
2337    if (op8 == Iop_Sub8 && carrying) {
2338       helper_SBB( size, dst1, dst0, src,
2339                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2340    }
2341    else
2342       vpanic("dis_op_imm_A(x86,guest)");
2343 
2344    if (keep)
2345       putIReg(size, R_EAX, mkexpr(dst1));
2346 
2347    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
2348                            lit, nameIReg(size,R_EAX));
2349    return delta+size;
2350 }
2351 
2352 
2353 /* Sign- and Zero-extending moves. */
2354 static
2355 UInt dis_movx_E_G ( UChar      sorb,
2356                     Int delta, Int szs, Int szd, Bool sign_extend )
2357 {
2358    UChar rm = getIByte(delta);
2359    if (epartIsReg(rm)) {
2360       if (szd == szs) {
2361          // mutant case.  See #250799
2362          putIReg(szd, gregOfRM(rm),
2363                            getIReg(szs,eregOfRM(rm)));
2364       } else {
2365          // normal case
2366          putIReg(szd, gregOfRM(rm),
2367                       unop(mkWidenOp(szs,szd,sign_extend),
2368                            getIReg(szs,eregOfRM(rm))));
2369       }
2370       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2371                                nameISize(szs), nameISize(szd),
2372                                nameIReg(szs,eregOfRM(rm)),
2373                                nameIReg(szd,gregOfRM(rm)));
2374       return 1+delta;
2375    }
2376 
2377    /* E refers to memory */
2378    {
2379       Int    len;
2380       HChar  dis_buf[50];
2381       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
2382       if (szd == szs) {
2383          // mutant case.  See #250799
2384          putIReg(szd, gregOfRM(rm),
2385                            loadLE(szToITy(szs),mkexpr(addr)));
2386       } else {
2387          // normal case
2388          putIReg(szd, gregOfRM(rm),
2389                       unop(mkWidenOp(szs,szd,sign_extend),
2390                            loadLE(szToITy(szs),mkexpr(addr))));
2391       }
2392       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2393                                nameISize(szs), nameISize(szd),
2394                                dis_buf, nameIReg(szd,gregOfRM(rm)));
2395       return len+delta;
2396    }
2397 }
2398 
2399 
2400 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
2401    16 / 8 bit quantity in the given IRTemp.  */
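/* Worked example (illustrative): for sz == 4 with EDX:EAX == 0:11 and a
   divisor t == 4 (signed_divide == False), src64 is 11 and
   Iop_DivModU64to32 yields a 64-bit value whose low half is the quotient
   (2) and whose high half is the remainder (3), so EAX becomes 2 and EDX
   becomes 3. */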
2402 static
2403 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
2404 {
2405    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
2406    IRTemp src64 = newTemp(Ity_I64);
2407    IRTemp dst64 = newTemp(Ity_I64);
2408    switch (sz) {
2409       case 4:
2410          assign( src64, binop(Iop_32HLto64,
2411                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
2412          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
2413          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
2414          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
2415          break;
2416       case 2: {
2417          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2418          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2419          assign( src64, unop(widen3264,
2420                              binop(Iop_16HLto32,
2421                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
2422          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
2423          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
2424          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
2425          break;
2426       }
2427       case 1: {
2428          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2429          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2430          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
2431          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
2432          assign( dst64,
2433                  binop(op, mkexpr(src64),
2434                            unop(widen1632, unop(widen816, mkexpr(t)))) );
2435          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
2436                            unop(Iop_64to32,mkexpr(dst64)))) );
2437          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
2438                            unop(Iop_64HIto32,mkexpr(dst64)))) );
2439          break;
2440       }
2441       default: vpanic("codegen_div(x86)");
2442    }
2443 }
2444 
2445 
2446 static
2447 UInt dis_Grp1 ( UChar sorb, Bool locked,
2448                 Int delta, UChar modrm,
2449                 Int am_sz, Int d_sz, Int sz, UInt d32 )
2450 {
2451    Int     len;
2452    HChar   dis_buf[50];
2453    IRType  ty   = szToITy(sz);
2454    IRTemp  dst1 = newTemp(ty);
2455    IRTemp  src  = newTemp(ty);
2456    IRTemp  dst0 = newTemp(ty);
2457    IRTemp  addr = IRTemp_INVALID;
2458    IROp    op8  = Iop_INVALID;
2459    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
2460 
2461    switch (gregOfRM(modrm)) {
2462       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
2463       case 2: break;  // ADC
2464       case 3: break;  // SBB
2465       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
2466       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
2467       /*NOTREACHED*/
2468       default: vpanic("dis_Grp1: unhandled case");
2469    }
2470 
2471    if (epartIsReg(modrm)) {
2472       vassert(am_sz == 1);
2473 
2474       assign(dst0, getIReg(sz,eregOfRM(modrm)));
2475       assign(src,  mkU(ty,d32 & mask));
2476 
2477       if (gregOfRM(modrm) == 2 /* ADC */) {
2478          helper_ADC( sz, dst1, dst0, src,
2479                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2480       } else
2481       if (gregOfRM(modrm) == 3 /* SBB */) {
2482          helper_SBB( sz, dst1, dst0, src,
2483                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2484       } else {
2485          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2486          if (isAddSub(op8))
2487             setFlags_DEP1_DEP2(op8, dst0, src, ty);
2488          else
2489             setFlags_DEP1(op8, dst1, ty);
2490       }
2491 
2492       if (gregOfRM(modrm) < 7)
2493          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2494 
2495       delta += (am_sz + d_sz);
2496       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
2497                               nameIReg(sz,eregOfRM(modrm)));
2498    } else {
2499       addr = disAMode ( &len, sorb, delta, dis_buf);
2500 
2501       assign(dst0, loadLE(ty,mkexpr(addr)));
2502       assign(src, mkU(ty,d32 & mask));
2503 
2504       if (gregOfRM(modrm) == 2 /* ADC */) {
2505          if (locked) {
2506             /* cas-style store */
2507             helper_ADC( sz, dst1, dst0, src,
2508                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2509          } else {
2510             /* normal store */
2511             helper_ADC( sz, dst1, dst0, src,
2512                         /*store*/addr, IRTemp_INVALID, 0 );
2513          }
2514       } else
2515       if (gregOfRM(modrm) == 3 /* SBB */) {
2516          if (locked) {
2517             /* cas-style store */
2518             helper_SBB( sz, dst1, dst0, src,
2519                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2520          } else {
2521             /* normal store */
2522             helper_SBB( sz, dst1, dst0, src,
2523                         /*store*/addr, IRTemp_INVALID, 0 );
2524          }
2525       } else {
2526          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2527          if (gregOfRM(modrm) < 7) {
2528             if (locked) {
2529                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
2530                                     mkexpr(dst1)/*newVal*/,
2531                                     guest_EIP_curr_instr );
2532             } else {
2533                storeLE(mkexpr(addr), mkexpr(dst1));
2534             }
2535          }
2536          if (isAddSub(op8))
2537             setFlags_DEP1_DEP2(op8, dst0, src, ty);
2538          else
2539             setFlags_DEP1(op8, dst1, ty);
2540       }
2541 
2542       delta += (len+d_sz);
2543       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
2544                               d32, dis_buf);
2545    }
2546    return delta;
2547 }
2548 
2549 
2550 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
2551    expression. */
2552 
2553 static
2554 UInt dis_Grp2 ( UChar sorb,
2555                 Int delta, UChar modrm,
2556                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
2557                 const HChar* shift_expr_txt, Bool* decode_OK )
2558 {
2559    /* delta on entry points at the modrm byte. */
2560    HChar  dis_buf[50];
2561    Int    len;
2562    Bool   isShift, isRotate, isRotateC;
2563    IRType ty    = szToITy(sz);
2564    IRTemp dst0  = newTemp(ty);
2565    IRTemp dst1  = newTemp(ty);
2566    IRTemp addr  = IRTemp_INVALID;
2567 
2568    *decode_OK = True;
2569 
2570    vassert(sz == 1 || sz == 2 || sz == 4);
2571 
2572    /* Put value to shift/rotate in dst0. */
2573    if (epartIsReg(modrm)) {
2574       assign(dst0, getIReg(sz, eregOfRM(modrm)));
2575       delta += (am_sz + d_sz);
2576    } else {
2577       addr = disAMode ( &len, sorb, delta, dis_buf);
2578       assign(dst0, loadLE(ty,mkexpr(addr)));
2579       delta += len + d_sz;
2580    }
2581 
2582    isShift = False;
2583    switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
2584 
2585    isRotate = False;
2586    switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
2587 
2588    isRotateC = False;
2589    switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
2590 
2591    if (!isShift && !isRotate && !isRotateC) {
2592       /*NOTREACHED*/
2593       vpanic("dis_Grp2(Reg): unhandled case(x86)");
2594    }
2595 
2596    if (isRotateC) {
2597       /* call a helper; these insns are so ridiculous they do not
2598          deserve better */
2599       Bool     left = toBool(gregOfRM(modrm) == 2);
2600       IRTemp   r64  = newTemp(Ity_I64);
2601       IRExpr** args
2602          = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
2603                           widenUto32(shift_expr),   /* rotate amount */
2604                           widenUto32(mk_x86g_calculate_eflags_all()),
2605                           mkU32(sz) );
2606       assign( r64, mkIRExprCCall(
2607                       Ity_I64,
2608                       0/*regparm*/,
2609                       left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
2610                       left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
2611                       args
2612                    )
2613             );
2614       /* new eflags in hi half r64; new value in lo half r64 */
2615       assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
2616       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2617       stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
2618       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2619       /* Set NDEP even though it isn't used.  This makes redundant-PUT
2620          elimination of previous stores to this field work better. */
2621       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2622    }
2623 
2624    if (isShift) {
2625 
2626       IRTemp pre32     = newTemp(Ity_I32);
2627       IRTemp res32     = newTemp(Ity_I32);
2628       IRTemp res32ss   = newTemp(Ity_I32);
2629       IRTemp shift_amt = newTemp(Ity_I8);
2630       IROp   op32;
2631 
2632       switch (gregOfRM(modrm)) {
2633          case 4: op32 = Iop_Shl32; break;
2634          case 5: op32 = Iop_Shr32; break;
2635          case 6: op32 = Iop_Shl32; break;
2636          case 7: op32 = Iop_Sar32; break;
2637          /*NOTREACHED*/
2638          default: vpanic("dis_Grp2:shift"); break;
2639       }
2640 
2641       /* Widen the value to be shifted to 32 bits, do the shift, and
2642          narrow back down.  This seems surprisingly long-winded, but
2643          unfortunately the Intel semantics requires that 8/16-bit
2644          shifts give defined results for shift values all the way up
2645          to 31, and this seems the simplest way to do it.  It has the
2646          advantage that the only IR level shifts generated are of 32
2647          bit values, and the shift amount is guaranteed to be in the
2648          range 0 .. 31, thereby observing the IR semantics requiring
2649          all shift values to be in the range 0 .. 2^word_size-1. */
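      /* Worked example (illustrative): for "sarb $3" applied to the 8-bit
         value 0x90, pre32 is the sign-extension 0xFFFFFF90, res32 is
         0xFFFFFFF2 after the 32-bit arithmetic shift, and dst1 narrows
         back down to 0xF2, the required 8-bit result. */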
2650 
2651       /* shift_amt = shift_expr & 31, regardless of operation size */
2652       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
2653 
2654       /* suitably widen the value to be shifted to 32 bits. */
2655       assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
2656                                      : widenUto32(mkexpr(dst0)) );
2657 
2658       /* res32 = pre32 `shift` shift_amt */
2659       assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
2660 
2661       /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
2662       assign( res32ss,
2663               binop(op32,
2664                     mkexpr(pre32),
2665                     binop(Iop_And8,
2666                           binop(Iop_Sub8,
2667                                 mkexpr(shift_amt), mkU8(1)),
2668                           mkU8(31))) );
2669 
2670       /* Build the flags thunk. */
2671       setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
2672 
2673       /* Narrow the result back down. */
2674       assign( dst1, narrowTo(ty, mkexpr(res32)) );
2675 
2676    } /* if (isShift) */
2677 
2678    else
2679    if (isRotate) {
2680       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
2681       Bool   left      = toBool(gregOfRM(modrm) == 0);
2682       IRTemp rot_amt   = newTemp(Ity_I8);
2683       IRTemp rot_amt32 = newTemp(Ity_I8);
2684       IRTemp oldFlags  = newTemp(Ity_I32);
2685 
2686       /* rot_amt = shift_expr & mask */
2687       /* By masking the rotate amount thusly, the IR-level Shl/Shr
2688          expressions never shift beyond the word size and thus remain
2689          well defined. */
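      /* Worked example (illustrative): for "rolw $17, %ax" the count is
         first masked with 31 (giving 17) and then, since sz == 2, with
         8*sz-1 == 15, so the value is actually rotated by 1 bit. */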
2690       assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
2691 
2692       if (ty == Ity_I32)
2693          assign(rot_amt, mkexpr(rot_amt32));
2694       else
2695          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
2696 
2697       if (left) {
2698 
2699          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
2700          assign(dst1,
2701             binop( mkSizedOp(ty,Iop_Or8),
2702                    binop( mkSizedOp(ty,Iop_Shl8),
2703                           mkexpr(dst0),
2704                           mkexpr(rot_amt)
2705                    ),
2706                    binop( mkSizedOp(ty,Iop_Shr8),
2707                           mkexpr(dst0),
2708                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2709                    )
2710             )
2711          );
2712          ccOp += X86G_CC_OP_ROLB;
2713 
2714       } else { /* right */
2715 
2716          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
2717          assign(dst1,
2718             binop( mkSizedOp(ty,Iop_Or8),
2719                    binop( mkSizedOp(ty,Iop_Shr8),
2720                           mkexpr(dst0),
2721                           mkexpr(rot_amt)
2722                    ),
2723                    binop( mkSizedOp(ty,Iop_Shl8),
2724                           mkexpr(dst0),
2725                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2726                    )
2727             )
2728          );
2729          ccOp += X86G_CC_OP_RORB;
2730 
2731       }
2732 
2733       /* dst1 now holds the rotated value.  Build flag thunk.  We
2734          need the resulting value for this, and the previous flags.
2735          Except don't set it if the rotate count is zero. */
2736 
2737       assign(oldFlags, mk_x86g_calculate_eflags_all());
2738 
2739       /* rot_amt32 :: Ity_I8.  We need to convert it to I1. */
2740       IRTemp rot_amt32b = newTemp(Ity_I1);
2741       assign(rot_amt32b, binop(Iop_CmpNE8, mkexpr(rot_amt32), mkU8(0)) );
2742 
2743       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
2744       stmt( IRStmt_Put( OFFB_CC_OP,
2745                         IRExpr_ITE( mkexpr(rot_amt32b),
2746                                     mkU32(ccOp),
2747                                     IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
2748       stmt( IRStmt_Put( OFFB_CC_DEP1,
2749                         IRExpr_ITE( mkexpr(rot_amt32b),
2750                                     widenUto32(mkexpr(dst1)),
2751                                     IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
2752       stmt( IRStmt_Put( OFFB_CC_DEP2,
2753                         IRExpr_ITE( mkexpr(rot_amt32b),
2754                                     mkU32(0),
2755                                     IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
2756       stmt( IRStmt_Put( OFFB_CC_NDEP,
2757                         IRExpr_ITE( mkexpr(rot_amt32b),
2758                                     mkexpr(oldFlags),
2759                                     IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
2760    } /* if (isRotate) */
2761 
2762    /* Save result, and finish up. */
2763    if (epartIsReg(modrm)) {
2764       putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2765       if (vex_traceflags & VEX_TRACE_FE) {
2766          vex_printf("%s%c ",
2767                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2768          if (shift_expr_txt)
2769             vex_printf("%s", shift_expr_txt);
2770          else
2771             ppIRExpr(shift_expr);
2772          vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
2773       }
2774    } else {
2775       storeLE(mkexpr(addr), mkexpr(dst1));
2776       if (vex_traceflags & VEX_TRACE_FE) {
2777          vex_printf("%s%c ",
2778                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2779          if (shift_expr_txt)
2780             vex_printf("%s", shift_expr_txt);
2781          else
2782             ppIRExpr(shift_expr);
2783          vex_printf(", %s\n", dis_buf);
2784       }
2785    }
2786    return delta;
2787 }
2788 
2789 
2790 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
2791 static
2792 UInt dis_Grp8_Imm ( UChar sorb,
2793                     Bool locked,
2794                     Int delta, UChar modrm,
2795                     Int am_sz, Int sz, UInt src_val,
2796                     Bool* decode_OK )
2797 {
2798    /* src_val denotes a d8.
2799       And delta on entry points at the modrm byte. */
2800 
2801    IRType ty     = szToITy(sz);
2802    IRTemp t2     = newTemp(Ity_I32);
2803    IRTemp t2m    = newTemp(Ity_I32);
2804    IRTemp t_addr = IRTemp_INVALID;
2805    HChar  dis_buf[50];
2806    UInt   mask;
2807 
2808    /* we're optimists :-) */
2809    *decode_OK = True;
2810 
2811    /* Limit src_val -- the bit offset -- to something within a word.
2812       The Intel docs say that literal offsets larger than a word are
2813       masked in this way. */
2814    switch (sz) {
2815       case 2:  src_val &= 15; break;
2816       case 4:  src_val &= 31; break;
2817       default: *decode_OK = False; return delta;
2818    }
2819 
2820    /* Invent a mask suitable for the operation. */
2821    switch (gregOfRM(modrm)) {
2822       case 4: /* BT */  mask = 0;               break;
2823       case 5: /* BTS */ mask = 1 << src_val;    break;
2824       case 6: /* BTR */ mask = ~(1 << src_val); break;
2825       case 7: /* BTC */ mask = 1 << src_val;    break;
2826          /* If this needs to be extended, probably simplest to make a
2827             new function to handle the other cases (0 .. 3).  The
2828             Intel docs, however, do not indicate any use for 0 .. 3, so
2829             we don't expect this to happen. */
2830       default: *decode_OK = False; return delta;
2831    }
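   /* Worked example (illustrative): for "btsl $5" the mask is 1 << 5 ==
      0x20, so the BTS case below ORs 0x20 into the fetched value; for
      "btrl $5" the mask is ~0x20 == 0xFFFFFFDF and the BTR case ANDs it
      in, clearing bit 5. */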
2832 
2833    /* Fetch the value to be tested and modified into t2, which is
2834       32-bits wide regardless of sz. */
2835    if (epartIsReg(modrm)) {
2836       vassert(am_sz == 1);
2837       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
2838       delta += (am_sz + 1);
2839       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2840                               src_val, nameIReg(sz,eregOfRM(modrm)));
2841    } else {
2842       Int len;
2843       t_addr = disAMode ( &len, sorb, delta, dis_buf);
2844       delta  += (len+1);
2845       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
2846       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2847                               src_val, dis_buf);
2848    }
2849 
2850    /* Compute the new value into t2m, if non-BT. */
2851    switch (gregOfRM(modrm)) {
2852       case 4: /* BT */
2853          break;
2854       case 5: /* BTS */
2855          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
2856          break;
2857       case 6: /* BTR */
2858          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
2859          break;
2860       case 7: /* BTC */
2861          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
2862          break;
2863       default:
2864          /*NOTREACHED*/ /*the previous switch guards this*/
2865          vassert(0);
2866    }
2867 
2868    /* Write the result back, if non-BT.  If the CAS fails then we
2869       side-exit from the trace at this point, and so the flag state is
2870       not affected.  This is of course as required. */
2871    if (gregOfRM(modrm) != 4 /* BT */) {
2872       if (epartIsReg(modrm)) {
2873          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
2874       } else {
2875          if (locked) {
2876             casLE( mkexpr(t_addr),
2877                    narrowTo(ty, mkexpr(t2))/*expd*/,
2878                    narrowTo(ty, mkexpr(t2m))/*new*/,
2879                    guest_EIP_curr_instr );
2880          } else {
2881             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
2882          }
2883       }
2884    }
2885 
2886    /* Copy relevant bit from t2 into the carry flag. */
2887    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
2888    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2889    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2890    stmt( IRStmt_Put(
2891             OFFB_CC_DEP1,
2892             binop(Iop_And32,
2893                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
2894                   mkU32(1))
2895        ));
2896    /* Set NDEP even though it isn't used.  This makes redundant-PUT
2897       elimination of previous stores to this field work better. */
2898    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2899 
2900    return delta;
2901 }
2902 
2903 
2904 /* Signed/unsigned widening multiply.  Generate IR to multiply the
2905    value in EAX/AX/AL by the given IRTemp, and park the result in
2906    EDX:EAX/DX:AX/AX.
2907 */
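/* Worked example (illustrative): for an unsigned 8-bit multiply with
   AL == 0x80 and the operand temp == 3, Iop_MullU8 produces the 16-bit
   value 0x0180, which is written back as a whole to %ax. */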
2908 static void codegen_mulL_A_D ( Int sz, Bool syned,
2909                                IRTemp tmp, const HChar* tmp_txt )
2910 {
2911    IRType ty = szToITy(sz);
2912    IRTemp t1 = newTemp(ty);
2913 
2914    assign( t1, getIReg(sz, R_EAX) );
2915 
2916    switch (ty) {
2917       case Ity_I32: {
2918          IRTemp res64   = newTemp(Ity_I64);
2919          IRTemp resHi   = newTemp(Ity_I32);
2920          IRTemp resLo   = newTemp(Ity_I32);
2921          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
2922          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2923          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
2924          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2925          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
2926          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
2927          putIReg(4, R_EDX, mkexpr(resHi));
2928          putIReg(4, R_EAX, mkexpr(resLo));
2929          break;
2930       }
2931       case Ity_I16: {
2932          IRTemp res32   = newTemp(Ity_I32);
2933          IRTemp resHi   = newTemp(Ity_I16);
2934          IRTemp resLo   = newTemp(Ity_I16);
2935          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
2936          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2937          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
2938          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2939          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
2940          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
2941          putIReg(2, R_EDX, mkexpr(resHi));
2942          putIReg(2, R_EAX, mkexpr(resLo));
2943          break;
2944       }
2945       case Ity_I8: {
2946          IRTemp res16   = newTemp(Ity_I16);
2947          IRTemp resHi   = newTemp(Ity_I8);
2948          IRTemp resLo   = newTemp(Ity_I8);
2949          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
2950          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2951          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
2952          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2953          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
2954          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
2955          putIReg(2, R_EAX, mkexpr(res16));
2956          break;
2957       }
2958       default:
2959          vpanic("codegen_mulL_A_D(x86)");
2960    }
2961    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
2962 }
2963 
2964 
2965 /* Group 3 extended opcodes. */
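/* These are the 0xF6 (byte) and 0xF7 (word/dword) forms; the reg
   field of the modRM byte selects the operation: 0 = TEST Ib/Iv,
   1 = undefined, 2 = NOT, 3 = NEG, 4 = MUL, 5 = IMUL, 6 = DIV,
   7 = IDIV. */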
2966 static
2967 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
2968 {
2969    UInt    d32;
2970    UChar   modrm;
2971    HChar   dis_buf[50];
2972    Int     len;
2973    IRTemp  addr;
2974    IRType  ty = szToITy(sz);
2975    IRTemp  t1 = newTemp(ty);
2976    IRTemp dst1, src, dst0;
2977 
2978    *decode_OK = True; /* may change this later */
2979 
2980    modrm = getIByte(delta);
2981 
2982    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
2983       /* LOCK prefix only allowed with not and neg subopcodes */
2984       *decode_OK = False;
2985       return delta;
2986    }
2987 
2988    if (epartIsReg(modrm)) {
2989       switch (gregOfRM(modrm)) {
2990          case 0: { /* TEST */
2991             delta++; d32 = getUDisp(sz, delta); delta += sz;
2992             dst1 = newTemp(ty);
2993             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2994                                getIReg(sz,eregOfRM(modrm)),
2995                                mkU(ty,d32)));
2996             setFlags_DEP1( Iop_And8, dst1, ty );
2997             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
2998                                       nameIReg(sz, eregOfRM(modrm)));
2999             break;
3000          }
3001          case 1: /* UNDEFINED */
3002            /* The Intel docs imply this insn is undefined and binutils
3003               knows what result?).  sandpile.org reckons it's an alias
3004               knows what result?)  sandpile.org reckons it's an alias
3005               for case 0.  We play safe. */
3006            *decode_OK = False;
3007            break;
3008          case 2: /* NOT */
3009             delta++;
3010             putIReg(sz, eregOfRM(modrm),
3011                         unop(mkSizedOp(ty,Iop_Not8),
3012                              getIReg(sz, eregOfRM(modrm))));
3013             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3014             break;
3015          case 3: /* NEG */
3016             delta++;
3017             dst0 = newTemp(ty);
3018             src  = newTemp(ty);
3019             dst1 = newTemp(ty);
3020             assign(dst0, mkU(ty,0));
3021             assign(src,  getIReg(sz,eregOfRM(modrm)));
3022             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
3023             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
3024             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
3025             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3026             break;
3027          case 4: /* MUL (unsigned widening) */
3028             delta++;
3029             src = newTemp(ty);
3030             assign(src, getIReg(sz,eregOfRM(modrm)));
3031             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
3032             break;
3033          case 5: /* IMUL (signed widening) */
3034             delta++;
3035             src = newTemp(ty);
3036             assign(src, getIReg(sz,eregOfRM(modrm)));
3037             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
3038             break;
3039          case 6: /* DIV */
3040             delta++;
3041             assign( t1, getIReg(sz, eregOfRM(modrm)) );
3042             codegen_div ( sz, t1, False );
3043             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3044             break;
3045          case 7: /* IDIV */
3046             delta++;
3047             assign( t1, getIReg(sz, eregOfRM(modrm)) );
3048             codegen_div ( sz, t1, True );
3049             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3050             break;
3051          default:
3052             /* This can't happen - gregOfRM should return 0 .. 7 only */
3053             vpanic("Grp3(x86)");
3054       }
3055    } else {
3056       addr = disAMode ( &len, sorb, delta, dis_buf );
3057       t1   = newTemp(ty);
3058       delta += len;
3059       assign(t1, loadLE(ty,mkexpr(addr)));
3060       switch (gregOfRM(modrm)) {
3061          case 0: { /* TEST */
3062             d32 = getUDisp(sz, delta); delta += sz;
3063             dst1 = newTemp(ty);
3064             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
3065                                mkexpr(t1), mkU(ty,d32)));
3066             setFlags_DEP1( Iop_And8, dst1, ty );
3067             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
3068             break;
3069          }
3070          case 1: /* UNDEFINED */
3071            /* See comment above on R case */
3072            *decode_OK = False;
3073            break;
3074          case 2: /* NOT */
3075             dst1 = newTemp(ty);
3076             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
3077             if (locked) {
3078                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
3079                                     guest_EIP_curr_instr );
3080             } else {
3081                storeLE( mkexpr(addr), mkexpr(dst1) );
3082             }
3083             DIP("not%c %s\n", nameISize(sz), dis_buf);
3084             break;
3085          case 3: /* NEG */
3086             dst0 = newTemp(ty);
3087             src  = newTemp(ty);
3088             dst1 = newTemp(ty);
3089             assign(dst0, mkU(ty,0));
3090             assign(src,  mkexpr(t1));
3091             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
3092                                mkexpr(dst0), mkexpr(src)));
3093             if (locked) {
3094                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
3095                                     guest_EIP_curr_instr );
3096             } else {
3097                storeLE( mkexpr(addr), mkexpr(dst1) );
3098             }
3099             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
3100             DIP("neg%c %s\n", nameISize(sz), dis_buf);
3101             break;
3102          case 4: /* MUL */
3103             codegen_mulL_A_D ( sz, False, t1, dis_buf );
3104             break;
3105          case 5: /* IMUL */
3106             codegen_mulL_A_D ( sz, True, t1, dis_buf );
3107             break;
3108          case 6: /* DIV */
3109             codegen_div ( sz, t1, False );
3110             DIP("div%c %s\n", nameISize(sz), dis_buf);
3111             break;
3112          case 7: /* IDIV */
3113             codegen_div ( sz, t1, True );
3114             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
3115             break;
3116          default:
3117             /* This can't happen - gregOfRM should return 0 .. 7 only */
3118             vpanic("Grp3(x86)");
3119       }
3120    }
3121    return delta;
3122 }
3123 
3124 
3125 /* Group 4 extended opcodes. */
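/* This is the 0xFE form, byte-sized operands only; the reg field
   selects 0 = INC or 1 = DEC, and anything else is rejected. */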
3126 static
3127 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
3128 {
3129    Int   alen;
3130    UChar modrm;
3131    HChar dis_buf[50];
3132    IRType ty = Ity_I8;
3133    IRTemp t1 = newTemp(ty);
3134    IRTemp t2 = newTemp(ty);
3135 
3136    *decode_OK = True;
3137 
3138    modrm = getIByte(delta);
3139 
3140    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
3141       /* LOCK prefix only allowed with inc and dec subopcodes */
3142       *decode_OK = False;
3143       return delta;
3144    }
3145 
3146    if (epartIsReg(modrm)) {
3147       assign(t1, getIReg(1, eregOfRM(modrm)));
3148       switch (gregOfRM(modrm)) {
3149          case 0: /* INC */
3150             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
3151             putIReg(1, eregOfRM(modrm), mkexpr(t2));
3152             setFlags_INC_DEC( True, t2, ty );
3153             break;
3154          case 1: /* DEC */
3155             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
3156             putIReg(1, eregOfRM(modrm), mkexpr(t2));
3157             setFlags_INC_DEC( False, t2, ty );
3158             break;
3159          default:
3160             *decode_OK = False;
3161             return delta;
3162       }
3163       delta++;
3164       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
3165                       nameIReg(1, eregOfRM(modrm)));
3166    } else {
3167       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
3168       assign( t1, loadLE(ty, mkexpr(addr)) );
3169       switch (gregOfRM(modrm)) {
3170          case 0: /* INC */
3171             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
3172             if (locked) {
3173                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3174                       guest_EIP_curr_instr );
3175             } else {
3176                storeLE( mkexpr(addr), mkexpr(t2) );
3177             }
3178             setFlags_INC_DEC( True, t2, ty );
3179             break;
3180          case 1: /* DEC */
3181             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
3182             if (locked) {
3183                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3184                       guest_EIP_curr_instr );
3185             } else {
3186                storeLE( mkexpr(addr), mkexpr(t2) );
3187             }
3188             setFlags_INC_DEC( False, t2, ty );
3189             break;
3190          default:
3191             *decode_OK = False;
3192             return delta;
3193       }
3194       delta += alen;
3195       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
3196    }
3197    return delta;
3198 }
3199 
3200 
3201 /* Group 5 extended opcodes. */
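/* This is the 0xFF form; the reg field selects 0 = INC, 1 = DEC,
   2 = CALL Ev, 4 = JMP Ev, 6 = PUSH Ev.  The remaining encodings,
   including the far CALL/JMP forms, are rejected via the default
   cases below. */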
3202 static
3203 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
3204                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
3205 {
3206    Int     len;
3207    UChar   modrm;
3208    HChar   dis_buf[50];
3209    IRTemp  addr = IRTemp_INVALID;
3210    IRType  ty = szToITy(sz);
3211    IRTemp  t1 = newTemp(ty);
3212    IRTemp  t2 = IRTemp_INVALID;
3213 
3214    *decode_OK = True;
3215 
3216    modrm = getIByte(delta);
3217 
3218    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
3219       /* LOCK prefix only allowed with inc and dec subopcodes */
3220       *decode_OK = False;
3221       return delta;
3222    }
3223 
3224    if (epartIsReg(modrm)) {
3225       assign(t1, getIReg(sz,eregOfRM(modrm)));
3226       switch (gregOfRM(modrm)) {
3227          case 0: /* INC */
3228             vassert(sz == 2 || sz == 4);
3229             t2 = newTemp(ty);
3230             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3231                              mkexpr(t1), mkU(ty,1)));
3232             setFlags_INC_DEC( True, t2, ty );
3233             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3234             break;
3235          case 1: /* DEC */
3236             vassert(sz == 2 || sz == 4);
3237             t2 = newTemp(ty);
3238             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3239                              mkexpr(t1), mkU(ty,1)));
3240             setFlags_INC_DEC( False, t2, ty );
3241             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3242             break;
3243          case 2: /* call Ev */
3244             vassert(sz == 4);
3245             t2 = newTemp(Ity_I32);
3246             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3247             putIReg(4, R_ESP, mkexpr(t2));
3248             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
3249             jmp_treg(dres, Ijk_Call, t1);
3250             vassert(dres->whatNext == Dis_StopHere);
3251             break;
3252          case 4: /* jmp Ev */
3253             vassert(sz == 4 || sz == 2);
3254             jmp_treg(dres, Ijk_Boring, t1);
3255             vassert(dres->whatNext == Dis_StopHere);
3256             break;
3257          case 6: /* PUSH Ev */
3258             vassert(sz == 4 || sz == 2);
3259             t2 = newTemp(Ity_I32);
3260             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3261             putIReg(4, R_ESP, mkexpr(t2) );
3262             storeLE( mkexpr(t2), mkexpr(t1) );
3263             break;
3264          default:
3265             *decode_OK = False;
3266             return delta;
3267       }
3268       delta++;
3269       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3270                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3271    } else {
3272       addr = disAMode ( &len, sorb, delta, dis_buf );
3273       assign(t1, loadLE(ty,mkexpr(addr)));
3274       switch (gregOfRM(modrm)) {
3275          case 0: /* INC */
3276             t2 = newTemp(ty);
3277             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3278                              mkexpr(t1), mkU(ty,1)));
3279             if (locked) {
3280                casLE( mkexpr(addr),
3281                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3282             } else {
3283                storeLE(mkexpr(addr),mkexpr(t2));
3284             }
3285             setFlags_INC_DEC( True, t2, ty );
3286             break;
3287          case 1: /* DEC */
3288             t2 = newTemp(ty);
3289             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3290                              mkexpr(t1), mkU(ty,1)));
3291             if (locked) {
3292                casLE( mkexpr(addr),
3293                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3294             } else {
3295                storeLE(mkexpr(addr),mkexpr(t2));
3296             }
3297             setFlags_INC_DEC( False, t2, ty );
3298             break;
3299          case 2: /* call Ev */
3300             vassert(sz == 4);
3301             t2 = newTemp(Ity_I32);
3302             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3303             putIReg(4, R_ESP, mkexpr(t2));
3304             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
3305             jmp_treg(dres, Ijk_Call, t1);
3306             vassert(dres->whatNext == Dis_StopHere);
3307             break;
3308          case 4: /* JMP Ev */
3309             vassert(sz == 4);
3310             jmp_treg(dres, Ijk_Boring, t1);
3311             vassert(dres->whatNext == Dis_StopHere);
3312             break;
3313          case 6: /* PUSH Ev */
3314             vassert(sz == 4 || sz == 2);
3315             t2 = newTemp(Ity_I32);
3316             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3317             putIReg(4, R_ESP, mkexpr(t2) );
3318             storeLE( mkexpr(t2), mkexpr(t1) );
3319             break;
3320          default:
3321             *decode_OK = False;
3322             return delta;
3323       }
3324       delta += len;
3325       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3326                        nameISize(sz), dis_buf);
3327    }
3328    return delta;
3329 }
3330 
3331 
3332 /*------------------------------------------------------------*/
3333 /*--- Disassembling string ops (including REP prefixes)    ---*/
3334 /*------------------------------------------------------------*/
3335 
3336 /* Code shared by all the string ops */
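/* The guest DFLAG field holds the direction flag as a signed
   increment, +1 or -1.  Shifting it left by sz/2 scales it to the
   element size: e.g. for sz == 4 the shift by 2 turns +/-1 into
   +/-4; for sz == 1 no shift is needed. */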
3337 static
3338 void dis_string_op_increment(Int sz, Int t_inc)
3339 {
3340    if (sz == 4 || sz == 2) {
3341       assign( t_inc,
3342               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
3343                                mkU8(sz/2) ) );
3344    } else {
3345       assign( t_inc,
3346               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
3347    }
3348 }
3349 
3350 static
3351 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
3352                     Int sz, const HChar* name, UChar sorb )
3353 {
3354    IRTemp t_inc = newTemp(Ity_I32);
3355    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
3356    dis_string_op_increment(sz, t_inc);
3357    dis_OP( sz, t_inc );
3358    DIP("%s%c\n", name, nameISize(sz));
3359 }
3360 
3361 static
3362 void dis_MOVS ( Int sz, IRTemp t_inc )
3363 {
3364    IRType ty = szToITy(sz);
3365    IRTemp td = newTemp(Ity_I32);   /* EDI */
3366    IRTemp ts = newTemp(Ity_I32);   /* ESI */
3367 
3368    assign( td, getIReg(4, R_EDI) );
3369    assign( ts, getIReg(4, R_ESI) );
3370 
3371    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
3372 
3373    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3374    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3375 }
3376 
3377 static
3378 void dis_LODS ( Int sz, IRTemp t_inc )
3379 {
3380    IRType ty = szToITy(sz);
3381    IRTemp ts = newTemp(Ity_I32);   /* ESI */
3382 
3383    assign( ts, getIReg(4, R_ESI) );
3384 
3385    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
3386 
3387    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3388 }
3389 
3390 static
3391 void dis_STOS ( Int sz, IRTemp t_inc )
3392 {
3393    IRType ty = szToITy(sz);
3394    IRTemp ta = newTemp(ty);        /* EAX */
3395    IRTemp td = newTemp(Ity_I32);   /* EDI */
3396 
3397    assign( ta, getIReg(sz, R_EAX) );
3398    assign( td, getIReg(4, R_EDI) );
3399 
3400    storeLE( mkexpr(td), mkexpr(ta) );
3401 
3402    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3403 }
3404 
3405 static
3406 void dis_CMPS ( Int sz, IRTemp t_inc )
3407 {
3408    IRType ty  = szToITy(sz);
3409    IRTemp tdv = newTemp(ty);      /* (EDI) */
3410    IRTemp tsv = newTemp(ty);      /* (ESI) */
3411    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
3412    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
3413 
3414    assign( td, getIReg(4, R_EDI) );
3415    assign( ts, getIReg(4, R_ESI) );
3416 
3417    assign( tdv, loadLE(ty,mkexpr(td)) );
3418    assign( tsv, loadLE(ty,mkexpr(ts)) );
3419 
3420    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
3421 
3422    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3423    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3424 }
3425 
3426 static
3427 void dis_SCAS ( Int sz, IRTemp t_inc )
3428 {
3429    IRType ty  = szToITy(sz);
3430    IRTemp ta  = newTemp(ty);       /*  EAX  */
3431    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
3432    IRTemp tdv = newTemp(ty);       /* (EDI) */
3433 
3434    assign( ta, getIReg(sz, R_EAX) );
3435    assign( td, getIReg(4, R_EDI) );
3436 
3437    assign( tdv, loadLE(ty,mkexpr(td)) );
3438    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
3439 
3440    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3441 }
3442 
3443 
3444 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
3445    We assume the insn is the last one in the basic block, and so emit a jump
3446    to the next insn, rather than just falling through. */
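/* In outline, the IR emitted below is:
      if (ECX == 0) goto eip_next;       -- count exhausted, skip
      ECX := ECX - 1;
      <one iteration of the string op>;
      REP (CondAlways):  goto eip;       -- re-execute this insn
      REPE/REPNE:        if (cond) goto eip; else goto eip_next;
   so each iteration re-enters this translation at eip. */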
3447 static
3448 void dis_REP_op ( /*MOD*/DisResult* dres,
3449                   X86Condcode cond,
3450                   void (*dis_OP)(Int, IRTemp),
3451                   Int sz, Addr32 eip, Addr32 eip_next, const HChar* name )
3452 {
3453    IRTemp t_inc = newTemp(Ity_I32);
3454    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
3455 
3456    assign( tc, getIReg(4,R_ECX) );
3457 
3458    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
3459                       Ijk_Boring,
3460                       IRConst_U32(eip_next), OFFB_EIP ) );
3461 
3462    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
3463 
3464    dis_string_op_increment(sz, t_inc);
3465    dis_OP (sz, t_inc);
3466 
3467    if (cond == X86CondAlways) {
3468       jmp_lit(dres, Ijk_Boring, eip);
3469       vassert(dres->whatNext == Dis_StopHere);
3470    } else {
3471       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
3472                          Ijk_Boring,
3473                          IRConst_U32(eip), OFFB_EIP ) );
3474       jmp_lit(dres, Ijk_Boring, eip_next);
3475       vassert(dres->whatNext == Dis_StopHere);
3476    }
3477    DIP("%s%c\n", name, nameISize(sz));
3478 }
3479 
3480 
3481 /*------------------------------------------------------------*/
3482 /*--- Arithmetic, etc.                                     ---*/
3483 /*------------------------------------------------------------*/
3484 
3485 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
3486 static
3487 UInt dis_mul_E_G ( UChar       sorb,
3488                    Int         size,
3489                    Int         delta0 )
3490 {
3491    Int    alen;
3492    HChar  dis_buf[50];
3493    UChar  rm = getIByte(delta0);
3494    IRType ty = szToITy(size);
3495    IRTemp te = newTemp(ty);
3496    IRTemp tg = newTemp(ty);
3497    IRTemp resLo = newTemp(ty);
3498 
3499    assign( tg, getIReg(size, gregOfRM(rm)) );
3500    if (epartIsReg(rm)) {
3501       assign( te, getIReg(size, eregOfRM(rm)) );
3502    } else {
3503       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
3504       assign( te, loadLE(ty,mkexpr(addr)) );
3505    }
3506 
3507    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
3508 
3509    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
3510 
3511    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
3512 
3513    if (epartIsReg(rm)) {
3514       DIP("imul%c %s, %s\n", nameISize(size),
3515                              nameIReg(size,eregOfRM(rm)),
3516                              nameIReg(size,gregOfRM(rm)));
3517       return 1+delta0;
3518    } else {
3519       DIP("imul%c %s, %s\n", nameISize(size),
3520                              dis_buf, nameIReg(size,gregOfRM(rm)));
3521       return alen+delta0;
3522    }
3523 }
3524 
3525 
3526 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
3527 static
3528 UInt dis_imul_I_E_G ( UChar       sorb,
3529                       Int         size,
3530                       Int         delta,
3531                       Int         litsize )
3532 {
3533    Int    d32, alen;
3534    HChar  dis_buf[50];
3535    UChar  rm = getIByte(delta);
3536    IRType ty = szToITy(size);
3537    IRTemp te = newTemp(ty);
3538    IRTemp tl = newTemp(ty);
3539    IRTemp resLo = newTemp(ty);
3540 
3541    vassert(size == 1 || size == 2 || size == 4);
3542 
3543    if (epartIsReg(rm)) {
3544       assign(te, getIReg(size, eregOfRM(rm)));
3545       delta++;
3546    } else {
3547       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
3548       assign(te, loadLE(ty, mkexpr(addr)));
3549       delta += alen;
3550    }
3551    d32 = getSDisp(litsize,delta);
3552    delta += litsize;
3553 
3554    if (size == 1) d32 &= 0xFF;
3555    if (size == 2) d32 &= 0xFFFF;
3556 
3557    assign(tl, mkU(ty,d32));
3558 
3559    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
3560 
3561    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
3562 
3563    putIReg(size, gregOfRM(rm), mkexpr(resLo));
3564 
3565    DIP("imul %d, %s, %s\n", d32,
3566        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
3567        nameIReg(size,gregOfRM(rm)) );
3568    return delta;
3569 }
3570 
3571 
3572 /* Generate an IR sequence to do a count-leading-zeroes operation on
3573    the supplied IRTemp, and return a new IRTemp holding the result.
3574    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
3575    argument is zero, return the number of bits in the word (the
3576    natural semantics). */
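/* Worked example: for ty == Ity_I16 and src == 0x0001, src32x is
   0x00010000, and Iop_Clz32 gives 15, the right answer for a 16-bit
   count; for src == 0 the ITE's first arm yields 16 instead, which
   sidesteps Clz32's undefined behaviour on a zero input. */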
3577 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
3578 {
3579    vassert(ty == Ity_I32 || ty == Ity_I16);
3580 
3581    IRTemp src32 = newTemp(Ity_I32);
3582    assign(src32, widenUto32( mkexpr(src) ));
3583 
3584    IRTemp src32x = newTemp(Ity_I32);
3585    assign(src32x,
3586           binop(Iop_Shl32, mkexpr(src32),
3587                            mkU8(32 - 8 * sizeofIRType(ty))));
3588 
3589    // Clz32 has undefined semantics when its input is zero, so
3590    // special-case around that.
3591    IRTemp res32 = newTemp(Ity_I32);
3592    assign(res32,
3593           IRExpr_ITE(
3594              binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0)),
3595              mkU32(8 * sizeofIRType(ty)),
3596              unop(Iop_Clz32, mkexpr(src32x))
3597    ));
3598 
3599    IRTemp res = newTemp(ty);
3600    assign(res, narrowTo(ty, mkexpr(res32)));
3601    return res;
3602 }
3603 
3604 
3605 /*------------------------------------------------------------*/
3606 /*---                                                      ---*/
3607 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
3608 /*---                                                      ---*/
3609 /*------------------------------------------------------------*/
3610 
3611 /* --- Helper functions for dealing with the register stack. --- */
3612 
3613 /* --- Set the emulation-warning pseudo-register. --- */
3614 
3615 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
3616 {
3617    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3618    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
3619 }
3620 
3621 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
3622 
3623 static IRExpr* mkQNaN64 ( void )
3624 {
3625   /* QNaN is sign 0, exponent 2047 (all ones), mantissa 1 then 51 zeros
3626      == 0b 0 11111111111 1 0(51 times)
3627      == 0x7FF8 0000 0000 0000
3628    */
3629    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
3630 }
3631 
3632 /* --------- Get/put the top-of-stack pointer. --------- */
3633 
3634 static IRExpr* get_ftop ( void )
3635 {
3636    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
3637 }
3638 
3639 static void put_ftop ( IRExpr* e )
3640 {
3641    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3642    stmt( IRStmt_Put( OFFB_FTOP, e ) );
3643 }
3644 
3645 /* --------- Get/put the C3210 bits. --------- */
3646 
3647 static IRExpr* get_C3210 ( void )
3648 {
3649    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
3650 }
3651 
3652 static void put_C3210 ( IRExpr* e )
3653 {
3654    stmt( IRStmt_Put( OFFB_FC3210, e ) );
3655 }
3656 
3657 /* --------- Get/put the FPU rounding mode. --------- */
3658 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
3659 {
3660    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
3661 }
3662 
3663 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
3664 {
3665    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
3666 }
3667 
3668 
3669 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
3670 /* Produces a value in 0 .. 3, which is encoded as per the type
3671    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
3672    per IRRoundingMode, we merely need to get it and mask it for
3673    safety.
3674 */
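/* For reference, the 2-bit IRRoundingMode encoding is 0 = to
   nearest, 1 = towards -infinity, 2 = towards +infinity, 3 = towards
   zero, which matches the x87 RC field encoding; that is why a
   simple mask suffices here. */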
3675 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
3676 {
3677    return binop( Iop_And32, get_fpround(), mkU32(3) );
3678 }
3679 
3680 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
3681 {
3682    return mkU32(Irrm_NEAREST);
3683 }
3684 
3685 
3686 /* --------- Get/set FP register tag bytes. --------- */
3687 
3688 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
3689 
3690 static void put_ST_TAG ( Int i, IRExpr* value )
3691 {
3692    IRRegArray* descr;
3693    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
3694    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3695    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3696 }
3697 
3698 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
3699    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
3700 
3701 static IRExpr* get_ST_TAG ( Int i )
3702 {
3703    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3704    return IRExpr_GetI( descr, get_ftop(), i );
3705 }
3706 
3707 
3708 /* --------- Get/set FP registers. --------- */
3709 
3710 /* Given i, and some expression e, emit 'ST(i) = e' and set the
3711    register's tag to indicate the register is full.  The previous
3712    state of the register is not checked. */
3713 
3714 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
3715 {
3716    IRRegArray* descr;
3717    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
3718    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3719    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3720    /* Mark the register as in-use. */
3721    put_ST_TAG(i, mkU8(1));
3722 }
3723 
3724 /* Given i, and some expression e, emit
3725       ST(i) = is_full(i) ? NaN : e
3726    and set the tag accordingly.
3727 */
3728 
3729 static void put_ST ( Int i, IRExpr* value )
3730 {
3731    put_ST_UNCHECKED(
3732       i,
3733       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
3734                   /* non-0 means full */
3735                   mkQNaN64(),
3736                   /* 0 means empty */
3737                   value
3738       )
3739    );
3740 }
3741 
3742 
3743 /* Given i, generate an expression yielding 'ST(i)'. */
3744 
3745 static IRExpr* get_ST_UNCHECKED ( Int i )
3746 {
3747    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3748    return IRExpr_GetI( descr, get_ftop(), i );
3749 }
3750 
3751 
3752 /* Given i, generate an expression yielding
3753   is_full(i) ? ST(i) : NaN
3754 */
3755 
3756 static IRExpr* get_ST ( Int i )
3757 {
3758    return
3759       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
3760                   /* non-0 means full */
3761                   get_ST_UNCHECKED(i),
3762                   /* 0 means empty */
3763                   mkQNaN64());
3764 }
3765 
3766 
3767 /* Given i, and some expression e, and a condition cond, generate IR
3768    which has the same effect as put_ST(i,e) when cond is true and has
3769    no effect when cond is false.  Given the lack of proper
3770    if-then-else in the IR, this is pretty tricky.
3771 */
3772 
3773 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
3774 {
3775    // new_tag = if cond then FULL else old_tag
3776    // new_val = if cond then (if old_tag==FULL then NaN else val)
3777    //                   else old_val
3778 
3779    IRTemp old_tag = newTemp(Ity_I8);
3780    assign(old_tag, get_ST_TAG(i));
3781    IRTemp new_tag = newTemp(Ity_I8);
3782    assign(new_tag,
3783           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
3784 
3785    IRTemp old_val = newTemp(Ity_F64);
3786    assign(old_val, get_ST_UNCHECKED(i));
3787    IRTemp new_val = newTemp(Ity_F64);
3788    assign(new_val,
3789           IRExpr_ITE(mkexpr(cond),
3790                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
3791                                 /* non-0 means full */
3792                                 mkQNaN64(),
3793                                 /* 0 means empty */
3794                                 value),
3795                      mkexpr(old_val)));
3796 
3797    put_ST_UNCHECKED(i, mkexpr(new_val));
3798    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
3799    // now set it to new_tag instead.
3800    put_ST_TAG(i, mkexpr(new_tag));
3801 }
3802 
3803 /* Adjust FTOP downwards by one register. */
3804 
3805 static void fp_push ( void )
3806 {
3807    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
3808 }
3809 
3810 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
3811    don't change it. */
3812 
3813 static void maybe_fp_push ( IRTemp cond )
3814 {
3815    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
3816 }
3817 
3818 /* Adjust FTOP upwards by one register, and mark the vacated register
3819    as empty.  */
3820 
3821 static void fp_pop ( void )
3822 {
3823    put_ST_TAG(0, mkU8(0));
3824    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
3825 }
3826 
3827 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
3828    e[31:1] == 0.
3829 */
3830 static void set_C2 ( IRExpr* e )
3831 {
3832    IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2));
3833    put_C3210( binop(Iop_Or32,
3834                     cleared,
3835                     binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) );
3836 }
3837 
3838 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
3839    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
3840    test is simple, but the derivation of it is not so simple.
3841 
3842    The exponent field for an IEEE754 double is 11 bits.  That means it
3843    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
3844    the number is either a NaN or an Infinity and so is not finite.
3845    Furthermore, a finite value of exactly 2^63 is the smallest value
3846    that has exponent value 0x43E.  Hence, what we need to do is
3847    extract the exponent, ignoring the sign bit and mantissa, and check
3848    it is < 0x43E, or <= 0x43D.
3849 
3850    To make this easily applicable to 32- and 64-bit targets, a
3851    roundabout approach is used.  First the number is converted to I64,
3852    then the top 32 bits are taken.  Shifting them right by 20 bits
3853    places the sign bit and exponent in the bottom 12 bits.  Anding
3854    with 0x7FF gets rid of the sign bit, leaving just the exponent
3855    available for comparison.
3856 */
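/* Worked example: d64 == 1.0 has bits 0x3FF0000000000000; the top
   word is 0x3FF00000, and shifting right by 20 then masking with
   0x7FF gives 0x3FF <= 0x43D, so it passes.  d64 == 2^63 has bits
   0x43E0000000000000, giving 0x43E, which fails -- as do NaNs and
   infinities, whose exponent field is 0x7FF. */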
3857 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
3858 {
3859    IRTemp i64 = newTemp(Ity_I64);
3860    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
3861    IRTemp exponent = newTemp(Ity_I32);
3862    assign(exponent,
3863           binop(Iop_And32,
3864                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
3865                 mkU32(0x7FF)));
3866    IRTemp in_range_and_finite = newTemp(Ity_I1);
3867    assign(in_range_and_finite,
3868           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
3869    return in_range_and_finite;
3870 }
3871 
3872 /* Invent a plausible-looking FPU status word value:
3873       ((ftop & 7) << 11) | (c3210 & 0x4700)
3874  */
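/* That places TOP in bits 13..11 of the status word, and the 0x4700
   mask keeps C3 (bit 14) and C2..C0 (bits 10..8) from the C3210
   word; every other status bit reads as zero. */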
3875 static IRExpr* get_FPU_sw ( void )
3876 {
3877    return
3878       unop(Iop_32to16,
3879            binop(Iop_Or32,
3880                  binop(Iop_Shl32,
3881                        binop(Iop_And32, get_ftop(), mkU32(7)),
3882                        mkU8(11)),
3883                  binop(Iop_And32, get_C3210(), mkU32(0x4700))
3884       ));
3885 }
3886 
3887 
3888 /* ------------------------------------------------------- */
3889 /* Given all that stack-mangling junk, we can now go ahead
3890    and describe FP instructions.
3891 */
3892 
3893 /* ST(0) = ST(0) `op` mem64/32(addr)
3894    Need to check ST(0)'s tag on read, but not on write.
3895 */
3896 static
3897 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
3898                          IROp op, Bool dbl )
3899 {
3900    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3901    if (dbl) {
3902       put_ST_UNCHECKED(0,
3903          triop( op,
3904                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3905                 get_ST(0),
3906                 loadLE(Ity_F64,mkexpr(addr))
3907          ));
3908    } else {
3909       put_ST_UNCHECKED(0,
3910          triop( op,
3911                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3912                 get_ST(0),
3913                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
3914          ));
3915    }
3916 }
3917 
3918 
3919 /* ST(0) = mem64/32(addr) `op` ST(0)
3920    Need to check ST(0)'s tag on read, but not on write.
3921 */
3922 static
3923 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
3924                             IROp op, Bool dbl )
3925 {
3926    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3927    if (dbl) {
3928       put_ST_UNCHECKED(0,
3929          triop( op,
3930                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3931                 loadLE(Ity_F64,mkexpr(addr)),
3932                 get_ST(0)
3933          ));
3934    } else {
3935       put_ST_UNCHECKED(0,
3936          triop( op,
3937                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3938                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
3939                 get_ST(0)
3940          ));
3941    }
3942 }
3943 
3944 
3945 /* ST(dst) = ST(dst) `op` ST(src).
3946    Check dst and src tags when reading but not on write.
3947 */
3948 static
3949 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3950                       Bool pop_after )
3951 {
3952    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"",
3953                                  st_src, st_dst);
3954    put_ST_UNCHECKED(
3955       st_dst,
3956       triop( op,
3957              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3958              get_ST(st_dst),
3959              get_ST(st_src) )
3960    );
3961    if (pop_after)
3962       fp_pop();
3963 }
3964 
3965 /* ST(dst) = ST(src) `op` ST(dst).
3966    Check dst and src tags when reading but not on write.
3967 */
3968 static
3969 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src,
3970                          UInt st_dst, Bool pop_after )
3971 {
3972    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"",
3973                                  st_src, st_dst);
3974    put_ST_UNCHECKED(
3975       st_dst,
3976       triop( op,
3977              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3978              get_ST(st_src),
3979              get_ST(st_dst) )
3980    );
3981    if (pop_after)
3982       fp_pop();
3983 }
3984 
3985 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
3986 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
3987 {
3988    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
3989    /* This is a bit of a hack (and isn't really right).  It sets
3990       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
3991       documentation implies A and S are unchanged.
3992    */
3993    /* It's also fishy in that it is used both for COMIP and
3994       UCOMIP, and they aren't the same (although similar). */
3995    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
3996    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
3997    stmt( IRStmt_Put( OFFB_CC_DEP1,
3998                      binop( Iop_And32,
3999                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
4000                             mkU32(0x45)
4001        )));
4002    /* Set NDEP even though it isn't used.  This makes redundant-PUT
4003       elimination of previous stores to this field work better. */
4004    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
4005    if (pop_after)
4006       fp_pop();
4007 }
4008 
4009 
4010 static
4011 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
4012 {
4013    Int    len;
4014    UInt   r_src, r_dst;
4015    HChar  dis_buf[50];
4016    IRTemp t1, t2;
4017 
4018    /* On entry, delta points at the second byte of the insn (the modrm
4019       byte).*/
4020    UChar first_opcode = getIByte(delta-1);
4021    UChar modrm        = getIByte(delta+0);
4022 
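   /* A note on the decoding pattern used throughout this function:
      for each leading byte 0xD8 .. 0xDF, a modRM byte < 0xC0 means
      bits 5..3 of it are an opcode extension and the rest describes
      a memory operand, whereas a modRM byte >= 0xC0 is itself the
      complete selector for a register(-stack) form. */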
4023    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
4024 
4025    if (first_opcode == 0xD8) {
4026       if (modrm < 0xC0) {
4027 
4028          /* bits 5,4,3 are an opcode extension, and the modRM also
4029            specifies an address. */
4030          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4031          delta += len;
4032 
4033          switch (gregOfRM(modrm)) {
4034 
4035             case 0: /* FADD single-real */
4036                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
4037                break;
4038 
4039             case 1: /* FMUL single-real */
4040                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
4041                break;
4042 
4043             case 2: /* FCOM single-real */
4044                DIP("fcoms %s\n", dis_buf);
4045                /* This forces C1 to zero, which isn't right. */
4046                put_C3210(
4047                    binop( Iop_And32,
4048                           binop(Iop_Shl32,
4049                                 binop(Iop_CmpF64,
4050                                       get_ST(0),
4051                                       unop(Iop_F32toF64,
4052                                            loadLE(Ity_F32,mkexpr(addr)))),
4053                                 mkU8(8)),
4054                           mkU32(0x4500)
4055                    ));
4056                break;
4057 
4058             case 3: /* FCOMP single-real */
4059                DIP("fcomps %s\n", dis_buf);
4060                /* This forces C1 to zero, which isn't right. */
4061                put_C3210(
4062                    binop( Iop_And32,
4063                           binop(Iop_Shl32,
4064                                 binop(Iop_CmpF64,
4065                                       get_ST(0),
4066                                       unop(Iop_F32toF64,
4067                                            loadLE(Ity_F32,mkexpr(addr)))),
4068                                 mkU8(8)),
4069                           mkU32(0x4500)
4070                    ));
4071                fp_pop();
4072                break;
4073 
4074             case 4: /* FSUB single-real */
4075                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
4076                break;
4077 
4078             case 5: /* FSUBR single-real */
4079                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
4080                break;
4081 
4082             case 6: /* FDIV single-real */
4083                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
4084                break;
4085 
4086             case 7: /* FDIVR single-real */
4087                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
4088                break;
4089 
4090             default:
4091                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4092                vex_printf("first_opcode == 0xD8\n");
4093                goto decode_fail;
4094          }
4095       } else {
4096          delta++;
4097          switch (modrm) {
4098 
4099             case 0xc0:
4100             case 0xc1:
4101             case 0xc2:
4102             case 0xc3:
4103             case 0xc4:
4104             case 0xc5:
4105             case 0xc6:
4106             case 0xc7: /* FADD %st(?),%st(0) */
4107                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
4108                break;
4109 
4110             case 0xc8:
4111             case 0xc9:
4112             case 0xca:
4113             case 0xcb:
4114             case 0xcc:
4115             case 0xcd:
4116             case 0xce:
4117             case 0xcf: /* FMUL %st(?),%st(0) */
4118                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
4119                break;
4120 
4121             /* Dunno if this is right */
4122             case 0xd0:
4123             case 0xd1:
4124             case 0xd2:
4125             case 0xd3:
4126             case 0xd4:
4127             case 0xd5:
4128             case 0xd6:
4129             case 0xd7: /* FCOM %st(?),%st(0) */
4130                r_dst = (UInt)modrm - 0xD0;
4131                DIP("fcom %%st(0),%%st(%u)\n", r_dst);
4132                /* This forces C1 to zero, which isn't right. */
4133                put_C3210(
4134                    binop( Iop_And32,
4135                           binop(Iop_Shl32,
4136                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
4137                                 mkU8(8)),
4138                           mkU32(0x4500)
4139                    ));
4140                break;
4141 
4142             /* Dunno if this is right */
4143             case 0xd8:
4144             case 0xd9:
4145             case 0xda:
4146             case 0xdb:
4147             case 0xdc:
4148             case 0xdd:
4149             case 0xde:
4150             case 0xdf: /* FCOMP %st(?),%st(0) */
4151                r_dst = (UInt)modrm - 0xD8;
4152                DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
4153                /* This forces C1 to zero, which isn't right. */
4154                put_C3210(
4155                    binop( Iop_And32,
4156                           binop(Iop_Shl32,
4157                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
4158                                 mkU8(8)),
4159                           mkU32(0x4500)
4160                    ));
4161                fp_pop();
4162                break;
4163 
4164             case 0xe0:
4165             case 0xe1:
4166             case 0xe2:
4167             case 0xe3:
4168             case 0xe4:
4169             case 0xe5:
4170             case 0xe6:
4171             case 0xe7: /* FSUB %st(?),%st(0) */
4172                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
4173                break;
4174 
4175             case 0xe8:
4176             case 0xe9:
4177             case 0xea:
4178             case 0xeb:
4179             case 0xec:
4180             case 0xed:
4181             case 0xee:
4182             case 0xef: /* FSUBR %st(?),%st(0) */
4183                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
4184                break;
4185 
4186             case 0xf0:
4187             case 0xf1:
4188             case 0xf2:
4189             case 0xf3:
4190             case 0xf4:
4191             case 0xf5:
4192             case 0xf6:
4193             case 0xf7: /* FDIV %st(?),%st(0) */
4194                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
4195                break;
4196 
4197             case 0xf8:
4198             case 0xf9:
4199             case 0xfa:
4200             case 0xfb:
4201             case 0xfc:
4202             case 0xfd:
4203             case 0xfe:
4204             case 0xff: /* FDIVR %st(?),%st(0) */
4205                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
4206                break;
4207 
4208             default:
4209                goto decode_fail;
4210          }
4211       }
4212    }
4213 
4214    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
4215    else
4216    if (first_opcode == 0xD9) {
4217       if (modrm < 0xC0) {
4218 
4219          /* bits 5,4,3 are an opcode extension, and the modRM also
4220             specifies an address. */
4221          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4222          delta += len;
4223 
4224          switch (gregOfRM(modrm)) {
4225 
4226             case 0: /* FLD single-real */
4227                DIP("flds %s\n", dis_buf);
4228                fp_push();
4229                put_ST(0, unop(Iop_F32toF64,
4230                               loadLE(Ity_F32, mkexpr(addr))));
4231                break;
4232 
4233             case 2: /* FST single-real */
4234                DIP("fsts %s\n", dis_buf);
4235                storeLE(mkexpr(addr),
4236                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
4237                break;
4238 
4239             case 3: /* FSTP single-real */
4240                DIP("fstps %s\n", dis_buf);
4241                storeLE(mkexpr(addr),
4242                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
4243                fp_pop();
4244                break;
4245 
4246             case 4: { /* FLDENV m28 */
4247                /* Uses dirty helper:
4248                      VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
4249                IRTemp   ew = newTemp(Ity_I32);
4250                IRDirty* d  = unsafeIRDirty_0_N (
4251                                 0/*regparms*/,
4252                                 "x86g_dirtyhelper_FLDENV",
4253                                 &x86g_dirtyhelper_FLDENV,
4254                                 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
4255                              );
4256                d->tmp   = ew;
4257                /* declare we're reading memory */
4258                d->mFx   = Ifx_Read;
4259                d->mAddr = mkexpr(addr);
4260                d->mSize = 28;
4261 
4262                /* declare we're writing guest state */
4263                d->nFxState = 4;
4264                vex_bzero(&d->fxState, sizeof(d->fxState));
4265 
4266                d->fxState[0].fx     = Ifx_Write;
4267                d->fxState[0].offset = OFFB_FTOP;
4268                d->fxState[0].size   = sizeof(UInt);
4269 
4270                d->fxState[1].fx     = Ifx_Write;
4271                d->fxState[1].offset = OFFB_FPTAGS;
4272                d->fxState[1].size   = 8 * sizeof(UChar);
4273 
4274                d->fxState[2].fx     = Ifx_Write;
4275                d->fxState[2].offset = OFFB_FPROUND;
4276                d->fxState[2].size   = sizeof(UInt);
4277 
4278                d->fxState[3].fx     = Ifx_Write;
4279                d->fxState[3].offset = OFFB_FC3210;
4280                d->fxState[3].size   = sizeof(UInt);
4281 
4282                stmt( IRStmt_Dirty(d) );
4283 
4284                /* ew contains any emulation warning we may need to
4285                   issue.  If needed, side-exit to the next insn,
4286                   reporting the warning, so that Valgrind's dispatcher
4287                   sees the warning. */
4288                put_emwarn( mkexpr(ew) );
4289                stmt(
4290                   IRStmt_Exit(
4291                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4292                      Ijk_EmWarn,
4293                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4294                      OFFB_EIP
4295                   )
4296                );
4297 
4298                DIP("fldenv %s\n", dis_buf);
4299                break;
4300             }
4301 
4302             case 5: {/* FLDCW */
4303                /* The only thing we observe in the control word is the
4304                   rounding mode.  Therefore, pass the 16-bit value
4305                   (x87 native-format control word) to a clean helper,
4306                   getting back a 64-bit value, the lower half of which
4307                   is the FPROUND value to store, and the upper half of
4308                   which is the emulation-warning token which may be
4309                   generated.
4310                */
4311                /* ULong x86g_check_fldcw ( UInt ); */
4312                IRTemp t64 = newTemp(Ity_I64);
4313                IRTemp ew = newTemp(Ity_I32);
4314                DIP("fldcw %s\n", dis_buf);
4315                assign( t64, mkIRExprCCall(
4316                                Ity_I64, 0/*regparms*/,
4317                                "x86g_check_fldcw",
4318                                &x86g_check_fldcw,
4319                                mkIRExprVec_1(
4320                                   unop( Iop_16Uto32,
4321                                         loadLE(Ity_I16, mkexpr(addr)))
4322                                )
4323                             )
4324                      );
4325 
4326                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
4327                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
4328                put_emwarn( mkexpr(ew) );
4329                /* Finally, if an emulation warning was reported,
4330                   side-exit to the next insn, reporting the warning,
4331                   so that Valgrind's dispatcher sees the warning. */
4332                stmt(
4333                   IRStmt_Exit(
4334                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4335                      Ijk_EmWarn,
4336                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4337                      OFFB_EIP
4338                   )
4339                );
4340                break;
4341             }
4342 
4343             case 6: { /* FNSTENV m28 */
4344                /* Uses dirty helper:
4345                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
4346                IRDirty* d = unsafeIRDirty_0_N (
4347                                0/*regparms*/,
4348                                "x86g_dirtyhelper_FSTENV",
4349                                &x86g_dirtyhelper_FSTENV,
4350                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
4351                             );
4352                /* declare we're writing memory */
4353                d->mFx   = Ifx_Write;
4354                d->mAddr = mkexpr(addr);
4355                d->mSize = 28;
4356 
4357                /* declare we're reading guest state */
4358                d->nFxState = 4;
4359                vex_bzero(&d->fxState, sizeof(d->fxState));
4360 
4361                d->fxState[0].fx     = Ifx_Read;
4362                d->fxState[0].offset = OFFB_FTOP;
4363                d->fxState[0].size   = sizeof(UInt);
4364 
4365                d->fxState[1].fx     = Ifx_Read;
4366                d->fxState[1].offset = OFFB_FPTAGS;
4367                d->fxState[1].size   = 8 * sizeof(UChar);
4368 
4369                d->fxState[2].fx     = Ifx_Read;
4370                d->fxState[2].offset = OFFB_FPROUND;
4371                d->fxState[2].size   = sizeof(UInt);
4372 
4373                d->fxState[3].fx     = Ifx_Read;
4374                d->fxState[3].offset = OFFB_FC3210;
4375                d->fxState[3].size   = sizeof(UInt);
4376 
4377                stmt( IRStmt_Dirty(d) );
4378 
4379                DIP("fnstenv %s\n", dis_buf);
4380                break;
4381             }
4382 
4383             case 7: /* FNSTCW */
4384               /* Fake up a native x87 FPU control word.  The only
4385                  thing it depends on is FPROUND[1:0], so call a clean
4386                  helper to cook it up. */
               /* UInt x86g_create_fpucw ( UInt fpround ) */
4388                DIP("fnstcw %s\n", dis_buf);
4389                storeLE(
4390                   mkexpr(addr),
4391                   unop( Iop_32to16,
4392                         mkIRExprCCall(
4393                            Ity_I32, 0/*regp*/,
4394                            "x86g_create_fpucw", &x86g_create_fpucw,
4395                            mkIRExprVec_1( get_fpround() )
4396                         )
4397                   )
4398                );
4399                break;
4400 
4401             default:
4402                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4403                vex_printf("first_opcode == 0xD9\n");
4404                goto decode_fail;
4405          }
4406 
4407       } else {
4408          delta++;
4409          switch (modrm) {
4410 
4411             case 0xc0:
4412             case 0xc1:
4413             case 0xc2:
4414             case 0xc3:
4415             case 0xc4:
4416             case 0xc5:
4417             case 0xc6:
4418             case 0xc7: /* FLD %st(?) */
4419                r_src = (UInt)modrm - 0xC0;
4420                DIP("fld %%st(%u)\n", r_src);
4421                t1 = newTemp(Ity_F64);
4422                assign(t1, get_ST(r_src));
4423                fp_push();
4424                put_ST(0, mkexpr(t1));
4425                break;
4426 
4427             case 0xc8:
4428             case 0xc9:
4429             case 0xca:
4430             case 0xcb:
4431             case 0xcc:
4432             case 0xcd:
4433             case 0xce:
4434             case 0xcf: /* FXCH %st(?) */
4435                r_src = (UInt)modrm - 0xC8;
4436                DIP("fxch %%st(%u)\n", r_src);
4437                t1 = newTemp(Ity_F64);
4438                t2 = newTemp(Ity_F64);
4439                assign(t1, get_ST(0));
4440                assign(t2, get_ST(r_src));
4441                put_ST_UNCHECKED(0, mkexpr(t2));
4442                put_ST_UNCHECKED(r_src, mkexpr(t1));
4443                break;
4444 
4445             case 0xE0: /* FCHS */
4446                DIP("fchs\n");
4447                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
4448                break;
4449 
4450             case 0xE1: /* FABS */
4451                DIP("fabs\n");
4452                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
4453                break;
4454 
4455             case 0xE4: /* FTST */
4456                DIP("ftst\n");
4457                /* This forces C1 to zero, which isn't right. */
4458                /* Well, in fact the Intel docs say (bizarrely): "C1 is
4459                   set to 0 if stack underflow occurred; otherwise, set
4460                   to 0" which is pretty nonsensical.  I guess it's a
4461                    typo. */
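               /* Iop_CmpF64 produces an IRCmpF64Result in the low byte:
                  0x45 unordered, 0x40 equal, 0x01 less, 0x00 greater.
                  Shifting it left by 8 and masking with 0x4500 drops it
                  straight into the x87 condition bits C3 (bit 14),
                  C2 (bit 10) and C0 (bit 8), and clears C1 (bit 9).
                  The same idiom is used by the other FCOM-style
                  comparisons below. */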
4462                put_C3210(
4463                    binop( Iop_And32,
4464                           binop(Iop_Shl32,
4465                                 binop(Iop_CmpF64,
4466                                       get_ST(0),
4467                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
4468                                 mkU8(8)),
4469                           mkU32(0x4500)
4470                    ));
4471                break;
4472 
4473             case 0xE5: { /* FXAM */
4474                /* This is an interesting one.  It examines %st(0),
4475                   regardless of whether the tag says it's empty or not.
4476                   Here, just pass both the tag (in our format) and the
4477                   value (as a double, actually a ULong) to a helper
4478                   function. */
4479                IRExpr** args
4480                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
4481                                    unop(Iop_ReinterpF64asI64,
4482                                         get_ST_UNCHECKED(0)) );
4483                put_C3210(mkIRExprCCall(
4484                             Ity_I32,
4485                             0/*regparm*/,
4486                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
4487                             args
4488                         ));
4489                DIP("fxam\n");
4490                break;
4491             }
4492 
4493             case 0xE8: /* FLD1 */
4494                DIP("fld1\n");
4495                fp_push();
4496                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
4497                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
4498                break;
4499 
4500             case 0xE9: /* FLDL2T */
4501                DIP("fldl2t\n");
4502                fp_push();
4503                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
4504                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
4505                break;
4506 
4507             case 0xEA: /* FLDL2E */
4508                DIP("fldl2e\n");
4509                fp_push();
4510                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
4511                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
4512                break;
4513 
4514             case 0xEB: /* FLDPI */
4515                DIP("fldpi\n");
4516                fp_push();
4517                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
4518                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
4519                break;
4520 
4521             case 0xEC: /* FLDLG2 */
4522                DIP("fldlg2\n");
4523                fp_push();
4524                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
4525                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
4526                break;
4527 
4528             case 0xED: /* FLDLN2 */
4529                DIP("fldln2\n");
4530                fp_push();
4531                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
4532                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
4533                break;
4534 
4535             case 0xEE: /* FLDZ */
4536                DIP("fldz\n");
4537                fp_push();
4538                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
4539                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
4540                break;
4541 
4542             case 0xF0: /* F2XM1 */
4543                DIP("f2xm1\n");
4544                put_ST_UNCHECKED(0,
4545                   binop(Iop_2xm1F64,
4546                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4547                         get_ST(0)));
4548                break;
4549 
4550             case 0xF1: /* FYL2X */
4551                DIP("fyl2x\n");
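               /* Computes %st(1) * log2(%st(0)); the result replaces
                  %st(1) and the stack is then popped, so it ends up in
                  the new %st(0). */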
4552                put_ST_UNCHECKED(1,
4553                   triop(Iop_Yl2xF64,
4554                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4555                         get_ST(1),
4556                         get_ST(0)));
4557                fp_pop();
4558                break;
4559 
4560             case 0xF2: { /* FPTAN */
4561                DIP("fptan\n");
4562                IRTemp argD = newTemp(Ity_F64);
4563                assign(argD, get_ST(0));
4564                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
4565                IRTemp resD = newTemp(Ity_F64);
4566                assign(resD,
4567                   IRExpr_ITE(
4568                      mkexpr(argOK),
4569                      binop(Iop_TanF64,
4570                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4571                            mkexpr(argD)),
4572                      mkexpr(argD))
4573                );
4574                put_ST_UNCHECKED(0, mkexpr(resD));
4575                /* Conditionally push 1.0 on the stack, if the arg is
4576                   in range */
4577                maybe_fp_push(argOK);
4578                maybe_put_ST(argOK, 0,
4579                             IRExpr_Const(IRConst_F64(1.0)));
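               /* C2 signals an out-of-range argument: it is set to 1
                  exactly when argOK is false, in which case the operand
                  was left unchanged and nothing was pushed, as on real
                  hardware. */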
4580                set_C2( binop(Iop_Xor32,
4581                              unop(Iop_1Uto32, mkexpr(argOK)),
4582                              mkU32(1)) );
4583                break;
4584             }
4585 
4586             case 0xF3: /* FPATAN */
4587                DIP("fpatan\n");
4588                put_ST_UNCHECKED(1,
4589                   triop(Iop_AtanF64,
4590                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4591                         get_ST(1),
4592                         get_ST(0)));
4593                fp_pop();
4594                break;
4595 
4596             case 0xF4: { /* FXTRACT */
4597                IRTemp argF = newTemp(Ity_F64);
4598                IRTemp sigF = newTemp(Ity_F64);
4599                IRTemp expF = newTemp(Ity_F64);
4600                IRTemp argI = newTemp(Ity_I64);
4601                IRTemp sigI = newTemp(Ity_I64);
4602                IRTemp expI = newTemp(Ity_I64);
4603                DIP("fxtract\n");
4604                assign( argF, get_ST(0) );
4605                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
4606                assign( sigI,
4607                        mkIRExprCCall(
4608                           Ity_I64, 0/*regparms*/,
4609                           "x86amd64g_calculate_FXTRACT",
4610                           &x86amd64g_calculate_FXTRACT,
4611                           mkIRExprVec_2( mkexpr(argI),
4612                                          mkIRExpr_HWord(0)/*sig*/ ))
4613                );
4614                assign( expI,
4615                        mkIRExprCCall(
4616                           Ity_I64, 0/*regparms*/,
4617                           "x86amd64g_calculate_FXTRACT",
4618                           &x86amd64g_calculate_FXTRACT,
4619                           mkIRExprVec_2( mkexpr(argI),
4620                                          mkIRExpr_HWord(1)/*exp*/ ))
4621                );
4622                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
4623                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
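               /* Write the exponent into %st(0) first, then push the
                  significand, so the final layout is %st(0) = significand,
                  %st(1) = exponent, matching what FXTRACT leaves behind. */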
4624                /* exponent */
4625                put_ST_UNCHECKED(0, mkexpr(expF) );
4626                fp_push();
4627                /* significand */
4628                put_ST(0, mkexpr(sigF) );
4629                break;
4630             }
4631 
4632             case 0xF5: { /* FPREM1 -- IEEE compliant */
4633                IRTemp a1 = newTemp(Ity_F64);
4634                IRTemp a2 = newTemp(Ity_F64);
4635                DIP("fprem1\n");
4636                /* Do FPREM1 twice, once to get the remainder, and once
4637                   to get the C3210 flag values. */
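               /* Iop_PRem1C3210F64 returns the resulting condition bits:
                  C2 = 1 means the reduction was incomplete; otherwise
                  C0, C3 and C1 hold the low three quotient bits, as for
                  the hardware FPREM1. */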
4638                assign( a1, get_ST(0) );
4639                assign( a2, get_ST(1) );
4640                put_ST_UNCHECKED(0,
4641                   triop(Iop_PRem1F64,
4642                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4643                         mkexpr(a1),
4644                         mkexpr(a2)));
4645                put_C3210(
4646                   triop(Iop_PRem1C3210F64,
4647                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4648                         mkexpr(a1),
4649                         mkexpr(a2)) );
4650                break;
4651             }
4652 
            case 0xF7: /* FINCSTP */
               DIP("fincstp\n");
4655                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4656                break;
4657 
4658             case 0xF8: { /* FPREM -- not IEEE compliant */
4659                IRTemp a1 = newTemp(Ity_F64);
4660                IRTemp a2 = newTemp(Ity_F64);
4661                DIP("fprem\n");
4662                /* Do FPREM twice, once to get the remainder, and once
4663                   to get the C3210 flag values. */
4664                assign( a1, get_ST(0) );
4665                assign( a2, get_ST(1) );
4666                put_ST_UNCHECKED(0,
4667                   triop(Iop_PRemF64,
4668                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4669                         mkexpr(a1),
4670                         mkexpr(a2)));
4671                put_C3210(
4672                   triop(Iop_PRemC3210F64,
4673                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4674                         mkexpr(a1),
4675                         mkexpr(a2)) );
4676                break;
4677             }
4678 
4679             case 0xF9: /* FYL2XP1 */
4680                DIP("fyl2xp1\n");
4681                put_ST_UNCHECKED(1,
4682                   triop(Iop_Yl2xp1F64,
4683                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4684                         get_ST(1),
4685                         get_ST(0)));
4686                fp_pop();
4687                break;
4688 
4689             case 0xFA: /* FSQRT */
4690                DIP("fsqrt\n");
4691                put_ST_UNCHECKED(0,
4692                   binop(Iop_SqrtF64,
4693                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4694                         get_ST(0)));
4695                break;
4696 
4697             case 0xFB: { /* FSINCOS */
4698                DIP("fsincos\n");
4699                IRTemp argD = newTemp(Ity_F64);
4700                assign(argD, get_ST(0));
4701                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
4702                IRTemp resD = newTemp(Ity_F64);
4703                assign(resD,
4704                   IRExpr_ITE(
4705                      mkexpr(argOK),
4706                      binop(Iop_SinF64,
4707                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4708                            mkexpr(argD)),
4709                      mkexpr(argD))
4710                );
4711                put_ST_UNCHECKED(0, mkexpr(resD));
4712                /* Conditionally push the cos value on the stack, if
4713                   the arg is in range */
4714                maybe_fp_push(argOK);
4715                maybe_put_ST(argOK, 0,
4716                   binop(Iop_CosF64,
4717                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4718                         mkexpr(argD)));
4719                set_C2( binop(Iop_Xor32,
4720                              unop(Iop_1Uto32, mkexpr(argOK)),
4721                              mkU32(1)) );
4722                break;
4723             }
4724 
4725             case 0xFC: /* FRNDINT */
4726                DIP("frndint\n");
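               /* Unlike the cases above that use get_FAKE_roundingmode(),
                  float-to-float rounding really does observe the guest
                  rounding mode, hence get_roundingmode() here. */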
4727                put_ST_UNCHECKED(0,
4728                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
4729                break;
4730 
4731             case 0xFD: /* FSCALE */
4732                DIP("fscale\n");
4733                put_ST_UNCHECKED(0,
4734                   triop(Iop_ScaleF64,
4735                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4736                         get_ST(0),
4737                         get_ST(1)));
4738                break;
4739 
4740             case 0xFE:   /* FSIN */
4741             case 0xFF: { /* FCOS */
4742                Bool isSIN = modrm == 0xFE;
4743                DIP("%s\n", isSIN ? "fsin" : "fcos");
4744                IRTemp argD = newTemp(Ity_F64);
4745                assign(argD, get_ST(0));
4746                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
4747                IRTemp resD = newTemp(Ity_F64);
4748                assign(resD,
4749                   IRExpr_ITE(
4750                      mkexpr(argOK),
4751                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
4752                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4753                            mkexpr(argD)),
4754                      mkexpr(argD))
4755                );
4756                put_ST_UNCHECKED(0, mkexpr(resD));
4757                set_C2( binop(Iop_Xor32,
4758                              unop(Iop_1Uto32, mkexpr(argOK)),
4759                              mkU32(1)) );
4760                break;
4761             }
4762 
4763             default:
4764                goto decode_fail;
4765          }
4766       }
4767    }
4768 
4769    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
4770    else
4771    if (first_opcode == 0xDA) {
4772 
4773       if (modrm < 0xC0) {
4774 
4775          /* bits 5,4,3 are an opcode extension, and the modRM also
4776             specifies an address. */
4777          IROp   fop;
4778          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4779          delta += len;
4780          switch (gregOfRM(modrm)) {
4781 
4782             case 0: /* FIADD m32int */ /* ST(0) += m32int */
4783                DIP("fiaddl %s\n", dis_buf);
4784                fop = Iop_AddF64;
4785                goto do_fop_m32;
4786 
4787             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
4788                DIP("fimull %s\n", dis_buf);
4789                fop = Iop_MulF64;
4790                goto do_fop_m32;
4791 
4792             case 2: /* FICOM m32int */
4793                DIP("ficoml %s\n", dis_buf);
4794                /* This forces C1 to zero, which isn't right. */
4795                put_C3210(
4796                    binop( Iop_And32,
4797                           binop(Iop_Shl32,
4798                                 binop(Iop_CmpF64,
4799                                       get_ST(0),
4800                                       unop(Iop_I32StoF64,
4801                                            loadLE(Ity_I32,mkexpr(addr)))),
4802                                 mkU8(8)),
4803                           mkU32(0x4500)
4804                    ));
4805                break;
4806 
4807             case 3: /* FICOMP m32int */
4808                DIP("ficompl %s\n", dis_buf);
4809                /* This forces C1 to zero, which isn't right. */
4810                put_C3210(
4811                    binop( Iop_And32,
4812                           binop(Iop_Shl32,
4813                                 binop(Iop_CmpF64,
4814                                       get_ST(0),
4815                                       unop(Iop_I32StoF64,
4816                                            loadLE(Ity_I32,mkexpr(addr)))),
4817                                 mkU8(8)),
4818                           mkU32(0x4500)
4819                    ));
4820                fp_pop();
4821                break;
4822 
4823             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
4824                DIP("fisubl %s\n", dis_buf);
4825                fop = Iop_SubF64;
4826                goto do_fop_m32;
4827 
4828             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
4829                DIP("fisubrl %s\n", dis_buf);
4830                fop = Iop_SubF64;
4831                goto do_foprev_m32;
4832 
4833             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
4834                DIP("fidivl %s\n", dis_buf);
4835                fop = Iop_DivF64;
4836                goto do_fop_m32;
4837 
4838             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
4839                DIP("fidivrl %s\n", dis_buf);
4840                fop = Iop_DivF64;
4841                goto do_foprev_m32;
4842 
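            /* Shared tails for the integer-memory forms above:
               do_fop_m32 computes ST(0) op m32int, and do_foprev_m32
               computes m32int op ST(0) (the R-suffixed forms). */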
4843             do_fop_m32:
4844                put_ST_UNCHECKED(0,
4845                   triop(fop,
4846                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4847                         get_ST(0),
4848                         unop(Iop_I32StoF64,
4849                              loadLE(Ity_I32, mkexpr(addr)))));
4850                break;
4851 
4852             do_foprev_m32:
4853                put_ST_UNCHECKED(0,
4854                   triop(fop,
4855                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4856                         unop(Iop_I32StoF64,
4857                              loadLE(Ity_I32, mkexpr(addr))),
4858                         get_ST(0)));
4859                break;
4860 
4861             default:
4862                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4863                vex_printf("first_opcode == 0xDA\n");
4864                goto decode_fail;
4865          }
4866 
4867       } else {
4868 
4869          delta++;
4870          switch (modrm) {
4871 
4872             case 0xc0:
4873             case 0xc1:
4874             case 0xc2:
4875             case 0xc3:
4876             case 0xc4:
4877             case 0xc5:
4878             case 0xc6:
4879             case 0xc7: /* FCMOVB ST(i), ST(0) */
4880                r_src = (UInt)modrm - 0xC0;
4881                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
4882                put_ST_UNCHECKED(0,
4883                                 IRExpr_ITE(
4884                                     mk_x86g_calculate_condition(X86CondB),
4885                                     get_ST(r_src), get_ST(0)) );
4886                break;
4887 
4888             case 0xc8:
4889             case 0xc9:
4890             case 0xca:
4891             case 0xcb:
4892             case 0xcc:
4893             case 0xcd:
4894             case 0xce:
4895             case 0xcf: /* FCMOVE(Z) ST(i), ST(0) */
4896                r_src = (UInt)modrm - 0xC8;
4897                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
4898                put_ST_UNCHECKED(0,
4899                                 IRExpr_ITE(
4900                                     mk_x86g_calculate_condition(X86CondZ),
4901                                     get_ST(r_src), get_ST(0)) );
4902                break;
4903 
4904             case 0xd0:
4905             case 0xd1:
4906             case 0xd2:
4907             case 0xd3:
4908             case 0xd4:
4909             case 0xd5:
4910             case 0xd6:
4911             case 0xd7: /* FCMOVBE ST(i), ST(0) */
4912                r_src = (UInt)modrm - 0xD0;
4913                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
4914                put_ST_UNCHECKED(0,
4915                                 IRExpr_ITE(
4916                                     mk_x86g_calculate_condition(X86CondBE),
4917                                     get_ST(r_src), get_ST(0)) );
4918                break;
4919 
4920             case 0xd8:
4921             case 0xd9:
4922             case 0xda:
4923             case 0xdb:
4924             case 0xdc:
4925             case 0xdd:
4926             case 0xde:
4927             case 0xdf: /* FCMOVU ST(i), ST(0) */
4928                r_src = (UInt)modrm - 0xD8;
4929                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
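               /* FCMOVU moves if the 'unordered' condition (PF == 1)
                  holds, hence X86CondP. */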
4930                put_ST_UNCHECKED(0,
4931                                 IRExpr_ITE(
4932                                     mk_x86g_calculate_condition(X86CondP),
4933                                     get_ST(r_src), get_ST(0)) );
4934                break;
4935 
4936             case 0xE9: /* FUCOMPP %st(0),%st(1) */
4937                DIP("fucompp %%st(0),%%st(1)\n");
4938                /* This forces C1 to zero, which isn't right. */
4939                put_C3210(
4940                    binop( Iop_And32,
4941                           binop(Iop_Shl32,
4942                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
4943                                 mkU8(8)),
4944                           mkU32(0x4500)
4945                    ));
4946                fp_pop();
4947                fp_pop();
4948                break;
4949 
4950             default:
4951                goto decode_fail;
4952          }
4953 
4954       }
4955    }
4956 
4957    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
4958    else
4959    if (first_opcode == 0xDB) {
4960       if (modrm < 0xC0) {
4961 
4962          /* bits 5,4,3 are an opcode extension, and the modRM also
4963             specifies an address. */
4964          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4965          delta += len;
4966 
4967          switch (gregOfRM(modrm)) {
4968 
4969             case 0: /* FILD m32int */
4970                DIP("fildl %s\n", dis_buf);
4971                fp_push();
4972                put_ST(0, unop(Iop_I32StoF64,
4973                               loadLE(Ity_I32, mkexpr(addr))));
4974                break;
4975 
4976             case 1: /* FISTTPL m32 (SSE3) */
4977                DIP("fisttpl %s\n", dis_buf);
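               /* FISTTP always truncates, hence the hard-wired Irrm_ZERO;
                  the FIST/FISTP cases below use get_roundingmode() so the
                  conversion honours the guest's FPU rounding mode. */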
4978                storeLE( mkexpr(addr),
4979                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
4980                fp_pop();
4981                break;
4982 
4983             case 2: /* FIST m32 */
4984                DIP("fistl %s\n", dis_buf);
4985                storeLE( mkexpr(addr),
4986                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4987                break;
4988 
4989             case 3: /* FISTP m32 */
4990                DIP("fistpl %s\n", dis_buf);
4991                storeLE( mkexpr(addr),
4992                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4993                fp_pop();
4994                break;
4995 
4996             case 5: { /* FLD extended-real */
4997                /* Uses dirty helper:
4998                      ULong x86g_loadF80le ( UInt )
4999                   addr holds the address.  First, do a dirty call to
5000                   get hold of the data. */
5001                IRTemp   val  = newTemp(Ity_I64);
5002                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
5003 
5004                IRDirty* d = unsafeIRDirty_1_N (
5005                                val,
5006                                0/*regparms*/,
5007                                "x86g_dirtyhelper_loadF80le",
5008                                &x86g_dirtyhelper_loadF80le,
5009                                args
5010                             );
5011                /* declare that we're reading memory */
5012                d->mFx   = Ifx_Read;
5013                d->mAddr = mkexpr(addr);
5014                d->mSize = 10;
5015 
5016                /* execute the dirty call, dumping the result in val. */
5017                stmt( IRStmt_Dirty(d) );
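               /* The helper converts the 80-bit value to a 64-bit double
                  and hands back its bit pattern in val, so reinterpret it
                  as an F64 before pushing (all x87 arithmetic is simulated
                  at 64 bits). */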
5018                fp_push();
5019                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
5020 
5021                DIP("fldt %s\n", dis_buf);
5022                break;
5023             }
5024 
5025             case 7: { /* FSTP extended-real */
5026                /* Uses dirty helper: void x86g_storeF80le ( UInt, ULong ) */
5027                IRExpr** args
5028                   = mkIRExprVec_2( mkexpr(addr),
5029                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
5030 
5031                IRDirty* d = unsafeIRDirty_0_N (
5032                                0/*regparms*/,
5033                                "x86g_dirtyhelper_storeF80le",
5034                                &x86g_dirtyhelper_storeF80le,
5035                                args
5036                             );
5037                /* declare we're writing memory */
5038                d->mFx   = Ifx_Write;
5039                d->mAddr = mkexpr(addr);
5040                d->mSize = 10;
5041 
5042                /* execute the dirty call. */
5043                stmt( IRStmt_Dirty(d) );
5044                fp_pop();
5045 
               DIP("fstpt %s\n", dis_buf);
5047                break;
5048             }
5049 
5050             default:
5051                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5052                vex_printf("first_opcode == 0xDB\n");
5053                goto decode_fail;
5054          }
5055 
5056       } else {
5057 
5058          delta++;
5059          switch (modrm) {
5060 
5061             case 0xc0:
5062             case 0xc1:
5063             case 0xc2:
5064             case 0xc3:
5065             case 0xc4:
5066             case 0xc5:
5067             case 0xc6:
5068             case 0xc7: /* FCMOVNB ST(i), ST(0) */
5069                r_src = (UInt)modrm - 0xC0;
5070                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
5071                put_ST_UNCHECKED(0,
5072                                 IRExpr_ITE(
5073                                     mk_x86g_calculate_condition(X86CondNB),
5074                                     get_ST(r_src), get_ST(0)) );
5075                break;
5076 
5077             case 0xc8:
5078             case 0xc9:
5079             case 0xca:
5080             case 0xcb:
5081             case 0xcc:
5082             case 0xcd:
5083             case 0xce:
5084             case 0xcf: /* FCMOVNE(NZ) ST(i), ST(0) */
5085                r_src = (UInt)modrm - 0xC8;
5086                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
5087                put_ST_UNCHECKED(0,
5088                                 IRExpr_ITE(
5089                                     mk_x86g_calculate_condition(X86CondNZ),
5090                                     get_ST(r_src), get_ST(0)) );
5091                break;
5092 
5093             case 0xd0:
5094             case 0xd1:
5095             case 0xd2:
5096             case 0xd3:
5097             case 0xd4:
5098             case 0xd5:
5099             case 0xd6:
5100             case 0xd7: /* FCMOVNBE ST(i), ST(0) */
5101                r_src = (UInt)modrm - 0xD0;
5102                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
5103                put_ST_UNCHECKED(0,
5104                                 IRExpr_ITE(
5105                                     mk_x86g_calculate_condition(X86CondNBE),
5106                                     get_ST(r_src), get_ST(0)) );
5107                break;
5108 
5109             case 0xd8:
5110             case 0xd9:
5111             case 0xda:
5112             case 0xdb:
5113             case 0xdc:
5114             case 0xdd:
5115             case 0xde:
5116             case 0xdf: /* FCMOVNU ST(i), ST(0) */
5117                r_src = (UInt)modrm - 0xD8;
5118                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
5119                put_ST_UNCHECKED(0,
5120                                 IRExpr_ITE(
5121                                     mk_x86g_calculate_condition(X86CondNP),
5122                                     get_ST(r_src), get_ST(0)) );
5123                break;
5124 
            case 0xE2: /* FNCLEX */
5126                DIP("fnclex\n");
5127                break;
5128 
            case 0xE3: { /* FNINIT */
               /* Uses dirty helper:
                     void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
5132                IRDirty* d  = unsafeIRDirty_0_N (
5133                                 0/*regparms*/,
5134                                 "x86g_dirtyhelper_FINIT",
5135                                 &x86g_dirtyhelper_FINIT,
5136                                 mkIRExprVec_1(IRExpr_GSPTR())
5137                              );
5138 
5139                /* declare we're writing guest state */
5140                d->nFxState = 5;
5141                vex_bzero(&d->fxState, sizeof(d->fxState));
5142 
5143                d->fxState[0].fx     = Ifx_Write;
5144                d->fxState[0].offset = OFFB_FTOP;
5145                d->fxState[0].size   = sizeof(UInt);
5146 
5147                d->fxState[1].fx     = Ifx_Write;
5148                d->fxState[1].offset = OFFB_FPREGS;
5149                d->fxState[1].size   = 8 * sizeof(ULong);
5150 
5151                d->fxState[2].fx     = Ifx_Write;
5152                d->fxState[2].offset = OFFB_FPTAGS;
5153                d->fxState[2].size   = 8 * sizeof(UChar);
5154 
5155                d->fxState[3].fx     = Ifx_Write;
5156                d->fxState[3].offset = OFFB_FPROUND;
5157                d->fxState[3].size   = sizeof(UInt);
5158 
5159                d->fxState[4].fx     = Ifx_Write;
5160                d->fxState[4].offset = OFFB_FC3210;
5161                d->fxState[4].size   = sizeof(UInt);
5162 
5163                stmt( IRStmt_Dirty(d) );
5164 
5165                DIP("fninit\n");
5166                break;
5167             }
5168 
5169             case 0xe8:
5170             case 0xe9:
5171             case 0xea:
5172             case 0xeb:
5173             case 0xec:
5174             case 0xed:
5175             case 0xee:
5176             case 0xef: /* FUCOMI %st(0),%st(?) */
5177                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
5178                break;
5179 
5180             case 0xf0:
5181             case 0xf1:
5182             case 0xf2:
5183             case 0xf3:
5184             case 0xf4:
5185             case 0xf5:
5186             case 0xf6:
5187             case 0xf7: /* FCOMI %st(0),%st(?) */
5188                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
5189                break;
5190 
5191             default:
5192                goto decode_fail;
5193          }
5194       }
5195    }
5196 
5197    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
5198    else
5199    if (first_opcode == 0xDC) {
5200       if (modrm < 0xC0) {
5201 
5202          /* bits 5,4,3 are an opcode extension, and the modRM also
5203             specifies an address. */
5204          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5205          delta += len;
5206 
5207          switch (gregOfRM(modrm)) {
5208 
5209             case 0: /* FADD double-real */
5210                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
5211                break;
5212 
5213             case 1: /* FMUL double-real */
5214                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
5215                break;
5216 
5217             case 2: /* FCOM double-real */
5218                DIP("fcoml %s\n", dis_buf);
5219                /* This forces C1 to zero, which isn't right. */
5220                put_C3210(
5221                    binop( Iop_And32,
5222                           binop(Iop_Shl32,
5223                                 binop(Iop_CmpF64,
5224                                       get_ST(0),
5225                                       loadLE(Ity_F64,mkexpr(addr))),
5226                                 mkU8(8)),
5227                           mkU32(0x4500)
5228                    ));
5229                break;
5230 
5231             case 3: /* FCOMP double-real */
5232                DIP("fcompl %s\n", dis_buf);
5233                /* This forces C1 to zero, which isn't right. */
5234                put_C3210(
5235                    binop( Iop_And32,
5236                           binop(Iop_Shl32,
5237                                 binop(Iop_CmpF64,
5238                                       get_ST(0),
5239                                       loadLE(Ity_F64,mkexpr(addr))),
5240                                 mkU8(8)),
5241                           mkU32(0x4500)
5242                    ));
5243                fp_pop();
5244                break;
5245 
5246             case 4: /* FSUB double-real */
5247                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
5248                break;
5249 
5250             case 5: /* FSUBR double-real */
5251                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
5252                break;
5253 
5254             case 6: /* FDIV double-real */
5255                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
5256                break;
5257 
5258             case 7: /* FDIVR double-real */
5259                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
5260                break;
5261 
5262             default:
5263                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5264                vex_printf("first_opcode == 0xDC\n");
5265                goto decode_fail;
5266          }
5267 
5268       } else {
5269 
5270          delta++;
5271          switch (modrm) {
5272 
5273             case 0xc0:
5274             case 0xc1:
5275             case 0xc2:
5276             case 0xc3:
5277             case 0xc4:
5278             case 0xc5:
5279             case 0xc6:
5280             case 0xc7: /* FADD %st(0),%st(?) */
5281                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
5282                break;
5283 
5284             case 0xc8:
5285             case 0xc9:
5286             case 0xca:
5287             case 0xcb:
5288             case 0xcc:
5289             case 0xcd:
5290             case 0xce:
5291             case 0xcf: /* FMUL %st(0),%st(?) */
5292                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
5293                break;
5294 
5295             case 0xe0:
5296             case 0xe1:
5297             case 0xe2:
5298             case 0xe3:
5299             case 0xe4:
5300             case 0xe5:
5301             case 0xe6:
5302             case 0xe7: /* FSUBR %st(0),%st(?) */
5303                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
5304                break;
5305 
5306             case 0xe8:
5307             case 0xe9:
5308             case 0xea:
5309             case 0xeb:
5310             case 0xec:
5311             case 0xed:
5312             case 0xee:
5313             case 0xef: /* FSUB %st(0),%st(?) */
5314                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
5315                break;
5316 
5317             case 0xf0:
5318             case 0xf1:
5319             case 0xf2:
5320             case 0xf3:
5321             case 0xf4:
5322             case 0xf5:
5323             case 0xf6:
5324             case 0xf7: /* FDIVR %st(0),%st(?) */
5325                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
5326                break;
5327 
5328             case 0xf8:
5329             case 0xf9:
5330             case 0xfa:
5331             case 0xfb:
5332             case 0xfc:
5333             case 0xfd:
5334             case 0xfe:
5335             case 0xff: /* FDIV %st(0),%st(?) */
5336                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
5337                break;
5338 
5339             default:
5340                goto decode_fail;
5341          }
5342 
5343       }
5344    }
5345 
5346    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
5347    else
5348    if (first_opcode == 0xDD) {
5349 
5350       if (modrm < 0xC0) {
5351 
5352          /* bits 5,4,3 are an opcode extension, and the modRM also
5353             specifies an address. */
5354          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5355          delta += len;
5356 
5357          switch (gregOfRM(modrm)) {
5358 
5359             case 0: /* FLD double-real */
5360                DIP("fldl %s\n", dis_buf);
5361                fp_push();
5362                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
5363                break;
5364 
5365             case 1: /* FISTTPQ m64 (SSE3) */
               DIP("fisttpll %s\n", dis_buf);
5367                storeLE( mkexpr(addr),
5368                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
5369                fp_pop();
5370                break;
5371 
5372             case 2: /* FST double-real */
5373                DIP("fstl %s\n", dis_buf);
5374                storeLE(mkexpr(addr), get_ST(0));
5375                break;
5376 
5377             case 3: /* FSTP double-real */
5378                DIP("fstpl %s\n", dis_buf);
5379                storeLE(mkexpr(addr), get_ST(0));
5380                fp_pop();
5381                break;
5382 
5383             case 4: { /* FRSTOR m108 */
               /* Uses dirty helper:
                     VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
5386                IRTemp   ew = newTemp(Ity_I32);
               IRDirty* d  = unsafeIRDirty_1_N (
                                ew,
                                0/*regparms*/,
                                "x86g_dirtyhelper_FRSTOR",
                                &x86g_dirtyhelper_FRSTOR,
                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                             );
5394                /* declare we're reading memory */
5395                d->mFx   = Ifx_Read;
5396                d->mAddr = mkexpr(addr);
5397                d->mSize = 108;
5398 
5399                /* declare we're writing guest state */
5400                d->nFxState = 5;
5401                vex_bzero(&d->fxState, sizeof(d->fxState));
5402 
5403                d->fxState[0].fx     = Ifx_Write;
5404                d->fxState[0].offset = OFFB_FTOP;
5405                d->fxState[0].size   = sizeof(UInt);
5406 
5407                d->fxState[1].fx     = Ifx_Write;
5408                d->fxState[1].offset = OFFB_FPREGS;
5409                d->fxState[1].size   = 8 * sizeof(ULong);
5410 
5411                d->fxState[2].fx     = Ifx_Write;
5412                d->fxState[2].offset = OFFB_FPTAGS;
5413                d->fxState[2].size   = 8 * sizeof(UChar);
5414 
5415                d->fxState[3].fx     = Ifx_Write;
5416                d->fxState[3].offset = OFFB_FPROUND;
5417                d->fxState[3].size   = sizeof(UInt);
5418 
5419                d->fxState[4].fx     = Ifx_Write;
5420                d->fxState[4].offset = OFFB_FC3210;
5421                d->fxState[4].size   = sizeof(UInt);
5422 
5423                stmt( IRStmt_Dirty(d) );
5424 
5425                /* ew contains any emulation warning we may need to
5426                   issue.  If needed, side-exit to the next insn,
5427                   reporting the warning, so that Valgrind's dispatcher
5428                   sees the warning. */
5429                put_emwarn( mkexpr(ew) );
5430                stmt(
5431                   IRStmt_Exit(
5432                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5433                      Ijk_EmWarn,
5434                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
5435                      OFFB_EIP
5436                   )
5437                );
5438 
5439                DIP("frstor %s\n", dis_buf);
5440                break;
5441             }
5442 
5443             case 6: { /* FNSAVE m108 */
               /* Uses dirty helper:
                     void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
5446                IRDirty* d = unsafeIRDirty_0_N (
5447                                0/*regparms*/,
5448                                "x86g_dirtyhelper_FSAVE",
5449                                &x86g_dirtyhelper_FSAVE,
5450                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
5451                             );
5452                /* declare we're writing memory */
5453                d->mFx   = Ifx_Write;
5454                d->mAddr = mkexpr(addr);
5455                d->mSize = 108;
5456 
5457                /* declare we're reading guest state */
5458                d->nFxState = 5;
5459                vex_bzero(&d->fxState, sizeof(d->fxState));
5460 
5461                d->fxState[0].fx     = Ifx_Read;
5462                d->fxState[0].offset = OFFB_FTOP;
5463                d->fxState[0].size   = sizeof(UInt);
5464 
5465                d->fxState[1].fx     = Ifx_Read;
5466                d->fxState[1].offset = OFFB_FPREGS;
5467                d->fxState[1].size   = 8 * sizeof(ULong);
5468 
5469                d->fxState[2].fx     = Ifx_Read;
5470                d->fxState[2].offset = OFFB_FPTAGS;
5471                d->fxState[2].size   = 8 * sizeof(UChar);
5472 
5473                d->fxState[3].fx     = Ifx_Read;
5474                d->fxState[3].offset = OFFB_FPROUND;
5475                d->fxState[3].size   = sizeof(UInt);
5476 
5477                d->fxState[4].fx     = Ifx_Read;
5478                d->fxState[4].offset = OFFB_FC3210;
5479                d->fxState[4].size   = sizeof(UInt);
5480 
5481                stmt( IRStmt_Dirty(d) );
5482 
5483                DIP("fnsave %s\n", dis_buf);
5484                break;
5485             }
5486 
5487             case 7: { /* FNSTSW m16 */
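               /* get_FPU_sw() assembles the 16-bit status word from the
                  simulated FTOP and C3210 fields; just store it as-is. */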
5488                IRExpr* sw = get_FPU_sw();
5489                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
5490                storeLE( mkexpr(addr), sw );
5491                DIP("fnstsw %s\n", dis_buf);
5492                break;
5493             }
5494 
5495             default:
5496                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5497                vex_printf("first_opcode == 0xDD\n");
5498                goto decode_fail;
5499          }
5500       } else {
5501          delta++;
5502          switch (modrm) {
5503 
5504             case 0xc0:
5505             case 0xc1:
5506             case 0xc2:
5507             case 0xc3:
5508             case 0xc4:
5509             case 0xc5:
5510             case 0xc6:
5511             case 0xc7: /* FFREE %st(?) */
5512                r_dst = (UInt)modrm - 0xC0;
5513                DIP("ffree %%st(%u)\n", r_dst);
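               /* In this simulation a tag byte of 0 means "empty", so
                  freeing a register is just clearing its tag. */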
5514                put_ST_TAG ( r_dst, mkU8(0) );
5515                break;
5516 
5517             case 0xd0:
5518             case 0xd1:
5519             case 0xd2:
5520             case 0xd3:
5521             case 0xd4:
5522             case 0xd5:
5523             case 0xd6:
5524             case 0xd7: /* FST %st(0),%st(?) */
5525                r_dst = (UInt)modrm - 0xD0;
5526                DIP("fst %%st(0),%%st(%u)\n", r_dst);
5527                /* P4 manual says: "If the destination operand is a
5528                   non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
5530                put_ST_UNCHECKED(r_dst, get_ST(0));
5531                break;
5532 
5533             case 0xd8:
5534             case 0xd9:
5535             case 0xda:
5536             case 0xdb:
5537             case 0xdc:
5538             case 0xdd:
5539             case 0xde:
5540             case 0xdf: /* FSTP %st(0),%st(?) */
5541                r_dst = (UInt)modrm - 0xD8;
5542                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
5543                /* P4 manual says: "If the destination operand is a
5544                   non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
5546                put_ST_UNCHECKED(r_dst, get_ST(0));
5547                fp_pop();
5548                break;
5549 
5550             case 0xe0:
5551             case 0xe1:
5552             case 0xe2:
5553             case 0xe3:
5554             case 0xe4:
5555             case 0xe5:
5556             case 0xe6:
5557             case 0xe7: /* FUCOM %st(0),%st(?) */
5558                r_dst = (UInt)modrm - 0xE0;
5559                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
5560                /* This forces C1 to zero, which isn't right. */
5561                put_C3210(
5562                    binop( Iop_And32,
5563                           binop(Iop_Shl32,
5564                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5565                                 mkU8(8)),
5566                           mkU32(0x4500)
5567                    ));
5568                break;
5569 
5570             case 0xe8:
5571             case 0xe9:
5572             case 0xea:
5573             case 0xeb:
5574             case 0xec:
5575             case 0xed:
5576             case 0xee:
5577             case 0xef: /* FUCOMP %st(0),%st(?) */
5578                r_dst = (UInt)modrm - 0xE8;
5579                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
5580                /* This forces C1 to zero, which isn't right. */
5581                put_C3210(
5582                    binop( Iop_And32,
5583                           binop(Iop_Shl32,
5584                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5585                                 mkU8(8)),
5586                           mkU32(0x4500)
5587                    ));
5588                fp_pop();
5589                break;
5590 
5591             default:
5592                goto decode_fail;
5593          }
5594       }
5595    }
5596 
5597    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
5598    else
5599    if (first_opcode == 0xDE) {
5600 
5601       if (modrm < 0xC0) {
5602 
5603          /* bits 5,4,3 are an opcode extension, and the modRM also
5604             specifies an address. */
5605          IROp   fop;
5606          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5607          delta += len;
5608 
5609          switch (gregOfRM(modrm)) {
5610 
5611             case 0: /* FIADD m16int */ /* ST(0) += m16int */
5612                DIP("fiaddw %s\n", dis_buf);
5613                fop = Iop_AddF64;
5614                goto do_fop_m16;
5615 
5616             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
5617                DIP("fimulw %s\n", dis_buf);
5618                fop = Iop_MulF64;
5619                goto do_fop_m16;
5620 
5621             case 2: /* FICOM m16int */
5622                DIP("ficomw %s\n", dis_buf);
5623                /* This forces C1 to zero, which isn't right. */
5624                put_C3210(
5625                    binop( Iop_And32,
5626                           binop(Iop_Shl32,
5627                                 binop(Iop_CmpF64,
5628                                       get_ST(0),
5629                                       unop(Iop_I32StoF64,
5630                                          unop(Iop_16Sto32,
5631                                            loadLE(Ity_I16,mkexpr(addr))))),
5632                                 mkU8(8)),
5633                           mkU32(0x4500)
5634                    ));
5635                break;
5636 
5637             case 3: /* FICOMP m16int */
5638                DIP("ficompw %s\n", dis_buf);
5639                /* This forces C1 to zero, which isn't right. */
5640                put_C3210(
5641                    binop( Iop_And32,
5642                           binop(Iop_Shl32,
5643                                 binop(Iop_CmpF64,
5644                                       get_ST(0),
5645                                       unop(Iop_I32StoF64,
5646                                          unop(Iop_16Sto32,
5647                                               loadLE(Ity_I16,mkexpr(addr))))),
5648                                 mkU8(8)),
5649                           mkU32(0x4500)
5650                    ));
5651                fp_pop();
5652                break;
5653 
5654             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
5655                DIP("fisubw %s\n", dis_buf);
5656                fop = Iop_SubF64;
5657                goto do_fop_m16;
5658 
5659             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
5660                DIP("fisubrw %s\n", dis_buf);
5661                fop = Iop_SubF64;
5662                goto do_foprev_m16;
5663 
5664             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
               DIP("fidivw %s\n", dis_buf);
5666                fop = Iop_DivF64;
5667                goto do_fop_m16;
5668 
5669             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
5670                DIP("fidivrw %s\n", dis_buf);
5671                fop = Iop_DivF64;
5672                goto do_foprev_m16;
5673 
5674             do_fop_m16:
5675                put_ST_UNCHECKED(0,
5676                   triop(fop,
5677                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5678                         get_ST(0),
5679                         unop(Iop_I32StoF64,
5680                              unop(Iop_16Sto32,
5681                                   loadLE(Ity_I16, mkexpr(addr))))));
5682                break;
5683 
5684             do_foprev_m16:
5685                put_ST_UNCHECKED(0,
5686                   triop(fop,
5687                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5688                         unop(Iop_I32StoF64,
5689                              unop(Iop_16Sto32,
5690                                   loadLE(Ity_I16, mkexpr(addr)))),
5691                         get_ST(0)));
5692                break;
5693 
5694             default:
5695                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5696                vex_printf("first_opcode == 0xDE\n");
5697                goto decode_fail;
5698          }
5699 
5700       } else {
5701 
5702          delta++;
5703          switch (modrm) {
5704 
5705             case 0xc0:
5706             case 0xc1:
5707             case 0xc2:
5708             case 0xc3:
5709             case 0xc4:
5710             case 0xc5:
5711             case 0xc6:
5712             case 0xc7: /* FADDP %st(0),%st(?) */
5713                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
5714                break;
5715 
5716             case 0xc8:
5717             case 0xc9:
5718             case 0xca:
5719             case 0xcb:
5720             case 0xcc:
5721             case 0xcd:
5722             case 0xce:
5723             case 0xcf: /* FMULP %st(0),%st(?) */
5724                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
5725                break;
5726 
5727             case 0xD9: /* FCOMPP %st(0),%st(1) */
               DIP("fcompp %%st(0),%%st(1)\n");
5729                /* This forces C1 to zero, which isn't right. */
5730                put_C3210(
5731                    binop( Iop_And32,
5732                           binop(Iop_Shl32,
5733                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
5734                                 mkU8(8)),
5735                           mkU32(0x4500)
5736                    ));
5737                fp_pop();
5738                fp_pop();
5739                break;
5740 
5741             case 0xe0:
5742             case 0xe1:
5743             case 0xe2:
5744             case 0xe3:
5745             case 0xe4:
5746             case 0xe5:
5747             case 0xe6:
5748             case 0xe7: /* FSUBRP %st(0),%st(?) */
5749                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
5750                break;
5751 
5752             case 0xe8:
5753             case 0xe9:
5754             case 0xea:
5755             case 0xeb:
5756             case 0xec:
5757             case 0xed:
5758             case 0xee:
5759             case 0xef: /* FSUBP %st(0),%st(?) */
5760                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
5761                break;
5762 
5763             case 0xf0:
5764             case 0xf1:
5765             case 0xf2:
5766             case 0xf3:
5767             case 0xf4:
5768             case 0xf5:
5769             case 0xf6:
5770             case 0xf7: /* FDIVRP %st(0),%st(?) */
5771                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
5772                break;
5773 
5774             case 0xf8:
5775             case 0xf9:
5776             case 0xfa:
5777             case 0xfb:
5778             case 0xfc:
5779             case 0xfd:
5780             case 0xfe:
5781             case 0xff: /* FDIVP %st(0),%st(?) */
5782                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
5783                break;
5784 
5785             default:
5786                goto decode_fail;
5787          }
5788 
5789       }
5790    }
5791 
5792    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
5793    else
5794    if (first_opcode == 0xDF) {
5795 
5796       if (modrm < 0xC0) {
5797 
5798          /* bits 5,4,3 are an opcode extension, and the modRM also
5799             specifies an address. */
5800          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5801          delta += len;
5802 
5803          switch (gregOfRM(modrm)) {
5804 
5805             case 0: /* FILD m16int */
5806                DIP("fildw %s\n", dis_buf);
5807                fp_push();
5808                put_ST(0, unop(Iop_I32StoF64,
5809                               unop(Iop_16Sto32,
5810                                    loadLE(Ity_I16, mkexpr(addr)))));
5811                break;
5812 
5813             case 1: /* FISTTPS m16 (SSE3) */
5814                DIP("fisttps %s\n", dis_buf);
5815                storeLE( mkexpr(addr),
5816                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
5817                fp_pop();
5818                break;
5819 
5820             case 2: /* FIST m16 */
               DIP("fists %s\n", dis_buf);
5822                storeLE( mkexpr(addr),
5823                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5824                break;
5825 
5826             case 3: /* FISTP m16 */
5827                DIP("fistps %s\n", dis_buf);
5828                storeLE( mkexpr(addr),
5829                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5830                fp_pop();
5831                break;
5832 
5833             case 5: /* FILD m64 */
5834                DIP("fildll %s\n", dis_buf);
5835                fp_push();
5836                put_ST(0, binop(Iop_I64StoF64,
5837                                get_roundingmode(),
5838                                loadLE(Ity_I64, mkexpr(addr))));
5839                break;
5840 
5841             case 7: /* FISTP m64 */
5842                DIP("fistpll %s\n", dis_buf);
5843                storeLE( mkexpr(addr),
5844                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
5845                fp_pop();
5846                break;
5847 
5848             default:
5849                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5850                vex_printf("first_opcode == 0xDF\n");
5851                goto decode_fail;
5852          }
5853 
5854       } else {
5855 
5856          delta++;
5857          switch (modrm) {
5858 
5859             case 0xC0: /* FFREEP %st(0) */
5860                DIP("ffreep %%st(%d)\n", 0);
5861                put_ST_TAG ( 0, mkU8(0) );
5862                fp_pop();
5863                break;
5864 
5865             case 0xE0: /* FNSTSW %ax */
5866                DIP("fnstsw %%ax\n");
5867                /* Get the FPU status word value and dump it in %AX. */
5868                if (0) {
5869                   /* The obvious thing to do is simply dump the 16-bit
5870                      status word value in %AX.  However, due to a
5871                      limitation in Memcheck's origin tracking
5872                      machinery, this causes Memcheck not to track the
5873                      origin of any undefinedness into %AH (only into
5874                      %AL/%AX/%EAX), which means origins are lost in
5875                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
5876                   putIReg(2, R_EAX, get_FPU_sw());
5877                } else {
5878                   /* So a somewhat lame kludge is to make it very
5879                      clear to Memcheck that the value is written to
5880                      both %AH and %AL.  This generates marginally
5881                      worse code, but I don't think it matters much. */
5882                   IRTemp t16 = newTemp(Ity_I16);
5883                   assign(t16, get_FPU_sw());
5884                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
5885                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
5886                }
5887                break;
5888 
5889             case 0xe8:
5890             case 0xe9:
5891             case 0xea:
5892             case 0xeb:
5893             case 0xec:
5894             case 0xed:
5895             case 0xee:
5896             case 0xef: /* FUCOMIP %st(0),%st(?) */
5897                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
5898                break;
5899 
5900             case 0xf0:
5901             case 0xf1:
5902             case 0xf2:
5903             case 0xf3:
5904             case 0xf4:
5905             case 0xf5:
5906             case 0xf6:
5907             case 0xf7: /* FCOMIP %st(0),%st(?) */
5908                /* not really right since COMIP != UCOMIP */
5909                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
5910                break;
5911 
5912             default:
5913                goto decode_fail;
5914          }
5915       }
5916 
5917    }
5918 
5919    else
5920       vpanic("dis_FPU(x86): invalid primary opcode");
5921 
5922    *decode_ok = True;
5923    return delta;
5924 
5925   decode_fail:
5926    *decode_ok = False;
5927    return delta;
5928 }
5929 
5930 
5931 /*------------------------------------------------------------*/
5932 /*---                                                      ---*/
5933 /*--- MMX INSTRUCTIONS                                     ---*/
5934 /*---                                                      ---*/
5935 /*------------------------------------------------------------*/
5936 
5937 /* Effect of MMX insns on x87 FPU state (table 11-2 of
5938    IA32 arch manual, volume 3):
5939 
5940    Read from, or write to MMX register (viz, any insn except EMMS):
5941    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
5942    * FP stack pointer set to zero
5943 
5944    EMMS:
5945    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
5946    * FP stack pointer set to zero
5947 */
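/* do_MMX_preamble() below implements the first of these two cases;
   do_EMMS_preamble() implements the second. */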
5948 
5949 static void do_MMX_preamble ( void )
5950 {
5951    Int         i;
5952    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5953    IRExpr*     zero  = mkU32(0);
5954    IRExpr*     tag1  = mkU8(1);
5955    put_ftop(zero);
5956    for (i = 0; i < 8; i++)
5957       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
5958 }
5959 
5960 static void do_EMMS_preamble ( void )
5961 {
5962    Int         i;
5963    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5964    IRExpr*     zero  = mkU32(0);
5965    IRExpr*     tag0  = mkU8(0);
5966    put_ftop(zero);
5967    for (i = 0; i < 8; i++)
5968       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
5969 }
5970 
5971 
5972 static IRExpr* getMMXReg ( UInt archreg )
5973 {
5974    vassert(archreg < 8);
5975    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
5976 }
5977 
5978 
5979 static void putMMXReg ( UInt archreg, IRExpr* e )
5980 {
5981    vassert(archreg < 8);
5982    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
5983    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
5984 }
5985 
5986 
5987 /* Helper for non-shift MMX insns.  Note this is incomplete in the
5988    sense that it does not first call do_MMX_preamble() -- that is the
5989    responsibility of its caller. */
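/* (Callers, e.g. dis_MMX() below, therefore invoke do_MMX_preamble()
   once per instruction before dispatching here.) */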
5990 
5991 static
5992 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
5993                                Int    delta,
5994                                UChar  opc,
5995                                const HChar* name,
5996                                Bool   show_granularity )
5997 {
5998    HChar   dis_buf[50];
5999    UChar   modrm = getIByte(delta);
6000    Bool    isReg = epartIsReg(modrm);
6001    IRExpr* argL  = NULL;
6002    IRExpr* argR  = NULL;
6003    IRExpr* argG  = NULL;
6004    IRExpr* argE  = NULL;
6005    IRTemp  res   = newTemp(Ity_I64);
6006 
6007    Bool    invG  = False;
6008    IROp    op    = Iop_INVALID;
6009    void*   hAddr = NULL;
6010    Bool    eLeft = False;
6011    const HChar*  hName = NULL;
6012 
6013 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
6014 
6015    switch (opc) {
6016       /* Original MMX ones */
6017       case 0xFC: op = Iop_Add8x8; break;
6018       case 0xFD: op = Iop_Add16x4; break;
6019       case 0xFE: op = Iop_Add32x2; break;
6020 
6021       case 0xEC: op = Iop_QAdd8Sx8; break;
6022       case 0xED: op = Iop_QAdd16Sx4; break;
6023 
6024       case 0xDC: op = Iop_QAdd8Ux8; break;
6025       case 0xDD: op = Iop_QAdd16Ux4; break;
6026 
6027       case 0xF8: op = Iop_Sub8x8;  break;
6028       case 0xF9: op = Iop_Sub16x4; break;
6029       case 0xFA: op = Iop_Sub32x2; break;
6030 
6031       case 0xE8: op = Iop_QSub8Sx8; break;
6032       case 0xE9: op = Iop_QSub16Sx4; break;
6033 
6034       case 0xD8: op = Iop_QSub8Ux8; break;
6035       case 0xD9: op = Iop_QSub16Ux4; break;
6036 
6037       case 0xE5: op = Iop_MulHi16Sx4; break;
6038       case 0xD5: op = Iop_Mul16x4; break;
6039       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
6040 
6041       case 0x74: op = Iop_CmpEQ8x8; break;
6042       case 0x75: op = Iop_CmpEQ16x4; break;
6043       case 0x76: op = Iop_CmpEQ32x2; break;
6044 
6045       case 0x64: op = Iop_CmpGT8Sx8; break;
6046       case 0x65: op = Iop_CmpGT16Sx4; break;
6047       case 0x66: op = Iop_CmpGT32Sx2; break;
6048 
6049       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
6050       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
6051       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
6052 
6053       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
6054       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
6055       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
6056 
6057       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
6058       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
6059       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
6060 
6061       case 0xDB: op = Iop_And64; break;
6062       case 0xDF: op = Iop_And64; invG = True; break;
6063       case 0xEB: op = Iop_Or64; break;
6064       case 0xEF: /* Possibly do better here if argL and argR are the
6065                     same reg */
6066                  op = Iop_Xor64; break;
6067 
6068       /* Introduced in SSE1 */
6069       case 0xE0: op = Iop_Avg8Ux8;    break;
6070       case 0xE3: op = Iop_Avg16Ux4;   break;
6071       case 0xEE: op = Iop_Max16Sx4;   break;
6072       case 0xDE: op = Iop_Max8Ux8;    break;
6073       case 0xEA: op = Iop_Min16Sx4;   break;
6074       case 0xDA: op = Iop_Min8Ux8;    break;
6075       case 0xE4: op = Iop_MulHi16Ux4; break;
6076       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
6077 
6078       /* Introduced in SSE2 */
6079       case 0xD4: op = Iop_Add64; break;
6080       case 0xFB: op = Iop_Sub64; break;
6081 
6082       default:
6083          vex_printf("\n0x%x\n", opc);
6084          vpanic("dis_MMXop_regmem_to_reg");
6085    }
6086 
6087 #  undef XXX
6088 
6089    argG = getMMXReg(gregOfRM(modrm));
6090    if (invG)
6091       argG = unop(Iop_Not64, argG);
6092 
6093    if (isReg) {
6094       delta++;
6095       argE = getMMXReg(eregOfRM(modrm));
6096    } else {
6097       Int    len;
6098       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6099       delta += len;
6100       argE = loadLE(Ity_I64, mkexpr(addr));
6101    }
6102 
6103    if (eLeft) {
6104       argL = argE;
6105       argR = argG;
6106    } else {
6107       argL = argG;
6108       argR = argE;
6109    }
6110 
6111    if (op != Iop_INVALID) {
6112       vassert(hName == NULL);
6113       vassert(hAddr == NULL);
6114       assign(res, binop(op, argL, argR));
6115    } else {
6116       vassert(hName != NULL);
6117       vassert(hAddr != NULL);
6118       assign( res,
6119               mkIRExprCCall(
6120                  Ity_I64,
6121                  0/*regparms*/, hName, hAddr,
6122                  mkIRExprVec_2( argL, argR )
6123               )
6124             );
6125    }
6126 
6127    putMMXReg( gregOfRM(modrm), mkexpr(res) );
6128 
6129    DIP("%s%s %s, %s\n",
6130        name, show_granularity ? nameMMXGran(opc & 3) : "",
6131        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
6132        nameMMXReg(gregOfRM(modrm)) );
6133 
6134    return delta;
6135 }
6136 
6137 
6138 /* Vector by scalar shift of G by the amount specified at the bottom
6139    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
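/* For example, "psrlw %mm1,%mm0" shifts each 16-bit lane of %mm0 right
   by the count held in the low 32 bits of %mm1.  Counts of lane-width
   or more give an all-zeroes result; for the arithmetic shifts the
   count is instead clamped to lane-width minus 1 (sign fill). */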
6140 
6141 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
6142                                  const HChar* opname, IROp op )
6143 {
6144    HChar   dis_buf[50];
6145    Int     alen, size;
6146    IRTemp  addr;
6147    Bool    shl, shr, sar;
6148    UChar   rm   = getIByte(delta);
6149    IRTemp  g0   = newTemp(Ity_I64);
6150    IRTemp  g1   = newTemp(Ity_I64);
6151    IRTemp  amt  = newTemp(Ity_I32);
6152    IRTemp  amt8 = newTemp(Ity_I8);
6153 
6154    if (epartIsReg(rm)) {
6155       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
6156       DIP("%s %s,%s\n", opname,
6157                         nameMMXReg(eregOfRM(rm)),
6158                         nameMMXReg(gregOfRM(rm)) );
6159       delta++;
6160    } else {
6161       addr = disAMode ( &alen, sorb, delta, dis_buf );
6162       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
6163       DIP("%s %s,%s\n", opname,
6164                         dis_buf,
6165                         nameMMXReg(gregOfRM(rm)) );
6166       delta += alen;
6167    }
6168    assign( g0,   getMMXReg(gregOfRM(rm)) );
6169    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
6170 
6171    shl = shr = sar = False;
6172    size = 0;
6173    switch (op) {
6174       case Iop_ShlN16x4: shl = True; size = 16; break;
6175       case Iop_ShlN32x2: shl = True; size = 32; break;
6176       case Iop_Shl64:    shl = True; size = 64; break;
6177       case Iop_ShrN16x4: shr = True; size = 16; break;
6178       case Iop_ShrN32x2: shr = True; size = 32; break;
6179       case Iop_Shr64:    shr = True; size = 64; break;
6180       case Iop_SarN16x4: sar = True; size = 16; break;
6181       case Iop_SarN32x2: sar = True; size = 32; break;
6182       default: vassert(0);
6183    }
6184 
6185    if (shl || shr) {
6186      assign(
6187         g1,
6188         IRExpr_ITE(
6189            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
6190            binop(op, mkexpr(g0), mkexpr(amt8)),
6191            mkU64(0)
6192         )
6193      );
6194    } else
6195    if (sar) {
6196      assign(
6197         g1,
6198         IRExpr_ITE(
6199            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
6200            binop(op, mkexpr(g0), mkexpr(amt8)),
6201            binop(op, mkexpr(g0), mkU8(size-1))
6202         )
6203      );
6204    } else {
6205       /*NOTREACHED*/
6206       vassert(0);
6207    }
6208 
6209    putMMXReg( gregOfRM(rm), mkexpr(g1) );
6210    return delta;
6211 }
6212 
6213 
6214 /* Vector by scalar shift of E by an immediate byte.  This is a
6215    straight copy of dis_SSE_shiftE_imm. */
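/* E.g. "psllw $3, %mm2" shifts each 16-bit lane of %mm2 left by 3; an
   immediate of 16 or more zeroes the register, whilst for the
   arithmetic shifts it is treated as lane-width minus 1. */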
6216 
6217 static
6218 UInt dis_MMX_shiftE_imm ( Int delta, const HChar* opname, IROp op )
6219 {
6220    Bool    shl, shr, sar;
6221    UChar   rm   = getIByte(delta);
6222    IRTemp  e0   = newTemp(Ity_I64);
6223    IRTemp  e1   = newTemp(Ity_I64);
6224    UChar   amt, size;
6225    vassert(epartIsReg(rm));
6226    vassert(gregOfRM(rm) == 2
6227            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
6228    amt = getIByte(delta+1);
6229    delta += 2;
6230    DIP("%s $%d,%s\n", opname,
6231                       (Int)amt,
6232                       nameMMXReg(eregOfRM(rm)) );
6233 
6234    assign( e0, getMMXReg(eregOfRM(rm)) );
6235 
6236    shl = shr = sar = False;
6237    size = 0;
6238    switch (op) {
6239       case Iop_ShlN16x4: shl = True; size = 16; break;
6240       case Iop_ShlN32x2: shl = True; size = 32; break;
6241       case Iop_Shl64:    shl = True; size = 64; break;
6242       case Iop_SarN16x4: sar = True; size = 16; break;
6243       case Iop_SarN32x2: sar = True; size = 32; break;
6244       case Iop_ShrN16x4: shr = True; size = 16; break;
6245       case Iop_ShrN32x2: shr = True; size = 32; break;
6246       case Iop_Shr64:    shr = True; size = 64; break;
6247       default: vassert(0);
6248    }
6249 
6250    if (shl || shr) {
6251       assign( e1, amt >= size
6252                      ? mkU64(0)
6253                      : binop(op, mkexpr(e0), mkU8(amt))
6254       );
6255    } else
6256    if (sar) {
6257       assign( e1, amt >= size
6258                      ? binop(op, mkexpr(e0), mkU8(size-1))
6259                      : binop(op, mkexpr(e0), mkU8(amt))
6260       );
6261    } else {
6262       /*NOTREACHED*/
6263       vassert(0);
6264    }
6265 
6266    putMMXReg( eregOfRM(rm), mkexpr(e1) );
6267    return delta;
6268 }
6269 
6270 
6271 /* Completely handle all MMX instructions except emms. */
6272 
6273 static
6274 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
6275 {
6276    Int   len;
6277    UChar modrm;
6278    HChar dis_buf[50];
6279    UChar opc = getIByte(delta);
6280    delta++;
6281 
6282    /* dis_MMX handles all insns except emms. */
6283    do_MMX_preamble();
6284 
6285    switch (opc) {
6286 
6287       case 0x6E:
6288          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
6289          if (sz != 4)
6290             goto mmx_decode_failure;
6291          modrm = getIByte(delta);
6292          if (epartIsReg(modrm)) {
6293             delta++;
6294             putMMXReg(
6295                gregOfRM(modrm),
6296                binop( Iop_32HLto64,
6297                       mkU32(0),
6298                       getIReg(4, eregOfRM(modrm)) ) );
6299             DIP("movd %s, %s\n",
6300                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
6301          } else {
6302             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6303             delta += len;
6304             putMMXReg(
6305                gregOfRM(modrm),
6306                binop( Iop_32HLto64,
6307                       mkU32(0),
6308                       loadLE(Ity_I32, mkexpr(addr)) ) );
6309             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
6310          }
6311          break;
6312 
6313       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
6314          if (sz != 4)
6315             goto mmx_decode_failure;
6316          modrm = getIByte(delta);
6317          if (epartIsReg(modrm)) {
6318             delta++;
6319             putIReg( 4, eregOfRM(modrm),
6320                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
6321             DIP("movd %s, %s\n",
6322                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
6323          } else {
6324             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6325             delta += len;
6326             storeLE( mkexpr(addr),
6327                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
6328             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
6329          }
6330          break;
6331 
6332       case 0x6F:
6333          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
6334          if (sz != 4)
6335             goto mmx_decode_failure;
6336          modrm = getIByte(delta);
6337          if (epartIsReg(modrm)) {
6338             delta++;
6339             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
6340             DIP("movq %s, %s\n",
6341                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
6342          } else {
6343             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6344             delta += len;
6345             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
6346             DIP("movq %s, %s\n",
6347                 dis_buf, nameMMXReg(gregOfRM(modrm)));
6348          }
6349          break;
6350 
6351       case 0x7F:
6352          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
6353          if (sz != 4)
6354             goto mmx_decode_failure;
6355          modrm = getIByte(delta);
6356          if (epartIsReg(modrm)) {
6357             delta++;
6358             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
6359             DIP("movq %s, %s\n",
6360                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
6361          } else {
6362             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6363             delta += len;
6364             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
6365             DIP("mov(nt)q %s, %s\n",
6366                 nameMMXReg(gregOfRM(modrm)), dis_buf);
6367          }
6368          break;
6369 
6370       case 0xFC:
6371       case 0xFD:
6372       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
6373          if (sz != 4)
6374             goto mmx_decode_failure;
6375          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
6376          break;
6377 
6378       case 0xEC:
6379       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
6380          if (sz != 4)
6381             goto mmx_decode_failure;
6382          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
6383          break;
6384 
6385       case 0xDC:
6386       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
6387          if (sz != 4)
6388             goto mmx_decode_failure;
6389          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
6390          break;
6391 
6392       case 0xF8:
6393       case 0xF9:
6394       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
6395          if (sz != 4)
6396             goto mmx_decode_failure;
6397          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
6398          break;
6399 
6400       case 0xE8:
6401       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
6402          if (sz != 4)
6403             goto mmx_decode_failure;
6404          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
6405          break;
6406 
6407       case 0xD8:
6408       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
6409          if (sz != 4)
6410             goto mmx_decode_failure;
6411          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
6412          break;
6413 
6414       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
6415          if (sz != 4)
6416             goto mmx_decode_failure;
6417          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
6418          break;
6419 
6420       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
6421          if (sz != 4)
6422             goto mmx_decode_failure;
6423          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
6424          break;
6425 
6426       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
6427          vassert(sz == 4);
6428          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
6429          break;
6430 
6431       case 0x74:
6432       case 0x75:
6433       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
6434          if (sz != 4)
6435             goto mmx_decode_failure;
6436          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
6437          break;
6438 
6439       case 0x64:
6440       case 0x65:
6441       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
6442          if (sz != 4)
6443             goto mmx_decode_failure;
6444          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
6445          break;
6446 
6447       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
6448          if (sz != 4)
6449             goto mmx_decode_failure;
6450          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
6451          break;
6452 
6453       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
6454          if (sz != 4)
6455             goto mmx_decode_failure;
6456          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
6457          break;
6458 
6459       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
6460          if (sz != 4)
6461             goto mmx_decode_failure;
6462          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
6463          break;
6464 
6465       case 0x68:
6466       case 0x69:
6467       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
6468          if (sz != 4)
6469             goto mmx_decode_failure;
6470          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
6471          break;
6472 
6473       case 0x60:
6474       case 0x61:
6475       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
6476          if (sz != 4)
6477             goto mmx_decode_failure;
6478          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
6479          break;
6480 
6481       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
6482          if (sz != 4)
6483             goto mmx_decode_failure;
6484          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
6485          break;
6486 
6487       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
6488          if (sz != 4)
6489             goto mmx_decode_failure;
6490          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
6491          break;
6492 
6493       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
6494          if (sz != 4)
6495             goto mmx_decode_failure;
6496          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
6497          break;
6498 
6499       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
6500          if (sz != 4)
6501             goto mmx_decode_failure;
6502          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
6503          break;
6504 
6505 #     define SHIFT_BY_REG(_name,_op)                                 \
6506                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
6507                 break;
6508 
6509       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
6510       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
6511       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
6512       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
6513 
6514       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
6515       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
6516       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
6517       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
6518 
6519       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
6520       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
6521       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
6522 
6523 #     undef SHIFT_BY_REG
6524 
6525       case 0x71:
6526       case 0x72:
6527       case 0x73: {
6528          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
6529          UChar byte2, subopc;
6530          if (sz != 4)
6531             goto mmx_decode_failure;
6532          byte2  = getIByte(delta);           /* amode / sub-opcode */
6533          subopc = toUChar( (byte2 >> 3) & 7 );
6534 
6535 #        define SHIFT_BY_IMM(_name,_op)                         \
6536              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
6537              } while (0)
6538 
6539               if (subopc == 2 /*SRL*/ && opc == 0x71)
6540                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
6541          else if (subopc == 2 /*SRL*/ && opc == 0x72)
6542                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
6543          else if (subopc == 2 /*SRL*/ && opc == 0x73)
6544                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
6545 
6546          else if (subopc == 4 /*SAR*/ && opc == 0x71)
6547                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
6548          else if (subopc == 4 /*SAR*/ && opc == 0x72)
6549                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
6550 
6551          else if (subopc == 6 /*SHL*/ && opc == 0x71)
6552                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
6553          else if (subopc == 6 /*SHL*/ && opc == 0x72)
6554                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
6555          else if (subopc == 6 /*SHL*/ && opc == 0x73)
6556                  SHIFT_BY_IMM("psllq", Iop_Shl64);
6557 
6558          else goto mmx_decode_failure;
6559 
6560 #        undef SHIFT_BY_IMM
6561          break;
6562       }
6563 
6564       case 0xF7: {
6565          IRTemp addr    = newTemp(Ity_I32);
6566          IRTemp regD    = newTemp(Ity_I64);
6567          IRTemp regM    = newTemp(Ity_I64);
6568          IRTemp mask    = newTemp(Ity_I64);
6569          IRTemp olddata = newTemp(Ity_I64);
6570          IRTemp newdata = newTemp(Ity_I64);
6571 
6572          modrm = getIByte(delta);
6573          if (sz != 4 || (!epartIsReg(modrm)))
6574             goto mmx_decode_failure;
6575          delta++;
6576 
6577          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
6578          assign( regM, getMMXReg( eregOfRM(modrm) ));
6579          assign( regD, getMMXReg( gregOfRM(modrm) ));
6580          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
6581          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
6582          assign( newdata,
6583                  binop(Iop_Or64,
6584                        binop(Iop_And64,
6585                              mkexpr(regD),
6586                              mkexpr(mask) ),
6587                        binop(Iop_And64,
6588                              mkexpr(olddata),
6589                              unop(Iop_Not64, mkexpr(mask)))) );
6590          storeLE( mkexpr(addr), mkexpr(newdata) );
6591          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
6592                                  nameMMXReg( gregOfRM(modrm) ) );
6593          break;
6594       }
6595 
6596       /* --- MMX decode failure --- */
6597       default:
6598       mmx_decode_failure:
6599          *decode_ok = False;
6600          return delta; /* ignored */
6601 
6602    }
6603 
6604    *decode_ok = True;
6605    return delta;
6606 }
6607 
6608 
6609 /*------------------------------------------------------------*/
6610 /*--- More misc arithmetic and other obscure insns.        ---*/
6611 /*------------------------------------------------------------*/
6612 
6613 /* Double length left and right shifts.  Apparently only required in
6614    v-size (no b- variant). */
6615 static
6616 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
6617                        Int delta, UChar modrm,
6618                        Int sz,
6619                        IRExpr* shift_amt,
6620                        Bool amt_is_literal,
6621                        const HChar* shift_amt_txt,
6622                        Bool left_shift )
6623 {
6624    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
6625       for printing it.   And eip on entry points at the modrm byte. */
6626    Int len;
6627    HChar dis_buf[50];
6628 
6629    IRType ty       = szToITy(sz);
6630    IRTemp gsrc     = newTemp(ty);
6631    IRTemp esrc     = newTemp(ty);
6632    IRTemp addr     = IRTemp_INVALID;
6633    IRTemp tmpSH    = newTemp(Ity_I8);
6634    IRTemp tmpL     = IRTemp_INVALID;
6635    IRTemp tmpRes   = IRTemp_INVALID;
6636    IRTemp tmpSubSh = IRTemp_INVALID;
6637    IROp   mkpair;
6638    IROp   getres;
6639    IROp   shift;
6640    IRExpr* mask = NULL;
6641 
6642    vassert(sz == 2 || sz == 4);
6643 
6644    /* The E-part is the destination; this is shifted.  The G-part
6645       supplies bits to be shifted into the E-part, but is not
6646       changed.
6647 
6648       If shifting left, form a double-length word with E at the top
6649       and G at the bottom, and shift this left.  The result is then in
6650       the high part.
6651 
6652       If shifting right, form a double-length word with G at the top
6653       and E at the bottom, and shift this right.  The result is then
6654       at the bottom.  */
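   /* Illustration: for "shldl $8, %ebx, %eax" (sz == 4, left shift)
      the 64-bit value %eax:%ebx is shifted left by 8 and the top half
      taken, giving (%eax << 8) | (%ebx >> 24), so the bottom 8 bits of
      the result are filled from the top of %ebx. */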
6655 
6656    /* Fetch the operands. */
6657 
6658    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
6659 
6660    if (epartIsReg(modrm)) {
6661       delta++;
6662       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
6663       DIP("sh%cd%c %s, %s, %s\n",
6664           ( left_shift ? 'l' : 'r' ), nameISize(sz),
6665           shift_amt_txt,
6666           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
6667    } else {
6668       addr = disAMode ( &len, sorb, delta, dis_buf );
6669       delta += len;
6670       assign( esrc, loadLE(ty, mkexpr(addr)) );
6671       DIP("sh%cd%c %s, %s, %s\n",
6672           ( left_shift ? 'l' : 'r' ), nameISize(sz),
6673           shift_amt_txt,
6674           nameIReg(sz, gregOfRM(modrm)), dis_buf);
6675    }
6676 
6677    /* Round up the relevant primops. */
6678 
6679    if (sz == 4) {
6680       tmpL     = newTemp(Ity_I64);
6681       tmpRes   = newTemp(Ity_I32);
6682       tmpSubSh = newTemp(Ity_I32);
6683       mkpair   = Iop_32HLto64;
6684       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
6685       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
6686       mask     = mkU8(31);
6687    } else {
6688       /* sz == 2 */
6689       tmpL     = newTemp(Ity_I32);
6690       tmpRes   = newTemp(Ity_I16);
6691       tmpSubSh = newTemp(Ity_I16);
6692       mkpair   = Iop_16HLto32;
6693       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
6694       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
6695       mask     = mkU8(15);
6696    }
6697 
6698    /* Do the shift, calculate the subshift value, and set
6699       the flag thunk. */
6700 
6701    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
6702 
6703    if (left_shift)
6704       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
6705    else
6706       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
6707 
6708    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
6709    assign( tmpSubSh,
6710            unop(getres,
6711                 binop(shift,
6712                       mkexpr(tmpL),
6713                       binop(Iop_And8,
6714                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
6715                             mask))) );
6716 
6717    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
6718                               tmpRes, tmpSubSh, ty, tmpSH );
6719 
6720    /* Put result back. */
6721 
6722    if (epartIsReg(modrm)) {
6723       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
6724    } else {
6725       storeLE( mkexpr(addr), mkexpr(tmpRes) );
6726    }
6727 
6728    if (amt_is_literal) delta++;
6729    return delta;
6730 }
6731 
6732 
6733 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
6734    required. */
6735 
6736 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
6737 
6738 static const HChar* nameBtOp ( BtOp op )
6739 {
6740    switch (op) {
6741       case BtOpNone:  return "";
6742       case BtOpSet:   return "s";
6743       case BtOpReset: return "r";
6744       case BtOpComp:  return "c";
6745       default: vpanic("nameBtOp(x86)");
6746    }
6747 }
6748 
6749 
6750 static
6751 UInt dis_bt_G_E ( const VexAbiInfo* vbi,
6752                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
6753 {
6754    HChar  dis_buf[50];
6755    UChar  modrm;
6756    Int    len;
6757    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
6758           t_addr1, t_esp, t_mask, t_new;
6759 
6760    vassert(sz == 2 || sz == 4);
6761 
6762    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
6763              = t_addr0 = t_addr1 = t_esp
6764              = t_mask = t_new = IRTemp_INVALID;
6765 
6766    t_fetched = newTemp(Ity_I8);
6767    t_new     = newTemp(Ity_I8);
6768    t_bitno0  = newTemp(Ity_I32);
6769    t_bitno1  = newTemp(Ity_I32);
6770    t_bitno2  = newTemp(Ity_I8);
6771    t_addr1   = newTemp(Ity_I32);
6772    modrm     = getIByte(delta);
6773 
6774    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
6775 
6776    if (epartIsReg(modrm)) {
6777       delta++;
6778       /* Get it onto the client's stack. */
6779       t_esp = newTemp(Ity_I32);
6780       t_addr0 = newTemp(Ity_I32);
6781 
6782       /* For the choice of the value 128, see comment in dis_bt_G_E in
6783          guest_amd64_toIR.c.  We point out here only that 128 is
6784          fast-cased in Memcheck and is > 0, so seems like a good
6785          choice. */
6786       vassert(vbi->guest_stack_redzone_size == 0);
6787       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
6788       putIReg(4, R_ESP, mkexpr(t_esp));
6789 
6790       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
6791 
6792       /* Make t_addr0 point at it. */
6793       assign( t_addr0, mkexpr(t_esp) );
6794 
6795       /* Mask out upper bits of the shift amount, since we're doing a
6796          reg. */
6797       assign( t_bitno1, binop(Iop_And32,
6798                               mkexpr(t_bitno0),
6799                               mkU32(sz == 4 ? 31 : 15)) );
6800 
6801    } else {
6802       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
6803       delta += len;
6804       assign( t_bitno1, mkexpr(t_bitno0) );
6805    }
6806 
6807    /* At this point: t_addr0 is the address being operated on.  If it
6808       was a reg, we will have pushed it onto the client's stack.
6809       t_bitno1 is the bit number, suitably masked in the case of a
6810       reg.  */
6811 
6812    /* Now the main sequence. */
6813    assign( t_addr1,
6814            binop(Iop_Add32,
6815                  mkexpr(t_addr0),
6816                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
6817 
6818    /* t_addr1 now holds effective address */
6819 
6820    assign( t_bitno2,
6821            unop(Iop_32to8,
6822                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
6823 
6824    /* t_bitno2 contains offset of bit within byte */
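   /* E.g. (illustrative) "btl %eax, (%ebx)" with %eax holding 35 reads
      the byte at %ebx + 4 (35 >>s 3) and tests bit 3 (35 & 7) within
      it. */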
6825 
6826    if (op != BtOpNone) {
6827       t_mask = newTemp(Ity_I8);
6828       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
6829    }
6830 
6831    /* t_mask is now a suitable byte mask */
6832 
6833    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
6834 
6835    if (op != BtOpNone) {
6836       switch (op) {
6837          case BtOpSet:
6838             assign( t_new,
6839                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
6840             break;
6841          case BtOpComp:
6842             assign( t_new,
6843                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
6844             break;
6845          case BtOpReset:
6846             assign( t_new,
6847                     binop(Iop_And8, mkexpr(t_fetched),
6848                                     unop(Iop_Not8, mkexpr(t_mask))) );
6849             break;
6850          default:
6851             vpanic("dis_bt_G_E(x86)");
6852       }
6853       if (locked && !epartIsReg(modrm)) {
6854          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
6855                                  mkexpr(t_new)/*new*/,
6856                                  guest_EIP_curr_instr );
6857       } else {
6858          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
6859       }
6860    }
6861 
6862    /* Side effect done; now get selected bit into Carry flag */
6863    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
6864    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6865    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6866    stmt( IRStmt_Put(
6867             OFFB_CC_DEP1,
6868             binop(Iop_And32,
6869                   binop(Iop_Shr32,
6870                         unop(Iop_8Uto32, mkexpr(t_fetched)),
6871                         mkexpr(t_bitno2)),
6872                   mkU32(1)))
6873        );
6874    /* Set NDEP even though it isn't used.  This makes redundant-PUT
6875       elimination of previous stores to this field work better. */
6876    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6877 
6878    /* Move reg operand from stack back to reg */
6879    if (epartIsReg(modrm)) {
6880       /* t_esp still points at it. */
6881       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
6882       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
6883    }
6884 
6885    DIP("bt%s%c %s, %s\n",
6886        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
6887        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
6888 
6889    return delta;
6890 }
6891 
6892 
6893 
6894 /* Handle BSF/BSR.  Only v-size seems necessary. */
6895 static
6896 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
6897 {
6898    Bool   isReg;
6899    UChar  modrm;
6900    HChar  dis_buf[50];
6901 
6902    IRType ty  = szToITy(sz);
6903    IRTemp src = newTemp(ty);
6904    IRTemp dst = newTemp(ty);
6905 
6906    IRTemp src32 = newTemp(Ity_I32);
6907    IRTemp dst32 = newTemp(Ity_I32);
6908    IRTemp srcB  = newTemp(Ity_I1);
6909 
6910    vassert(sz == 4 || sz == 2);
6911 
6912    modrm = getIByte(delta);
6913 
6914    isReg = epartIsReg(modrm);
6915    if (isReg) {
6916       delta++;
6917       assign( src, getIReg(sz, eregOfRM(modrm)) );
6918    } else {
6919       Int    len;
6920       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6921       delta += len;
6922       assign( src, loadLE(ty, mkexpr(addr)) );
6923    }
6924 
6925    DIP("bs%c%c %s, %s\n",
6926        fwds ? 'f' : 'r', nameISize(sz),
6927        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
6928        nameIReg(sz, gregOfRM(modrm)));
6929 
6930    /* Generate a bool expression which is zero iff the original is
6931       zero, and nonzero otherwise.  Ask for a CmpNE version which, if
6932       instrumented by Memcheck, is instrumented expensively, since
6933       this may be used on the output of a preceding movmskb insn,
6934       which has been known to be partially defined, and in need of
6935       careful handling. */
6936    assign( srcB, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
6937                        mkexpr(src), mkU(ty,0)) );
6938 
6939    /* Flags: Z is 1 iff source value is zero.  All others
6940       are undefined -- we force them to zero. */
6941    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6942    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6943    stmt( IRStmt_Put(
6944             OFFB_CC_DEP1,
6945             IRExpr_ITE( mkexpr(srcB),
6946                         /* src!=0 */
6947                         mkU32(0),
6948                         /* src==0 */
6949                         mkU32(X86G_CC_MASK_Z)
6950                         )
6951        ));
6952    /* Set NDEP even though it isn't used.  This makes redundant-PUT
6953       elimination of previous stores to this field work better. */
6954    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6955 
6956    /* Result: iff source value is zero, we can't use
6957       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
6958       But anyway, Intel x86 semantics say the result is undefined in
6959       such situations.  Hence handle the zero case specially. */
6960 
6961    /* Bleh.  What we compute:
6962 
6963           bsf32:  if src == 0 then 0 else  Ctz32(src)
6964           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
6965 
6966           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
6967           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
6968 
6969       First, widen src to 32 bits if it is not already.
6970 
6971       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
6972       dst register unchanged when src == 0.  Hence change accordingly.
6973    */
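   /* For example (purely illustrative): with src32 == 0xC0, bsf gives
      Ctz32(0xC0) == 6 and bsr gives 31 - Clz32(0xC0) == 31 - 24 == 7. */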
6974    if (sz == 2)
6975       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
6976    else
6977       assign( src32, mkexpr(src) );
6978 
6979    /* The main computation, guarding against zero. */
6980    assign( dst32,
6981            IRExpr_ITE(
6982               mkexpr(srcB),
6983               /* src != 0 */
6984               fwds ? unop(Iop_Ctz32, mkexpr(src32))
6985                    : binop(Iop_Sub32,
6986                            mkU32(31),
6987                            unop(Iop_Clz32, mkexpr(src32))),
6988               /* src == 0 -- leave dst unchanged */
6989               widenUto32( getIReg( sz, gregOfRM(modrm) ) )
6990            )
6991          );
6992 
6993    if (sz == 2)
6994       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
6995    else
6996       assign( dst, mkexpr(dst32) );
6997 
6998    /* dump result back */
6999    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
7000 
7001    return delta;
7002 }
7003 
7004 
7005 static
7006 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
7007 {
7008    IRType ty = szToITy(sz);
7009    IRTemp t1 = newTemp(ty);
7010    IRTemp t2 = newTemp(ty);
7011    vassert(sz == 2 || sz == 4);
7012    assign( t1, getIReg(sz, R_EAX) );
7013    assign( t2, getIReg(sz, reg) );
7014    putIReg( sz, R_EAX, mkexpr(t2) );
7015    putIReg( sz, reg, mkexpr(t1) );
7016    DIP("xchg%c %s, %s\n",
7017        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
7018 }
7019 
7020 
7021 static
7022 void codegen_SAHF ( void )
7023 {
7024    /* Set the flags to:
7025       (x86g_calculate_eflags_all() & X86G_CC_MASK_O)  -- retain the old O flag
7026       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
7027                 |X86G_CC_MASK_P|X86G_CC_MASK_C))
7028    */
7029    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
7030                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
7031    IRTemp oldflags   = newTemp(Ity_I32);
7032    assign( oldflags, mk_x86g_calculate_eflags_all() );
7033    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
7034    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
7035    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
7036    stmt( IRStmt_Put( OFFB_CC_DEP1,
7037          binop(Iop_Or32,
7038                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
7039                binop(Iop_And32,
7040                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
7041                      mkU32(mask_SZACP))
7042               )
7043    ));
7044    /* Set NDEP even though it isn't used.  This makes redundant-PUT
7045       elimination of previous stores to this field work better. */
7046    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
7047 }
7048 
7049 
7050 static
7051 void codegen_LAHF ( void )
7052 {
7053    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
7054    IRExpr* eax_with_hole;
7055    IRExpr* new_byte;
7056    IRExpr* new_eax;
7057    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
7058                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
7059 
7060    IRTemp  flags = newTemp(Ity_I32);
7061    assign( flags, mk_x86g_calculate_eflags_all() );
7062 
7063    eax_with_hole
7064       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
7065    new_byte
7066       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
7067                         mkU32(1<<1));
7068    new_eax
7069       = binop(Iop_Or32, eax_with_hole,
7070                         binop(Iop_Shl32, new_byte, mkU8(8)));
7071    putIReg(4, R_EAX, new_eax);
7072 }
7073 
7074 
7075 static
7076 UInt dis_cmpxchg_G_E ( UChar       sorb,
7077                        Bool        locked,
7078                        Int         size,
7079                        Int         delta0 )
7080 {
7081    HChar dis_buf[50];
7082    Int   len;
7083 
7084    IRType ty    = szToITy(size);
7085    IRTemp acc   = newTemp(ty);
7086    IRTemp src   = newTemp(ty);
7087    IRTemp dest  = newTemp(ty);
7088    IRTemp dest2 = newTemp(ty);
7089    IRTemp acc2  = newTemp(ty);
7090    IRTemp cond  = newTemp(Ity_I1);
7091    IRTemp addr  = IRTemp_INVALID;
7092    UChar  rm    = getUChar(delta0);
7093 
7094    /* There are 3 cases to consider:
7095 
7096       reg-reg: ignore any lock prefix, generate sequence based
7097                on ITE
7098 
7099       reg-mem, not locked: ignore any lock prefix, generate sequence
7100                            based on ITE
7101 
7102       reg-mem, locked: use IRCAS
7103    */
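   /* In all three cases the flag thunk is set from (acc - dest), ie
      %EAX minus the old destination value, and the resulting ZF picks
      between "dest := src" on success and "%EAX := dest" on failure
      (in the locked case the store itself is done by the IRCAS). */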
7104    if (epartIsReg(rm)) {
7105       /* case 1 */
7106       assign( dest, getIReg(size, eregOfRM(rm)) );
7107       delta0++;
7108       assign( src, getIReg(size, gregOfRM(rm)) );
7109       assign( acc, getIReg(size, R_EAX) );
7110       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
7111       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
7112       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
7113       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
7114       putIReg(size, R_EAX, mkexpr(acc2));
7115       putIReg(size, eregOfRM(rm), mkexpr(dest2));
7116       DIP("cmpxchg%c %s,%s\n", nameISize(size),
7117                                nameIReg(size,gregOfRM(rm)),
7118                                nameIReg(size,eregOfRM(rm)) );
7119    }
7120    else if (!epartIsReg(rm) && !locked) {
7121       /* case 2 */
7122       addr = disAMode ( &len, sorb, delta0, dis_buf );
7123       assign( dest, loadLE(ty, mkexpr(addr)) );
7124       delta0 += len;
7125       assign( src, getIReg(size, gregOfRM(rm)) );
7126       assign( acc, getIReg(size, R_EAX) );
7127       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
7128       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
7129       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
7130       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
7131       putIReg(size, R_EAX, mkexpr(acc2));
7132       storeLE( mkexpr(addr), mkexpr(dest2) );
7133       DIP("cmpxchg%c %s,%s\n", nameISize(size),
7134                                nameIReg(size,gregOfRM(rm)), dis_buf);
7135    }
7136    else if (!epartIsReg(rm) && locked) {
7137       /* case 3 */
7138       /* src is new value.  acc is expected value.  dest is old value.
7139          Compute success from the output of the IRCAS, and steer the
7140          new value for EAX accordingly: in case of success, EAX is
7141          unchanged. */
7142       addr = disAMode ( &len, sorb, delta0, dis_buf );
7143       delta0 += len;
7144       assign( src, getIReg(size, gregOfRM(rm)) );
7145       assign( acc, getIReg(size, R_EAX) );
7146       stmt( IRStmt_CAS(
7147          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
7148                   NULL, mkexpr(acc), NULL, mkexpr(src) )
7149       ));
7150       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
7151       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
7152       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
7153       putIReg(size, R_EAX, mkexpr(acc2));
7154       DIP("cmpxchg%c %s,%s\n", nameISize(size),
7155                                nameIReg(size,gregOfRM(rm)), dis_buf);
7156    }
7157    else vassert(0);
7158 
7159    return delta0;
7160 }
7161 
7162 
7163 /* Handle conditional move instructions of the form
7164       cmovcc E(reg-or-mem), G(reg)
7165 
7166    E(src) is reg-or-mem
7167    G(dst) is reg.
7168 
7169    If E is reg, -->    GET %E, tmps
7170                        GET %G, tmpd
7171                        CMOVcc tmps, tmpd
7172                        PUT tmpd, %G
7173 
7174    If E is mem  -->    (getAddr E) -> tmpa
7175                        LD (tmpa), tmps
7176                        GET %G, tmpd
7177                        CMOVcc tmps, tmpd
7178                        PUT tmpd, %G
7179 */
7180 static
7181 UInt dis_cmov_E_G ( UChar       sorb,
7182                     Int         sz,
7183                     X86Condcode cond,
7184                     Int         delta0 )
7185 {
7186    UChar rm  = getIByte(delta0);
7187    HChar dis_buf[50];
7188    Int   len;
7189 
7190    IRType ty   = szToITy(sz);
7191    IRTemp tmps = newTemp(ty);
7192    IRTemp tmpd = newTemp(ty);
7193 
7194    if (epartIsReg(rm)) {
7195       assign( tmps, getIReg(sz, eregOfRM(rm)) );
7196       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
7197 
7198       putIReg(sz, gregOfRM(rm),
7199                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
7200                               mkexpr(tmps),
7201                               mkexpr(tmpd) )
7202              );
7203       DIP("cmov%c%s %s,%s\n", nameISize(sz),
7204                               name_X86Condcode(cond),
7205                               nameIReg(sz,eregOfRM(rm)),
7206                               nameIReg(sz,gregOfRM(rm)));
7207       return 1+delta0;
7208    }
7209 
7210    /* E refers to memory */
7211    {
7212       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
7213       assign( tmps, loadLE(ty, mkexpr(addr)) );
7214       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
7215 
7216       putIReg(sz, gregOfRM(rm),
7217                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
7218                               mkexpr(tmps),
7219                               mkexpr(tmpd) )
7220              );
7221 
7222       DIP("cmov%c%s %s,%s\n", nameISize(sz),
7223                               name_X86Condcode(cond),
7224                               dis_buf,
7225                               nameIReg(sz,gregOfRM(rm)));
7226       return len+delta0;
7227    }
7228 }
7229 
7230 
7231 static
7232 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
7233                     Bool* decodeOK )
7234 {
7235    Int   len;
7236    UChar rm = getIByte(delta0);
7237    HChar dis_buf[50];
7238 
7239    IRType ty    = szToITy(sz);
7240    IRTemp tmpd  = newTemp(ty);
7241    IRTemp tmpt0 = newTemp(ty);
7242    IRTemp tmpt1 = newTemp(ty);
7243 
7244    /* There are 3 cases to consider:
7245 
7246       reg-reg: ignore any lock prefix,
7247                generate 'naive' (non-atomic) sequence
7248 
7249       reg-mem, not locked: ignore any lock prefix, generate 'naive'
7250                            (non-atomic) sequence
7251 
7252       reg-mem, locked: use IRCAS
7253    */
7254 
7255    if (epartIsReg(rm)) {
7256       /* case 1 */
7257       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
7258       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
7259       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
7260                            mkexpr(tmpd), mkexpr(tmpt0)) );
7261       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
7262       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
7263       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
7264       DIP("xadd%c %s, %s\n",
7265           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
7266           nameIReg(sz,eregOfRM(rm)));
7267       *decodeOK = True;
7268       return 1+delta0;
7269    }
7270    else if (!epartIsReg(rm) && !locked) {
7271       /* case 2 */
7272       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
7273       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
7274       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
7275       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
7276                            mkexpr(tmpd), mkexpr(tmpt0)) );
7277       storeLE( mkexpr(addr), mkexpr(tmpt1) );
7278       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
7279       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
7280       DIP("xadd%c %s, %s\n",
7281           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
7282       *decodeOK = True;
7283       return len+delta0;
7284    }
7285    else if (!epartIsReg(rm) && locked) {
7286       /* case 3 */
7287       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
7288       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
7289       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
7290       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
7291                            mkexpr(tmpd), mkexpr(tmpt0)) );
7292       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
7293                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
7294       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
7295       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
7296       DIP("xadd%c %s, %s\n",
7297           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
7298       *decodeOK = True;
7299       return len+delta0;
7300    }
7301    /*UNREACHED*/
7302    vassert(0);
7303 }
7304 
7305 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
7306 
7307 static
7308 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
7309 {
7310    Int    len;
7311    IRTemp addr;
7312    UChar  rm  = getIByte(delta0);
7313    HChar  dis_buf[50];
7314 
7315    if (epartIsReg(rm)) {
7316       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
7317       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
7318       return 1+delta0;
7319    } else {
7320       addr = disAMode ( &len, sorb, delta0, dis_buf );
7321       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
7322       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
7323       return len+delta0;
7324    }
7325 }
7326 
7327 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
7328    dst is ireg and sz==4, zero out top half of it.  */
7329 
7330 static
7331 UInt dis_mov_Sw_Ew ( UChar sorb,
7332                      Int   sz,
7333                      Int   delta0 )
7334 {
7335    Int    len;
7336    IRTemp addr;
7337    UChar  rm  = getIByte(delta0);
7338    HChar  dis_buf[50];
7339 
7340    vassert(sz == 2 || sz == 4);
7341 
7342    if (epartIsReg(rm)) {
7343       if (sz == 4)
7344          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
7345       else
7346          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
7347 
7348       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
7349       return 1+delta0;
7350    } else {
7351       addr = disAMode ( &len, sorb, delta0, dis_buf );
7352       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
7353       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
7354       return len+delta0;
7355    }
7356 }
7357 
7358 
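/* Push a segment register.  Note that although %esp is moved down by
   sz (2 or 4) bytes, only the 16-bit selector is stored; the rest of
   the stack slot is left untouched.  Real hardware behaviour for the
   upper half of a 32-bit push of a segment register varies between
   implementations, so this seems a reasonable simplification. */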
7359 static
7360 void dis_push_segreg ( UInt sreg, Int sz )
7361 {
7362     IRTemp t1 = newTemp(Ity_I16);
7363     IRTemp ta = newTemp(Ity_I32);
7364     vassert(sz == 2 || sz == 4);
7365 
7366     assign( t1, getSReg(sreg) );
7367     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
7368     putIReg(4, R_ESP, mkexpr(ta));
7369     storeLE( mkexpr(ta), mkexpr(t1) );
7370 
7371     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
7372 }
7373 
7374 static
7375 void dis_pop_segreg ( UInt sreg, Int sz )
7376 {
7377     IRTemp t1 = newTemp(Ity_I16);
7378     IRTemp ta = newTemp(Ity_I32);
7379     vassert(sz == 2 || sz == 4);
7380 
7381     assign( ta, getIReg(4, R_ESP) );
7382     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
7383 
7384     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
7385     putSReg( sreg, mkexpr(t1) );
7386     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
7387 }
7388 
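/* Handle RET and RET $imm16: pop the 32-bit return address, add
   4+d32 to %esp (d32 being the optional immediate, used to discard
   stack arguments), and jump to the popped address. */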
7389 static
7390 void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
7391 {
7392    IRTemp t1 = newTemp(Ity_I32);
7393    IRTemp t2 = newTemp(Ity_I32);
7394    assign(t1, getIReg(4,R_ESP));
7395    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
7396    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
7397    jmp_treg(dres, Ijk_Ret, t2);
7398    vassert(dres->whatNext == Dis_StopHere);
7399 }
7400 
7401 /*------------------------------------------------------------*/
7402 /*--- SSE/SSE2/SSE3 helpers                                ---*/
7403 /*------------------------------------------------------------*/
7404 
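/* A note on naming: as elsewhere in this file, "G" denotes the
   register operand selected by the reg field of the modrm byte, and
   "E" denotes the reg-or-memory operand selected by its mod and rm
   fields.  So the "E to G" helpers below compute into the register
   named by the reg field. */
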
7405 /* Indicates whether the op requires a rounding-mode argument.  Note
7406    that this covers only vector floating point arithmetic ops, and
7407    omits the scalar ones that need rounding modes.  Note also that
7408    inconsistencies here will get picked up later by the IR sanity
7409    checker, so this isn't correctness-critical. */
7410 static Bool requiresRMode ( IROp op )
7411 {
7412    switch (op) {
7413       /* 128 bit ops */
7414       case Iop_Add32Fx4: case Iop_Sub32Fx4:
7415       case Iop_Mul32Fx4: case Iop_Div32Fx4:
7416       case Iop_Add64Fx2: case Iop_Sub64Fx2:
7417       case Iop_Mul64Fx2: case Iop_Div64Fx2:
7418          return True;
7419       default:
7420          break;
7421    }
7422    return False;
7423 }
7424 
7425 
7426 /* Worker function; do not call directly.
7427    Handles full width G = G `op` E   and   G = (not G) `op` E.
7428 */
7429 
7430 static UInt dis_SSE_E_to_G_all_wrk (
7431                UChar sorb, Int delta,
7432                const HChar* opname, IROp op,
7433                Bool   invertG
7434             )
7435 {
7436    HChar   dis_buf[50];
7437    Int     alen;
7438    IRTemp  addr;
7439    UChar   rm = getIByte(delta);
7440    IRExpr* gpart
7441       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
7442                 : getXMMReg(gregOfRM(rm));
7443    if (epartIsReg(rm)) {
7444       putXMMReg(
7445          gregOfRM(rm),
7446          requiresRMode(op)
7447             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
7448                         gpart,
7449                         getXMMReg(eregOfRM(rm)))
7450             : binop(op, gpart,
7451                         getXMMReg(eregOfRM(rm)))
7452       );
7453       DIP("%s %s,%s\n", opname,
7454                         nameXMMReg(eregOfRM(rm)),
7455                         nameXMMReg(gregOfRM(rm)) );
7456       return delta+1;
7457    } else {
7458       addr = disAMode ( &alen, sorb, delta, dis_buf );
7459       putXMMReg(
7460          gregOfRM(rm),
7461          requiresRMode(op)
7462             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
7463                         gpart,
7464                         loadLE(Ity_V128, mkexpr(addr)))
7465             : binop(op, gpart,
7466                         loadLE(Ity_V128, mkexpr(addr)))
7467       );
7468       DIP("%s %s,%s\n", opname,
7469                         dis_buf,
7470                         nameXMMReg(gregOfRM(rm)) );
7471       return delta+alen;
7472    }
7473 }
7474 
7475 
7476 /* All lanes SSE binary operation, G = G `op` E. */
7477 
7478 static
7479 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, const HChar* opname, IROp op )
7480 {
7481    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
7482 }
7483 
7484 /* All lanes SSE binary operation, G = (not G) `op` E. */
7485 
7486 static
7487 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
7488                                const HChar* opname, IROp op )
7489 {
7490    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
7491 }
7492 
7493 
7494 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
7495 
7496 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
7497                                   const HChar* opname, IROp op )
7498 {
7499    HChar   dis_buf[50];
7500    Int     alen;
7501    IRTemp  addr;
7502    UChar   rm = getIByte(delta);
7503    IRExpr* gpart = getXMMReg(gregOfRM(rm));
7504    if (epartIsReg(rm)) {
7505       putXMMReg( gregOfRM(rm),
7506                  binop(op, gpart,
7507                            getXMMReg(eregOfRM(rm))) );
7508       DIP("%s %s,%s\n", opname,
7509                         nameXMMReg(eregOfRM(rm)),
7510                         nameXMMReg(gregOfRM(rm)) );
7511       return delta+1;
7512    } else {
7513       /* We can only do a 32-bit memory read, so the upper 3/4 of the
7514          E operand needs to be made simply of zeroes. */
7515       IRTemp epart = newTemp(Ity_V128);
7516       addr = disAMode ( &alen, sorb, delta, dis_buf );
7517       assign( epart, unop( Iop_32UtoV128,
7518                            loadLE(Ity_I32, mkexpr(addr))) );
7519       putXMMReg( gregOfRM(rm),
7520                  binop(op, gpart, mkexpr(epart)) );
7521       DIP("%s %s,%s\n", opname,
7522                         dis_buf,
7523                         nameXMMReg(gregOfRM(rm)) );
7524       return delta+alen;
7525    }
7526 }
7527 
7528 
7529 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
7530 
7531 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
7532                                   const HChar* opname, IROp op )
7533 {
7534    HChar   dis_buf[50];
7535    Int     alen;
7536    IRTemp  addr;
7537    UChar   rm = getIByte(delta);
7538    IRExpr* gpart = getXMMReg(gregOfRM(rm));
7539    if (epartIsReg(rm)) {
7540       putXMMReg( gregOfRM(rm),
7541                  binop(op, gpart,
7542                            getXMMReg(eregOfRM(rm))) );
7543       DIP("%s %s,%s\n", opname,
7544                         nameXMMReg(eregOfRM(rm)),
7545                         nameXMMReg(gregOfRM(rm)) );
7546       return delta+1;
7547    } else {
7548       /* We can only do a 64-bit memory read, so the upper half of the
7549          E operand needs to be made simply of zeroes. */
7550       IRTemp epart = newTemp(Ity_V128);
7551       addr = disAMode ( &alen, sorb, delta, dis_buf );
7552       assign( epart, unop( Iop_64UtoV128,
7553                            loadLE(Ity_I64, mkexpr(addr))) );
7554       putXMMReg( gregOfRM(rm),
7555                  binop(op, gpart, mkexpr(epart)) );
7556       DIP("%s %s,%s\n", opname,
7557                         dis_buf,
7558                         nameXMMReg(gregOfRM(rm)) );
7559       return delta+alen;
7560    }
7561 }
7562 
7563 
7564 /* All lanes unary SSE operation, G = op(E). */
7565 
7566 static UInt dis_SSE_E_to_G_unary_all (
7567                UChar sorb, Int delta,
7568                const HChar* opname, IROp op
7569             )
7570 {
7571    HChar   dis_buf[50];
7572    Int     alen;
7573    IRTemp  addr;
7574    UChar   rm = getIByte(delta);
7575    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
7576    // up in the usual way.
7577    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
7578    if (epartIsReg(rm)) {
7579       IRExpr* src = getXMMReg(eregOfRM(rm));
7580       /* XXXROUNDINGFIXME */
7581       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
7582                               : unop(op, src);
7583       putXMMReg( gregOfRM(rm), res );
7584       DIP("%s %s,%s\n", opname,
7585                         nameXMMReg(eregOfRM(rm)),
7586                         nameXMMReg(gregOfRM(rm)) );
7587       return delta+1;
7588    } else {
7589       addr = disAMode ( &alen, sorb, delta, dis_buf );
7590       IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
7591       /* XXXROUNDINGFIXME */
7592       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
7593                               : unop(op, src);
7594       putXMMReg( gregOfRM(rm), res );
7595       DIP("%s %s,%s\n", opname,
7596                         dis_buf,
7597                         nameXMMReg(gregOfRM(rm)) );
7598       return delta+alen;
7599    }
7600 }
7601 
7602 
7603 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
7604 
7605 static UInt dis_SSE_E_to_G_unary_lo32 (
7606                UChar sorb, Int delta,
7607                const HChar* opname, IROp op
7608             )
7609 {
7610    /* First we need to get the old G value and patch the low 32 bits
7611       of the E operand into it.  Then apply op and write back to G. */
7612    HChar   dis_buf[50];
7613    Int     alen;
7614    IRTemp  addr;
7615    UChar   rm = getIByte(delta);
7616    IRTemp  oldG0 = newTemp(Ity_V128);
7617    IRTemp  oldG1 = newTemp(Ity_V128);
7618 
7619    assign( oldG0, getXMMReg(gregOfRM(rm)) );
7620 
7621    if (epartIsReg(rm)) {
7622       assign( oldG1,
7623               binop( Iop_SetV128lo32,
7624                      mkexpr(oldG0),
7625                      getXMMRegLane32(eregOfRM(rm), 0)) );
7626       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7627       DIP("%s %s,%s\n", opname,
7628                         nameXMMReg(eregOfRM(rm)),
7629                         nameXMMReg(gregOfRM(rm)) );
7630       return delta+1;
7631    } else {
7632       addr = disAMode ( &alen, sorb, delta, dis_buf );
7633       assign( oldG1,
7634               binop( Iop_SetV128lo32,
7635                      mkexpr(oldG0),
7636                      loadLE(Ity_I32, mkexpr(addr)) ));
7637       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7638       DIP("%s %s,%s\n", opname,
7639                         dis_buf,
7640                         nameXMMReg(gregOfRM(rm)) );
7641       return delta+alen;
7642    }
7643 }
7644 
7645 
7646 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
7647 
7648 static UInt dis_SSE_E_to_G_unary_lo64 (
7649                UChar sorb, Int delta,
7650                const HChar* opname, IROp op
7651             )
7652 {
7653    /* First we need to get the old G value and patch the low 64 bits
7654       of the E operand into it.  Then apply op and write back to G. */
7655    HChar   dis_buf[50];
7656    Int     alen;
7657    IRTemp  addr;
7658    UChar   rm = getIByte(delta);
7659    IRTemp  oldG0 = newTemp(Ity_V128);
7660    IRTemp  oldG1 = newTemp(Ity_V128);
7661 
7662    assign( oldG0, getXMMReg(gregOfRM(rm)) );
7663 
7664    if (epartIsReg(rm)) {
7665       assign( oldG1,
7666               binop( Iop_SetV128lo64,
7667                      mkexpr(oldG0),
7668                      getXMMRegLane64(eregOfRM(rm), 0)) );
7669       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7670       DIP("%s %s,%s\n", opname,
7671                         nameXMMReg(eregOfRM(rm)),
7672                         nameXMMReg(gregOfRM(rm)) );
7673       return delta+1;
7674    } else {
7675       addr = disAMode ( &alen, sorb, delta, dis_buf );
7676       assign( oldG1,
7677               binop( Iop_SetV128lo64,
7678                      mkexpr(oldG0),
7679                      loadLE(Ity_I64, mkexpr(addr)) ));
7680       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7681       DIP("%s %s,%s\n", opname,
7682                         dis_buf,
7683                         nameXMMReg(gregOfRM(rm)) );
7684       return delta+alen;
7685    }
7686 }
7687 
7688 
7689 /* SSE integer binary operation:
7690       G = G `op` E   (eLeft == False)
7691       G = E `op` G   (eLeft == True)
7692 */
7693 static UInt dis_SSEint_E_to_G(
7694                UChar sorb, Int delta,
7695                const HChar* opname, IROp op,
7696                Bool   eLeft
7697             )
7698 {
7699    HChar   dis_buf[50];
7700    Int     alen;
7701    IRTemp  addr;
7702    UChar   rm = getIByte(delta);
7703    IRExpr* gpart = getXMMReg(gregOfRM(rm));
7704    IRExpr* epart = NULL;
7705    if (epartIsReg(rm)) {
7706       epart = getXMMReg(eregOfRM(rm));
7707       DIP("%s %s,%s\n", opname,
7708                         nameXMMReg(eregOfRM(rm)),
7709                         nameXMMReg(gregOfRM(rm)) );
7710       delta += 1;
7711    } else {
7712       addr  = disAMode ( &alen, sorb, delta, dis_buf );
7713       epart = loadLE(Ity_V128, mkexpr(addr));
7714       DIP("%s %s,%s\n", opname,
7715                         dis_buf,
7716                         nameXMMReg(gregOfRM(rm)) );
7717       delta += alen;
7718    }
7719    putXMMReg( gregOfRM(rm),
7720               eLeft ? binop(op, epart, gpart)
7721                     : binop(op, gpart, epart) );
7722    return delta;
7723 }
7724 
7725 
7726 /* Helper for doing SSE FP comparisons. */
7727 
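/* The imm8 byte of CMPPS/CMPPD/CMPSS/CMPSD encodes, for values 0..3,
   the predicates EQ, LT, LE and UNORD; values 4..7 are their logical
   negations (NEQ, NLT, NLE, ORD).  Hence the handling below: for
   imm8 >= 4, select the base comparison and tell the caller (via
   *needNot) to invert its result. */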
7728 static void findSSECmpOp ( Bool* needNot, IROp* op,
7729                            Int imm8, Bool all_lanes, Int sz )
7730 {
7731    imm8 &= 7;
7732    *needNot = False;
7733    *op      = Iop_INVALID;
7734    if (imm8 >= 4) {
7735       *needNot = True;
7736       imm8 -= 4;
7737    }
7738 
7739    if (sz == 4 && all_lanes) {
7740       switch (imm8) {
7741          case 0: *op = Iop_CmpEQ32Fx4; return;
7742          case 1: *op = Iop_CmpLT32Fx4; return;
7743          case 2: *op = Iop_CmpLE32Fx4; return;
7744          case 3: *op = Iop_CmpUN32Fx4; return;
7745          default: break;
7746       }
7747    }
7748    if (sz == 4 && !all_lanes) {
7749       switch (imm8) {
7750          case 0: *op = Iop_CmpEQ32F0x4; return;
7751          case 1: *op = Iop_CmpLT32F0x4; return;
7752          case 2: *op = Iop_CmpLE32F0x4; return;
7753          case 3: *op = Iop_CmpUN32F0x4; return;
7754          default: break;
7755       }
7756    }
7757    if (sz == 8 && all_lanes) {
7758       switch (imm8) {
7759          case 0: *op = Iop_CmpEQ64Fx2; return;
7760          case 1: *op = Iop_CmpLT64Fx2; return;
7761          case 2: *op = Iop_CmpLE64Fx2; return;
7762          case 3: *op = Iop_CmpUN64Fx2; return;
7763          default: break;
7764       }
7765    }
7766    if (sz == 8 && !all_lanes) {
7767       switch (imm8) {
7768          case 0: *op = Iop_CmpEQ64F0x2; return;
7769          case 1: *op = Iop_CmpLT64F0x2; return;
7770          case 2: *op = Iop_CmpLE64F0x2; return;
7771          case 3: *op = Iop_CmpUN64F0x2; return;
7772          default: break;
7773       }
7774    }
7775    vpanic("findSSECmpOp(x86,guest)");
7776 }
7777 
7778 /* Handles SSE 32F/64F comparisons. */
7779 
7780 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
7781 				const HChar* opname, Bool all_lanes, Int sz )
7782 {
7783    HChar   dis_buf[50];
7784    Int     alen, imm8;
7785    IRTemp  addr;
7786    Bool    needNot = False;
7787    IROp    op      = Iop_INVALID;
7788    IRTemp  plain   = newTemp(Ity_V128);
7789    UChar   rm      = getIByte(delta);
7790    UShort  mask    = 0;
7791    vassert(sz == 4 || sz == 8);
7792    if (epartIsReg(rm)) {
7793       imm8 = getIByte(delta+1);
7794       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7795       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
7796                                getXMMReg(eregOfRM(rm))) );
7797       delta += 2;
7798       DIP("%s $%d,%s,%s\n", opname,
7799                             imm8,
7800                             nameXMMReg(eregOfRM(rm)),
7801                             nameXMMReg(gregOfRM(rm)) );
7802    } else {
7803       addr = disAMode ( &alen, sorb, delta, dis_buf );
7804       imm8 = getIByte(delta+alen);
7805       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7806       assign( plain,
7807               binop(
7808                  op,
7809                  getXMMReg(gregOfRM(rm)),
7810                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
7811                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
7812                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
7813              )
7814       );
7815       delta += alen+1;
7816       DIP("%s $%d,%s,%s\n", opname,
7817                             imm8,
7818                             dis_buf,
7819                             nameXMMReg(gregOfRM(rm)) );
7820    }
7821 
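   /* Write the result back, inverting it if required.  For the
      all-lanes case a full 128-bit NOT suffices.  For the scalar
      (lowest-lane-only) case only the low 32 or 64 bits may be
      inverted: mkV128 takes a 16-bit mask with one bit per byte of
      the 128-bit constant, so 0x000F/0x00FF give all-ones in just
      the low 4 or 8 bytes, and XORing with that flips only the low
      lane. */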
7822    if (needNot && all_lanes) {
7823       putXMMReg( gregOfRM(rm),
7824                  unop(Iop_NotV128, mkexpr(plain)) );
7825    }
7826    else
7827    if (needNot && !all_lanes) {
7828       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
7829       putXMMReg( gregOfRM(rm),
7830                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
7831    }
7832    else {
7833       putXMMReg( gregOfRM(rm), mkexpr(plain) );
7834    }
7835 
7836    return delta;
7837 }
7838 
7839 
7840 /* Vector by scalar shift of G by the amount specified at the bottom
7841    of E. */
7842 
7843 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
7844                                  const HChar* opname, IROp op )
7845 {
7846    HChar   dis_buf[50];
7847    Int     alen, size;
7848    IRTemp  addr;
7849    Bool    shl, shr, sar;
7850    UChar   rm   = getIByte(delta);
7851    IRTemp  g0   = newTemp(Ity_V128);
7852    IRTemp  g1   = newTemp(Ity_V128);
7853    IRTemp  amt  = newTemp(Ity_I32);
7854    IRTemp  amt8 = newTemp(Ity_I8);
7855    if (epartIsReg(rm)) {
7856       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
7857       DIP("%s %s,%s\n", opname,
7858                         nameXMMReg(eregOfRM(rm)),
7859                         nameXMMReg(gregOfRM(rm)) );
7860       delta++;
7861    } else {
7862       addr = disAMode ( &alen, sorb, delta, dis_buf );
7863       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
7864       DIP("%s %s,%s\n", opname,
7865                         dis_buf,
7866                         nameXMMReg(gregOfRM(rm)) );
7867       delta += alen;
7868    }
7869    assign( g0,   getXMMReg(gregOfRM(rm)) );
7870    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
7871 
7872    shl = shr = sar = False;
7873    size = 0;
7874    switch (op) {
7875       case Iop_ShlN16x8: shl = True; size = 16; break;
7876       case Iop_ShlN32x4: shl = True; size = 32; break;
7877       case Iop_ShlN64x2: shl = True; size = 64; break;
7878       case Iop_SarN16x8: sar = True; size = 16; break;
7879       case Iop_SarN32x4: sar = True; size = 32; break;
7880       case Iop_ShrN16x8: shr = True; size = 16; break;
7881       case Iop_ShrN32x4: shr = True; size = 32; break;
7882       case Iop_ShrN64x2: shr = True; size = 64; break;
7883       default: vassert(0);
7884    }
7885 
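   /* x86 defines vector shifts for amounts >= the lane size: logical
      shifts then produce zero, and arithmetic shifts produce the same
      result as a shift by lanesize-1 (every bit becomes a copy of the
      sign bit).  Since the IR shift ops are not guaranteed to accept
      out-of-range amounts, the ITEs below special-case amt >= size. */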
7886    if (shl || shr) {
7887      assign(
7888         g1,
7889         IRExpr_ITE(
7890            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
7891            binop(op, mkexpr(g0), mkexpr(amt8)),
7892            mkV128(0x0000)
7893         )
7894      );
7895    } else
7896    if (sar) {
7897      assign(
7898         g1,
7899         IRExpr_ITE(
7900            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
7901            binop(op, mkexpr(g0), mkexpr(amt8)),
7902            binop(op, mkexpr(g0), mkU8(size-1))
7903         )
7904      );
7905    } else {
7906       /*NOTREACHED*/
7907       vassert(0);
7908    }
7909 
7910    putXMMReg( gregOfRM(rm), mkexpr(g1) );
7911    return delta;
7912 }
7913 
7914 
7915 /* Vector by scalar shift of E by an immediate byte. */
7916 
7917 static
7918 UInt dis_SSE_shiftE_imm ( Int delta, const HChar* opname, IROp op )
7919 {
7920    Bool    shl, shr, sar;
7921    UChar   rm   = getIByte(delta);
7922    IRTemp  e0   = newTemp(Ity_V128);
7923    IRTemp  e1   = newTemp(Ity_V128);
7924    UChar   amt, size;
7925    vassert(epartIsReg(rm));
7926    vassert(gregOfRM(rm) == 2
7927            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
7928    amt = getIByte(delta+1);
7929    delta += 2;
7930    DIP("%s $%d,%s\n", opname,
7931                       (Int)amt,
7932                       nameXMMReg(eregOfRM(rm)) );
7933    assign( e0, getXMMReg(eregOfRM(rm)) );
7934 
7935    shl = shr = sar = False;
7936    size = 0;
7937    switch (op) {
7938       case Iop_ShlN16x8: shl = True; size = 16; break;
7939       case Iop_ShlN32x4: shl = True; size = 32; break;
7940       case Iop_ShlN64x2: shl = True; size = 64; break;
7941       case Iop_SarN16x8: sar = True; size = 16; break;
7942       case Iop_SarN32x4: sar = True; size = 32; break;
7943       case Iop_ShrN16x8: shr = True; size = 16; break;
7944       case Iop_ShrN32x4: shr = True; size = 32; break;
7945       case Iop_ShrN64x2: shr = True; size = 64; break;
7946       default: vassert(0);
7947    }
7948 
7949    if (shl || shr) {
7950       assign( e1, amt >= size
7951                      ? mkV128(0x0000)
7952                      : binop(op, mkexpr(e0), mkU8(amt))
7953       );
7954    } else
7955    if (sar) {
7956       assign( e1, amt >= size
7957                      ? binop(op, mkexpr(e0), mkU8(size-1))
7958                      : binop(op, mkexpr(e0), mkU8(amt))
7959       );
7960    } else {
7961       /*NOTREACHED*/
7962       vassert(0);
7963    }
7964 
7965    putXMMReg( eregOfRM(rm), mkexpr(e1) );
7966    return delta;
7967 }
7968 
7969 
7970 /* Get the current SSE rounding mode. */
7971 
7972 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
7973 {
7974    return binop( Iop_And32,
7975                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
7976                  mkU32(3) );
7977 }
7978 
7979 static void put_sse_roundingmode ( IRExpr* sseround )
7980 {
7981    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
7982    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
7983 }
7984 
7985 /* Break a 128-bit value up into four 32-bit ints. */
7986 
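/* Lane ordering: *t0 receives bits 31:0 (the least significant lane)
   and *t3 receives bits 127:96.  All four output temps must be passed
   in as IRTemp_INVALID; they are allocated here. */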
7987 static void breakup128to32s ( IRTemp t128,
7988 			      /*OUTs*/
7989                               IRTemp* t3, IRTemp* t2,
7990                               IRTemp* t1, IRTemp* t0 )
7991 {
7992    IRTemp hi64 = newTemp(Ity_I64);
7993    IRTemp lo64 = newTemp(Ity_I64);
7994    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
7995    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
7996 
7997    vassert(t0 && *t0 == IRTemp_INVALID);
7998    vassert(t1 && *t1 == IRTemp_INVALID);
7999    vassert(t2 && *t2 == IRTemp_INVALID);
8000    vassert(t3 && *t3 == IRTemp_INVALID);
8001 
8002    *t0 = newTemp(Ity_I32);
8003    *t1 = newTemp(Ity_I32);
8004    *t2 = newTemp(Ity_I32);
8005    *t3 = newTemp(Ity_I32);
8006    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
8007    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
8008    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
8009    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
8010 }
8011 
8012 /* Construct a 128-bit value from four 32-bit ints. */
8013 
8014 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
8015                               IRTemp t1, IRTemp t0 )
8016 {
8017    return
8018       binop( Iop_64HLtoV128,
8019              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
8020              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
8021    );
8022 }
8023 
8024 /* Break a 64-bit value up into four 16-bit ints. */
8025 
8026 static void breakup64to16s ( IRTemp t64,
8027                              /*OUTs*/
8028                              IRTemp* t3, IRTemp* t2,
8029                              IRTemp* t1, IRTemp* t0 )
8030 {
8031    IRTemp hi32 = newTemp(Ity_I32);
8032    IRTemp lo32 = newTemp(Ity_I32);
8033    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
8034    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
8035 
8036    vassert(t0 && *t0 == IRTemp_INVALID);
8037    vassert(t1 && *t1 == IRTemp_INVALID);
8038    vassert(t2 && *t2 == IRTemp_INVALID);
8039    vassert(t3 && *t3 == IRTemp_INVALID);
8040 
8041    *t0 = newTemp(Ity_I16);
8042    *t1 = newTemp(Ity_I16);
8043    *t2 = newTemp(Ity_I16);
8044    *t3 = newTemp(Ity_I16);
8045    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
8046    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
8047    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
8048    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
8049 }
8050 
8051 /* Construct a 64-bit value from four 16-bit ints. */
8052 
8053 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
8054                              IRTemp t1, IRTemp t0 )
8055 {
8056    return
8057       binop( Iop_32HLto64,
8058              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
8059              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
8060    );
8061 }
8062 
8063 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
8064    in the given 32-bit temporary.  The flags that are set are: O S Z A
8065    C P D ID AC.
8066 
8067    In all cases, code to set AC is generated.  However, VEX actually
8068    ignores the AC value and so can optionally emit an emulation
8069    warning when it is enabled.  In this routine, an emulation warning
8070    is only emitted if emit_AC_emwarn is True, in which case
8071    next_insn_EIP must be correct (this allows for correct code
8072    generation for popfl/popfw).  If emit_AC_emwarn is False,
8073    next_insn_EIP is unimportant (this allows for easy if kludgey code
8074    generation for IRET.) */
8075 
8076 static
8077 void set_EFLAGS_from_value ( IRTemp t1,
8078                              Bool   emit_AC_emwarn,
8079                              Addr32 next_insn_EIP )
8080 {
8081    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
8082 
8083    /* t1 is the flag word.  Mask out everything except OSZACP and set
8084       the flags thunk to X86G_CC_OP_COPY. */
8085    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
8086    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
8087    stmt( IRStmt_Put( OFFB_CC_DEP1,
8088                      binop(Iop_And32,
8089                            mkexpr(t1),
8090                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
8091                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
8092                                   | X86G_CC_MASK_S | X86G_CC_MASK_O )
8093                           )
8094                     )
8095        );
8096    /* Set NDEP even though it isn't used.  This makes redundant-PUT
8097       elimination of previous stores to this field work better. */
8098    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
8099 
8100    /* Also need to set the D flag, which is held in bit 10 of t1.
8101       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
8102    stmt( IRStmt_Put(
8103             OFFB_DFLAG,
8104             IRExpr_ITE(
8105                unop(Iop_32to1,
8106                     binop(Iop_And32,
8107                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
8108                           mkU32(1))),
8109                mkU32(0xFFFFFFFF),
8110                mkU32(1)))
8111        );
8112 
8113    /* Set the ID flag */
8114    stmt( IRStmt_Put(
8115             OFFB_IDFLAG,
8116             IRExpr_ITE(
8117                unop(Iop_32to1,
8118                     binop(Iop_And32,
8119                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
8120                           mkU32(1))),
8121                mkU32(1),
8122                mkU32(0)))
8123        );
8124 
8125    /* And set the AC flag.  If setting it to 1, possibly emit an
8126       emulation warning. */
8127    stmt( IRStmt_Put(
8128             OFFB_ACFLAG,
8129             IRExpr_ITE(
8130                unop(Iop_32to1,
8131                     binop(Iop_And32,
8132                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
8133                           mkU32(1))),
8134                mkU32(1),
8135                mkU32(0)))
8136        );
8137 
8138    if (emit_AC_emwarn) {
8139       put_emwarn( mkU32(EmWarn_X86_acFlag) );
8140       stmt(
8141          IRStmt_Exit(
8142             binop( Iop_CmpNE32,
8143                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
8144                    mkU32(0) ),
8145             Ijk_EmWarn,
8146             IRConst_U32( next_insn_EIP ),
8147             OFFB_EIP
8148          )
8149       );
8150    }
8151 }
8152 
8153 
8154 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
8155    values (aa,bb), computes, for each of the 4 16-bit lanes:
8156 
8157    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
8158 */
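/* For example, viewing the lanes as Q15 fixed point: aa_lane = 0x4000
   (0.5) and bb_lane = 0x2000 (0.25) give a 32-bit product of
   0x08000000; >>u 14 gives 0x2000, adding 1 gives 0x2001, and the
   final >>u 1 gives 0x1000 (0.125), i.e. the correctly rounded high
   half of the fixed-point product. */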
8159 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
8160 {
8161    IRTemp aa      = newTemp(Ity_I64);
8162    IRTemp bb      = newTemp(Ity_I64);
8163    IRTemp aahi32s = newTemp(Ity_I64);
8164    IRTemp aalo32s = newTemp(Ity_I64);
8165    IRTemp bbhi32s = newTemp(Ity_I64);
8166    IRTemp bblo32s = newTemp(Ity_I64);
8167    IRTemp rHi     = newTemp(Ity_I64);
8168    IRTemp rLo     = newTemp(Ity_I64);
8169    IRTemp one32x2 = newTemp(Ity_I64);
8170    assign(aa, aax);
8171    assign(bb, bbx);
8172    assign( aahi32s,
8173            binop(Iop_SarN32x2,
8174                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
8175                  mkU8(16) ));
8176    assign( aalo32s,
8177            binop(Iop_SarN32x2,
8178                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
8179                  mkU8(16) ));
8180    assign( bbhi32s,
8181            binop(Iop_SarN32x2,
8182                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
8183                  mkU8(16) ));
8184    assign( bblo32s,
8185            binop(Iop_SarN32x2,
8186                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
8187                  mkU8(16) ));
8188    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
8189    assign(
8190       rHi,
8191       binop(
8192          Iop_ShrN32x2,
8193          binop(
8194             Iop_Add32x2,
8195             binop(
8196                Iop_ShrN32x2,
8197                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
8198                mkU8(14)
8199             ),
8200             mkexpr(one32x2)
8201          ),
8202          mkU8(1)
8203       )
8204    );
8205    assign(
8206       rLo,
8207       binop(
8208          Iop_ShrN32x2,
8209          binop(
8210             Iop_Add32x2,
8211             binop(
8212                Iop_ShrN32x2,
8213                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
8214                mkU8(14)
8215             ),
8216             mkexpr(one32x2)
8217          ),
8218          mkU8(1)
8219       )
8220    );
8221    return
8222       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
8223 }
8224 
8225 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
8226    values (aa,bb), computes, for each lane:
8227 
8228           if aa_lane < 0 then - bb_lane
8229      else if aa_lane > 0 then bb_lane
8230      else 0
8231 */
8232 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
8233 {
8234    IRTemp aa       = newTemp(Ity_I64);
8235    IRTemp bb       = newTemp(Ity_I64);
8236    IRTemp zero     = newTemp(Ity_I64);
8237    IRTemp bbNeg    = newTemp(Ity_I64);
8238    IRTemp negMask  = newTemp(Ity_I64);
8239    IRTemp posMask  = newTemp(Ity_I64);
8240    IROp   opSub    = Iop_INVALID;
8241    IROp   opCmpGTS = Iop_INVALID;
8242 
8243    switch (laneszB) {
8244       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
8245       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
8246       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
8247       default: vassert(0);
8248    }
8249 
8250    assign( aa,      aax );
8251    assign( bb,      bbx );
8252    assign( zero,    mkU64(0) );
8253    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
8254    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
8255    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
8256 
8257    return
8258       binop(Iop_Or64,
8259             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
8260             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
8261 
8262 }
8263 
8264 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
8265    value aa, computes, for each lane
8266 
8267    if aa < 0 then -aa else aa
8268 
8269    Note that the result is interpreted as unsigned, so that the
8270    absolute value of the most negative signed input can be
8271    represented.
8272 */
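/* The implementation computes negMask by arithmetically shifting each
   lane right by (lane size - 1) bits, which replicates the sign bit
   across the lane: all-ones for negative lanes, all-zeroes otherwise.
   posMask is its complement, and the result is assembled by masking
   aa and -aa with them and ORing the two halves together. */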
8273 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
8274 {
8275    IRTemp aa      = newTemp(Ity_I64);
8276    IRTemp zero    = newTemp(Ity_I64);
8277    IRTemp aaNeg   = newTemp(Ity_I64);
8278    IRTemp negMask = newTemp(Ity_I64);
8279    IRTemp posMask = newTemp(Ity_I64);
8280    IROp   opSub   = Iop_INVALID;
8281    IROp   opSarN  = Iop_INVALID;
8282 
8283    switch (laneszB) {
8284       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
8285       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
8286       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
8287       default: vassert(0);
8288    }
8289 
8290    assign( aa,      aax );
8291    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
8292    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
8293    assign( zero,    mkU64(0) );
8294    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
8295    return
8296       binop(Iop_Or64,
8297             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
8298             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
8299 }
8300 
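/* Helper for PALIGNR: returns the 64-bit field obtained by shifting
   the 128-bit concatenation hi64:lo64 right by byteShift bytes and
   keeping the low 64 bits, i.e.
   (hi64 << (8*(8-byteShift))) | (lo64 >> (8*byteShift)).  Only shifts
   of 1..7 bytes are handled here; presumably the 0- and 8-byte cases
   are dealt with separately by the caller. */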
8301 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
8302                                         IRTemp lo64, Int byteShift )
8303 {
8304    vassert(byteShift >= 1 && byteShift <= 7);
8305    return
8306       binop(Iop_Or64,
8307             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
8308             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
8309       );
8310 }
8311 
8312 /* Generate a SIGSEGV followed by a restart of the current instruction
8313    if effective_addr is not 16-aligned.  This is required behaviour
8314    for some SSE3 instructions and all 128-bit SSSE3 instructions.
8315    This assumes that guest_RIP_curr_instr is set correctly! */
8316 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
8317 {
8318    stmt(
8319       IRStmt_Exit(
8320          binop(Iop_CmpNE32,
8321                binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
8322                mkU32(0)),
8323          Ijk_SigSEGV,
8324          IRConst_U32(guest_EIP_curr_instr),
8325          OFFB_EIP
8326       )
8327    );
8328 }
8329 
8330 
8331 /* Helper for deciding whether a given insn (starting at the opcode
8332    byte) may validly be used with a LOCK prefix.  The following insns
8333    may be used with LOCK when their destination operand is in memory.
8334    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
8335 
8336    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
8337    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
8338    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
8339    SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
8340    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
8341    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
8342    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
8343 
8344    DEC        FE /1,  FF /1
8345    INC        FE /0,  FF /0
8346 
8347    NEG        F6 /3,  F7 /3
8348    NOT        F6 /2,  F7 /2
8349 
8350    XCHG       86, 87
8351 
8352    BTC        0F BB,  0F BA /7
8353    BTR        0F B3,  0F BA /6
8354    BTS        0F AB,  0F BA /5
8355 
8356    CMPXCHG    0F B0,  0F B1
8357    CMPXCHG8B  0F C7 /1
8358 
8359    XADD       0F C0,  0F C1
8360 
8361    ------------------------------
8362 
8363    80 /0  =  addb $imm8,  rm8
8364    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
8365    82 /0  =  addb $imm8,  rm8
8366    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
8367 
8368    00     =  addb r8,  rm8
8369    01     =  addl r32, rm32  and  addw r16, rm16
8370 
8371    Same for ADD OR ADC SBB AND SUB XOR
8372 
8373    FE /1  = dec rm8
8374    FF /1  = dec rm32  and  dec rm16
8375 
8376    FE /0  = inc rm8
8377    FF /0  = inc rm32  and  inc rm16
8378 
8379    F6 /3  = neg rm8
8380    F7 /3  = neg rm32  and  neg rm16
8381 
8382    F6 /2  = not rm8
8383    F7 /2  = not rm32  and  not rm16
8384 
8385    0F BB     = btcw r16, rm16    and  btcl r32, rm32
8386    0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
8387 
8388    Same for BTS, BTR
8389 */
8390 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
8391 {
8392    switch (opc[0]) {
8393       case 0x00: case 0x01: case 0x08: case 0x09:
8394       case 0x10: case 0x11: case 0x18: case 0x19:
8395       case 0x20: case 0x21: case 0x28: case 0x29:
8396       case 0x30: case 0x31:
8397          if (!epartIsReg(opc[1]))
8398             return True;
8399          break;
8400 
8401       case 0x80: case 0x81: case 0x82: case 0x83:
8402          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
8403              && !epartIsReg(opc[1]))
8404             return True;
8405          break;
8406 
8407       case 0xFE: case 0xFF:
8408          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
8409              && !epartIsReg(opc[1]))
8410             return True;
8411          break;
8412 
8413       case 0xF6: case 0xF7:
8414          if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
8415              && !epartIsReg(opc[1]))
8416             return True;
8417          break;
8418 
8419       case 0x86: case 0x87:
8420          if (!epartIsReg(opc[1]))
8421             return True;
8422          break;
8423 
8424       case 0x0F: {
8425          switch (opc[1]) {
8426             case 0xBB: case 0xB3: case 0xAB:
8427                if (!epartIsReg(opc[2]))
8428                   return True;
8429                break;
8430             case 0xBA:
8431                if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
8432                    && !epartIsReg(opc[2]))
8433                   return True;
8434                break;
8435             case 0xB0: case 0xB1:
8436                if (!epartIsReg(opc[2]))
8437                   return True;
8438                break;
8439             case 0xC7:
8440                if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
8441                   return True;
8442                break;
8443             case 0xC0: case 0xC1:
8444                if (!epartIsReg(opc[2]))
8445                   return True;
8446                break;
8447             default:
8448                break;
8449          } /* switch (opc[1]) */
8450          break;
8451       }
8452 
8453       default:
8454          break;
8455    } /* switch (opc[0]) */
8456 
8457    return False;
8458 }
8459 
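/* Byte-swap a 16- or 32-bit value held in t1, e.g. 0x11223344 becomes
   0x44332211, returning the result in a fresh temporary.  Used at
   least by the BSWAP translation. */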
8460 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
8461 {
8462    IRTemp t2 = newTemp(ty);
8463    if (ty == Ity_I32) {
8464       assign( t2,
8465          binop(
8466             Iop_Or32,
8467             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
8468             binop(
8469                Iop_Or32,
8470                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
8471                                 mkU32(0x00FF0000)),
8472                binop(Iop_Or32,
8473                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
8474                                       mkU32(0x0000FF00)),
8475                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
8476                                       mkU32(0x000000FF) )
8477             )))
8478       );
8479       return t2;
8480    }
8481    if (ty == Ity_I16) {
8482       assign(t2,
8483              binop(Iop_Or16,
8484                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
8485                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
8486       return t2;
8487    }
8488    vassert(0);
8489    /*NOTREACHED*/
8490    return IRTemp_INVALID;
8491 }
8492 
8493 /*------------------------------------------------------------*/
8494 /*--- Disassemble a single instruction                     ---*/
8495 /*------------------------------------------------------------*/
8496 
8497 /* Disassemble a single instruction into IR.  The instruction is
8498    located in host memory at &guest_code[delta].  *expect_CAS is set
8499    to True if the resulting IR is expected to contain an IRCAS
8500    statement, and False if it's not expected to.  This makes it
8501    possible for the caller of disInstr_X86_WRK to check that
8502    LOCK-prefixed instructions are at least plausibly translated, in
8503    that it becomes possible to check that a (validly) LOCK-prefixed
8504    instruction generates a translation containing an IRCAS, and
8505    instructions without LOCK prefixes don't generate translations
8506    containing an IRCAS.
8507 */
8508 static
8509 DisResult disInstr_X86_WRK (
8510              /*OUT*/Bool* expect_CAS,
8511              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
8512              Bool         resteerCisOk,
8513              void*        callback_opaque,
8514              Long         delta64,
8515              const VexArchInfo* archinfo,
8516              const VexAbiInfo*  vbi,
8517              Bool         sigill_diag
8518           )
8519 {
8520    IRType    ty;
8521    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
8522    Int       alen;
8523    UChar     opc, modrm, abyte, pre;
8524    UInt      d32;
8525    HChar     dis_buf[50];
8526    Int       am_sz, d_sz, n_prefixes;
8527    DisResult dres;
8528    const UChar* insn; /* used in SSE decoders */
8529    Bool      has_66_pfx = False;
8530 
8531    /* The running delta */
8532    Int delta = (Int)delta64;
8533 
8534    /* Holds eip at the start of the insn, so that we can print
8535       consistent error messages for unimplemented insns. */
8536    Int delta_start = delta;
8537 
8538    /* We keep using sz in order to avoid changing a lot of code for no
8539     * gain; sz always equals current_sz_data.
8540     */
8541    Int sz;
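   /* Bit 0 of CR0 is PE (protection enable).  When it is set we
      assume 32-bit default operand and address sizes; when clear
      (real mode) the defaults are 16 bits. */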
8542    if (archinfo->x86_cr0 & 1) {
8543      sz = 4;
8544      current_sz_addr = 4;
8545      current_sz_data = 4;
8546      protected_mode = True;
8547    } else {
8548      sz = 2;
8549      current_sz_addr = 2;
8550      current_sz_data = 2;
8551      protected_mode = False;
8552    }
8553 
8554    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
8555       prefix has been seen, else one of {0x26, 0x36, 0x3E, 0x64, 0x65}
8556       indicating the prefix.  */
8557    UChar sorb = 0;
8558 
8559    /* Gets set to True if a LOCK prefix is seen. */
8560    Bool pfx_lock = False;
8561 
8562    /* Set result defaults. */
8563    dres.whatNext    = Dis_Continue;
8564    dres.len         = 0;
8565    dres.continueAt  = 0;
8566    dres.hint        = Dis_HintNone;
8567    dres.jk_StopHere = Ijk_INVALID;
8568 
8569    *expect_CAS = False;
8570 
8571    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
8572 
8573    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
8574    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
8575 
8576    /* Spot "Special" instructions (see comment at top of file). */
8577    {
8578       const UChar* code = guest_code + delta;
8579       /* Spot the 12-byte preamble:
8580          C1C703   roll $3,  %edi
8581          C1C70D   roll $13, %edi
8582          C1C71D   roll $29, %edi
8583          C1C713   roll $19, %edi
8584       */
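      /* The four rotate amounts sum to 64, so the preamble leaves
         %edi unchanged; it is simply an improbable byte sequence used
         as a marker. */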
8585       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
8586           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
8587           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
8588           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
8589          /* Got a "Special" instruction preamble.  Which one is it? */
8590          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
8591             /* %EDX = client_request ( %EAX ) */
8592             DIP("%%edx = client_request ( %%eax )\n");
8593             delta += 14;
8594             jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
8595             vassert(dres.whatNext == Dis_StopHere);
8596             goto decode_success;
8597          }
8598          else
8599          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
8600             /* %EAX = guest_NRADDR */
8601             DIP("%%eax = guest_NRADDR\n");
8602             delta += 14;
8603             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
8604             goto decode_success;
8605          }
8606          else
8607          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
8608             /* call-noredir *%EAX */
8609             DIP("call-noredir *%%eax\n");
8610             delta += 14;
8611             t1 = newTemp(Ity_I32);
8612             assign(t1, getIReg(4,R_EAX));
8613             t2 = newTemp(Ity_I32);
8614             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
8615             putIReg(4, R_ESP, mkexpr(t2));
8616             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
8617             jmp_treg(&dres, Ijk_NoRedir, t1);
8618             vassert(dres.whatNext == Dis_StopHere);
8619             goto decode_success;
8620          }
8621          else
8622          if (code[12] == 0x87 && code[13] == 0xFF /* xchgl %edi,%edi */) {
8623             /* IR injection */
8624             DIP("IR injection\n");
8625             vex_inject_ir(irsb, Iend_LE);
8626 
8627             // Invalidate the current insn. The reason is that the IRop we're
8628             // injecting here can change. In which case the translation has to
8629             // be redone. For ease of handling, we simply invalidate all the
8630             // time.
8631             stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr)));
8632             stmt(IRStmt_Put(OFFB_CMLEN,   mkU32(14)));
8633 
8634             delta += 14;
8635 
8636             stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
8637             dres.whatNext    = Dis_StopHere;
8638             dres.jk_StopHere = Ijk_InvalICache;
8639             goto decode_success;
8640          }
8641          /* We don't know what it is. */
8642          goto decode_failure;
8643          /*NOTREACHED*/
8644       }
8645    }
8646 
8647    /* Handle a couple of weird-ass NOPs that have been observed in the
8648       wild. */
8649    {
8650       const UChar* code = guest_code + delta;
8651       /* Sun's JVM 1.5.0 uses the following as a NOP:
8652          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
8653       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
8654           && code[3] == 0x65 && code[4] == 0x90) {
8655          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
8656          delta += 5;
8657          goto decode_success;
8658       }
8659       /* Don't barf on recent binutils padding,
8660          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
8661          66 2e 0f 1f 84 00 00 00 00 00
8662          66 66 2e 0f 1f 84 00 00 00 00 00
8663          66 66 66 2e 0f 1f 84 00 00 00 00 00
8664          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8665          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8666          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8667       */
8668       if (code[0] == 0x66) {
8669          Int data16_cnt;
8670          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
8671             if (code[data16_cnt] != 0x66)
8672                break;
8673          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
8674              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
8675              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
8676              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
8677              && code[data16_cnt + 8] == 0x00 ) {
8678             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
8679             delta += 9 + data16_cnt;
8680             goto decode_success;
8681          }
8682       }
8683    }
8684 
8685    /* Normal instruction handling starts here. */
8686 
8687    /* Deal with some but not all prefixes:
8688          66(oso)
8689          F0(lock)
8690          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
8691       Not dealt with (left in place):
8692          F2 F3
8693    */
8694    n_prefixes = 0;
8695    while (True) {
8696       if (n_prefixes > 7) goto decode_failure;
8697       pre = getUChar(delta);
8698       switch (pre) {
8699          case 0x66:
8700             has_66_pfx = True;
8701             if (protected_mode) {
8702                sz = 2;
8703                current_sz_data = 2;
8704             } else {
8705                sz = 4;
8706                current_sz_data = 4;
8707             }
8708             break;
8709          case 0x67:
8710             if (protected_mode) {
8711                current_sz_addr = 2;
8712             } else {
8713                current_sz_addr = 4;
8714             }
8715             break;
8716          case 0xF0:
8717             pfx_lock = True;
8718             *expect_CAS = True;
8719             break;
8720          case 0x3E: /* %DS: */
8721          case 0x26: /* %ES: */
8722          case 0x64: /* %FS: */
8723          case 0x65: /* %GS: */
8724          case 0x36: /* %SS: */
8725             if (sorb != 0)
8726                goto decode_failure; /* only one seg override allowed */
8727             sorb = pre;
8728             break;
8729          case 0x2E: { /* %CS: */
8730             /* 2E prefix on a conditional branch instruction is a
8731                branch-prediction hint, which can safely be ignored.  */
8732             UChar op1 = getIByte(delta+1);
8733             UChar op2 = getIByte(delta+2);
8734             if ((op1 >= 0x70 && op1 <= 0x7F)
8735                 || (op1 == 0xE3)
8736                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
8737                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
8738             } else {
8739               sorb = pre;
8740             }
8741             break;
8742          }
8743          default:
8744             goto not_a_prefix;
8745       }
8746       n_prefixes++;
8747       delta++;
8748    }
8749 
8750    not_a_prefix:
8751 
8752    /* Now we should be looking at the primary opcode byte or the
8753       leading F2 or F3.  Check that any LOCK prefix is actually
8754       allowed. */
8755 
8756    if (pfx_lock) {
8757      if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
8758          DIP("lock ");
8759       } else {
8760          *expect_CAS = False;
8761          goto decode_failure;
8762       }
8763    }
8764 
8765 
8766    /* ---------------------------------------------------- */
8767    /* --- The SSE decoder.                             --- */
8768    /* ---------------------------------------------------- */
8769 
8770    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
8771       previous life? */
8772 
8773    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
8774       later section, further on. */
8775 
8776    insn = &guest_code[delta];
8777 
8778    /* Treat fxsave specially.  It should be doable even on an SSE0
8779       (Pentium-II class) CPU.  Hence be prepared to handle it on
8780       any subarchitecture variant.
8781    */
8782 
8783    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
8784    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xAE
8785        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
8786       IRDirty* d;
8787       modrm = getIByte(delta+2);
8788       vassert(!has_66_pfx);
8789       vassert(!epartIsReg(modrm));
8790 
8791       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8792       delta += 2+alen;
8793       gen_SEGV_if_not_16_aligned(addr);
8794 
8795       DIP("fxsave %s\n", dis_buf);
8796 
8797       /* Uses dirty helper:
8798             void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
8799       d = unsafeIRDirty_0_N (
8800              0/*regparms*/,
8801              "x86g_dirtyhelper_FXSAVE",
8802              &x86g_dirtyhelper_FXSAVE,
8803              mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
8804           );
8805 
8806       /* declare we're writing memory */
8807       d->mFx   = Ifx_Write;
8808       d->mAddr = mkexpr(addr);
8809       d->mSize = 464; /* according to recent Intel docs */
8810 
8811       /* declare we're reading guest state */
8812       d->nFxState = 7;
8813       vex_bzero(&d->fxState, sizeof(d->fxState));
8814 
8815       d->fxState[0].fx     = Ifx_Read;
8816       d->fxState[0].offset = OFFB_FTOP;
8817       d->fxState[0].size   = sizeof(UInt);
8818 
8819       d->fxState[1].fx     = Ifx_Read;
8820       d->fxState[1].offset = OFFB_FPREGS;
8821       d->fxState[1].size   = 8 * sizeof(ULong);
8822 
8823       d->fxState[2].fx     = Ifx_Read;
8824       d->fxState[2].offset = OFFB_FPTAGS;
8825       d->fxState[2].size   = 8 * sizeof(UChar);
8826 
8827       d->fxState[3].fx     = Ifx_Read;
8828       d->fxState[3].offset = OFFB_FPROUND;
8829       d->fxState[3].size   = sizeof(UInt);
8830 
8831       d->fxState[4].fx     = Ifx_Read;
8832       d->fxState[4].offset = OFFB_FC3210;
8833       d->fxState[4].size   = sizeof(UInt);
8834 
8835       d->fxState[5].fx     = Ifx_Read;
8836       d->fxState[5].offset = OFFB_XMM0;
8837       d->fxState[5].size   = 8 * sizeof(U128);
8838 
8839       d->fxState[6].fx     = Ifx_Read;
8840       d->fxState[6].offset = OFFB_SSEROUND;
8841       d->fxState[6].size   = sizeof(UInt);
8842 
8843       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
         images are packed back-to-back.  If not, the value of
         d->fxState[5].size is wrong. */
8846       vassert(16 == sizeof(U128));
8847       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8848 
8849       stmt( IRStmt_Dirty(d) );
8850 
8851       goto decode_success;
8852    }
8853 
8854    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
8855    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xAE
8856        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
8857       IRDirty* d;
8858       modrm = getIByte(delta+2);
8859       vassert(!has_66_pfx);
8860       vassert(!epartIsReg(modrm));
8861 
8862       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8863       delta += 2+alen;
8864       gen_SEGV_if_not_16_aligned(addr);
8865 
8866       DIP("fxrstor %s\n", dis_buf);
8867 
8868       /* Uses dirty helper:
            VexEmNote x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, UInt )
8870          NOTE:
8871             the VexEmNote value is simply ignored (unlike for FRSTOR)
8872       */
8873       d = unsafeIRDirty_0_N (
8874              0/*regparms*/,
8875              "x86g_dirtyhelper_FXRSTOR",
8876              &x86g_dirtyhelper_FXRSTOR,
8877              mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
8878           );
8879 
8880       /* declare we're reading memory */
8881       d->mFx   = Ifx_Read;
8882       d->mAddr = mkexpr(addr);
8883       d->mSize = 464; /* according to recent Intel docs */
8884 
8885       /* declare we're writing guest state */
8886       d->nFxState = 7;
8887       vex_bzero(&d->fxState, sizeof(d->fxState));
8888 
8889       d->fxState[0].fx     = Ifx_Write;
8890       d->fxState[0].offset = OFFB_FTOP;
8891       d->fxState[0].size   = sizeof(UInt);
8892 
8893       d->fxState[1].fx     = Ifx_Write;
8894       d->fxState[1].offset = OFFB_FPREGS;
8895       d->fxState[1].size   = 8 * sizeof(ULong);
8896 
8897       d->fxState[2].fx     = Ifx_Write;
8898       d->fxState[2].offset = OFFB_FPTAGS;
8899       d->fxState[2].size   = 8 * sizeof(UChar);
8900 
8901       d->fxState[3].fx     = Ifx_Write;
8902       d->fxState[3].offset = OFFB_FPROUND;
8903       d->fxState[3].size   = sizeof(UInt);
8904 
8905       d->fxState[4].fx     = Ifx_Write;
8906       d->fxState[4].offset = OFFB_FC3210;
8907       d->fxState[4].size   = sizeof(UInt);
8908 
8909       d->fxState[5].fx     = Ifx_Write;
8910       d->fxState[5].offset = OFFB_XMM0;
8911       d->fxState[5].size   = 8 * sizeof(U128);
8912 
8913       d->fxState[6].fx     = Ifx_Write;
8914       d->fxState[6].offset = OFFB_SSEROUND;
8915       d->fxState[6].size   = sizeof(UInt);
8916 
8917       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
         images are packed back-to-back.  If not, the value of
         d->fxState[5].size is wrong. */
8920       vassert(16 == sizeof(U128));
8921       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8922 
8923       stmt( IRStmt_Dirty(d) );
8924 
8925       goto decode_success;
8926    }
8927 
8928    /* ------ SSE decoder main ------ */
8929 
8930    /* Skip parts of the decoder which don't apply given the stated
8931       guest subarchitecture. */
8932    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
8933       goto after_sse_decoders;
8934 
   /* With mmxext, only some of the extended MMX instructions are recognized.
8936       The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
8937       PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
8938       PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
8939 
8940       http://support.amd.com/us/Embedded_TechDocs/22466.pdf
8941       https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
8942 
8943    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
8944       goto mmxext;
8945 
8946    /* Otherwise we must be doing sse1 or sse2, so we can at least try
8947       for SSE1 here. */
8948 
8949    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
8950    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x58) {
8951       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
8952       goto decode_success;
8953    }
8954 
8955    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
8956    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
8957       vassert(!has_66_pfx);
8958       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
8959       goto decode_success;
8960    }
8961 
8962    /* 0F 55 = ANDNPS -- G = (not G) and E */
8963    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x55) {
8964       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
8965       goto decode_success;
8966    }
8967 
8968    /* 0F 54 = ANDPS -- G = G and E */
8969    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x54) {
8970       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
8971       goto decode_success;
8972    }
8973 
8974    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
8975    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xC2) {
8976       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
8977       goto decode_success;
8978    }
8979 
8980    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
8981    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
8982       vassert(!has_66_pfx);
8983       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
8984       goto decode_success;
8985    }
8986 
8987    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
8988    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
8989    if (!has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
8990       IRTemp argL = newTemp(Ity_F32);
8991       IRTemp argR = newTemp(Ity_F32);
8992       modrm = getIByte(delta+2);
8993       if (epartIsReg(modrm)) {
8994          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
8995          delta += 2+1;
8996          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8997                                   nameXMMReg(gregOfRM(modrm)) );
8998       } else {
8999          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
9001          delta += 2+alen;
9002          DIP("[u]comiss %s,%s\n", dis_buf,
9003                                   nameXMMReg(gregOfRM(modrm)) );
9004       }
9005       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
9006 
9007       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
9008       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
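      /* Iop_CmpF64's result encoding is arranged so that ANDing it
         with 0x45 leaves exactly the ZF (bit 6), PF (bit 2) and CF
         (bit 0) values that [U]COMISS defines; with CC_OP set to
         X86G_CC_OP_COPY, DEP1 is taken as the literal flags value, so
         the remaining flag bits end up zero. */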
9009       stmt( IRStmt_Put(
9010                OFFB_CC_DEP1,
9011                binop( Iop_And32,
9012                       binop(Iop_CmpF64,
9013                             unop(Iop_F32toF64,mkexpr(argL)),
9014                             unop(Iop_F32toF64,mkexpr(argR))),
9015                       mkU32(0x45)
9016           )));
9017       /* Set NDEP even though it isn't used.  This makes redundant-PUT
9018          elimination of previous stores to this field work better. */
9019       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
9020       goto decode_success;
9021    }
9022 
9023    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
9024       half xmm */
9025    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x2A) {
9026       IRTemp arg64 = newTemp(Ity_I64);
9027       IRTemp rmode = newTemp(Ity_I32);
9028       vassert(!has_66_pfx);
9029 
9030       modrm = getIByte(delta+2);
9031       if (epartIsReg(modrm)) {
         /* Only switch to MMX mode if the source is an MMX register.
9033             See comments on CVTPI2PD for details.  Fixes #357059. */
9034          do_MMX_preamble();
9035          assign( arg64, getMMXReg(eregOfRM(modrm)) );
9036          delta += 2+1;
9037          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9038                                  nameXMMReg(gregOfRM(modrm)));
9039       } else {
9040          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9042          delta += 2+alen;
9043          DIP("cvtpi2ps %s,%s\n", dis_buf,
9044                                  nameXMMReg(gregOfRM(modrm)) );
9045       }
9046 
9047       assign( rmode, get_sse_roundingmode() );
9048 
9049       putXMMRegLane32F(
9050          gregOfRM(modrm), 0,
9051          binop(Iop_F64toF32,
9052                mkexpr(rmode),
9053                unop(Iop_I32StoF64,
9054                     unop(Iop_64to32, mkexpr(arg64)) )) );
9055 
9056       putXMMRegLane32F(
9057          gregOfRM(modrm), 1,
9058          binop(Iop_F64toF32,
9059                mkexpr(rmode),
9060                unop(Iop_I32StoF64,
9061                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
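      /* Note that only lanes 0 and 1 of the destination are written;
         CVTPI2PS leaves the upper half of the XMM register unchanged. */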
9062 
9063       goto decode_success;
9064    }
9065 
9066    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
9067       quarter xmm */
9068    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
9069       IRTemp arg32 = newTemp(Ity_I32);
9070       IRTemp rmode = newTemp(Ity_I32);
9071       vassert(!has_66_pfx);
9072 
9073       modrm = getIByte(delta+3);
9074       if (epartIsReg(modrm)) {
9075          assign( arg32, getIReg(4, eregOfRM(modrm)) );
9076          delta += 3+1;
9077          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
9078                                  nameXMMReg(gregOfRM(modrm)));
9079       } else {
9080          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
9082          delta += 3+alen;
9083          DIP("cvtsi2ss %s,%s\n", dis_buf,
9084                                  nameXMMReg(gregOfRM(modrm)) );
9085       }
9086 
9087       assign( rmode, get_sse_roundingmode() );
9088 
9089       putXMMRegLane32F(
9090          gregOfRM(modrm), 0,
9091          binop(Iop_F64toF32,
9092                mkexpr(rmode),
9093                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
9094 
9095       goto decode_success;
9096    }
9097 
9098    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
9099       I32 in mmx, according to prevailing SSE rounding mode */
9100    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
9101       I32 in mmx, rounding towards zero */
9102    if (!has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
9103       IRTemp dst64  = newTemp(Ity_I64);
9104       IRTemp rmode  = newTemp(Ity_I32);
9105       IRTemp f32lo  = newTemp(Ity_F32);
9106       IRTemp f32hi  = newTemp(Ity_F32);
9107       Bool   r2zero = toBool(insn[1] == 0x2C);
9108 
9109       do_MMX_preamble();
9110       modrm = getIByte(delta+2);
9111 
9112       if (epartIsReg(modrm)) {
9113          delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
         assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
9116          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
9117                                    nameXMMReg(eregOfRM(modrm)),
9118                                    nameMMXReg(gregOfRM(modrm)));
9119       } else {
9120          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
9123                                               mkexpr(addr),
9124                                               mkU32(4) )));
9125          delta += 2+alen;
9126          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
9127                                    dis_buf,
9128                                    nameMMXReg(gregOfRM(modrm)));
9129       }
9130 
9131       if (r2zero) {
9132          assign(rmode, mkU32((UInt)Irrm_ZERO) );
9133       } else {
9134          assign( rmode, get_sse_roundingmode() );
9135       }
9136 
9137       assign(
9138          dst64,
9139          binop( Iop_32HLto64,
9140                 binop( Iop_F64toI32S,
9141                        mkexpr(rmode),
9142                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
9143                 binop( Iop_F64toI32S,
9144                        mkexpr(rmode),
9145                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
9146               )
9147       );
9148 
9149       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
9150       goto decode_success;
9151    }
9152 
9153    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
9154       I32 in ireg, according to prevailing SSE rounding mode */
9155    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
9156       I32 in ireg, rounding towards zero */
9157    if (insn[0] == 0xF3 && insn[1] == 0x0F
9158        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
9159       IRTemp rmode = newTemp(Ity_I32);
9160       IRTemp f32lo = newTemp(Ity_F32);
9161       Bool   r2zero = toBool(insn[2] == 0x2C);
9162       vassert(!has_66_pfx);
9163 
9164       modrm = getIByte(delta+3);
9165       if (epartIsReg(modrm)) {
9166          delta += 3+1;
         assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
9168          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
9169                                    nameXMMReg(eregOfRM(modrm)),
9170                                    nameIReg(4, gregOfRM(modrm)));
9171       } else {
9172          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
9174          delta += 3+alen;
9175          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
9176                                    dis_buf,
9177                                    nameIReg(4, gregOfRM(modrm)));
9178       }
9179 
9180       if (r2zero) {
9181          assign( rmode, mkU32((UInt)Irrm_ZERO) );
9182       } else {
9183          assign( rmode, get_sse_roundingmode() );
9184       }
9185 
9186       putIReg(4, gregOfRM(modrm),
9187                  binop( Iop_F64toI32S,
9188                         mkexpr(rmode),
9189                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
9190       );
9191 
9192       goto decode_success;
9193    }
9194 
9195    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
9196    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5E) {
9197       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
9198       goto decode_success;
9199    }
9200 
9201    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
9202    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
9203       vassert(!has_66_pfx);
9204       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
9205       goto decode_success;
9206    }
9207 
9208    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
9209    if (insn[0] == 0x0F && insn[1] == 0xAE
9210        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
9211 
9212       IRTemp t64 = newTemp(Ity_I64);
9213       IRTemp ew = newTemp(Ity_I32);
9214 
9215       modrm = getIByte(delta+2);
9216       vassert(!epartIsReg(modrm));
9217       vassert(!has_66_pfx);
9218 
9219       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9220       delta += 2+alen;
9221       DIP("ldmxcsr %s\n", dis_buf);
9222 
9223       /* The only thing we observe in %mxcsr is the rounding mode.
9224          Therefore, pass the 32-bit value (SSE native-format control
9225          word) to a clean helper, getting back a 64-bit value, the
9226          lower half of which is the SSEROUND value to store, and the
9227          upper half of which is the emulation-warning token which may
9228          be generated.
9229       */
      /* ULong x86g_check_ldmxcsr ( UInt ); */
9231       assign( t64, mkIRExprCCall(
9232                       Ity_I64, 0/*regparms*/,
9233                       "x86g_check_ldmxcsr",
9234                       &x86g_check_ldmxcsr,
9235                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
9236                    )
9237             );
9238 
9239       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
9240       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
9241       put_emwarn( mkexpr(ew) );
9242       /* Finally, if an emulation warning was reported, side-exit to
9243          the next insn, reporting the warning, so that Valgrind's
9244          dispatcher sees the warning. */
9245       stmt(
9246          IRStmt_Exit(
9247             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
9248             Ijk_EmWarn,
9249             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
9250             OFFB_EIP
9251          )
9252       );
9253       goto decode_success;
9254    }
9255 
9256 
   /* mmxext sse1 subset starts here. mmxext-only arches will parse
      only this subset of the sse1 instructions. */
9259   mmxext:
9260 
9261    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9262    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
9263    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF7) {
9264       Bool ok = False;
9265       delta = dis_MMX( &ok, sorb, sz, delta+1 );
9266       if (!ok)
9267          goto decode_failure;
9268       goto decode_success;
9269    }
9270 
9271    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9272    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
9273       Intel manual does not say anything about the usual business of
9274       the FP reg tags getting trashed whenever an MMX insn happens.
9275       So we just leave them alone.
9276    */
9277    if (insn[0] == 0x0F && insn[1] == 0xE7) {
9278       modrm = getIByte(delta+2);
9279       if (!has_66_pfx && !epartIsReg(modrm)) {
9280          /* do_MMX_preamble(); Intel docs don't specify this */
9281          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9282          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
9283          DIP("movntq %s,%s\n", dis_buf,
9284                                nameMMXReg(gregOfRM(modrm)));
9285          delta += 2+alen;
9286          goto decode_success;
9287       }
9288       /* else fall through */
9289    }
9290 
9291    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9292    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
9293    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE0) {
9294       do_MMX_preamble();
9295       delta = dis_MMXop_regmem_to_reg (
9296                 sorb, delta+2, insn[1], "pavgb", False );
9297       goto decode_success;
9298    }
9299 
9300    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9301    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
9302    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE3) {
9303       do_MMX_preamble();
9304       delta = dis_MMXop_regmem_to_reg (
9305                 sorb, delta+2, insn[1], "pavgw", False );
9306       goto decode_success;
9307    }
9308 
9309    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9310    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
9311       zero-extend of it in ireg(G). */
9312    if (insn[0] == 0x0F && insn[1] == 0xC5) {
9313       modrm = insn[2];
9314       if (!has_66_pfx && epartIsReg(modrm)) {
9315          IRTemp sV = newTemp(Ity_I64);
9316          t5 = newTemp(Ity_I16);
9317          do_MMX_preamble();
9318          assign(sV, getMMXReg(eregOfRM(modrm)));
9319          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
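         /* Only the low two bits of the immediate select a lane; t0 is
            the least significant 16-bit lane of the source. */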
9320          switch (insn[3] & 3) {
9321             case 0:  assign(t5, mkexpr(t0)); break;
9322             case 1:  assign(t5, mkexpr(t1)); break;
9323             case 2:  assign(t5, mkexpr(t2)); break;
9324             case 3:  assign(t5, mkexpr(t3)); break;
9325             default: vassert(0); /*NOTREACHED*/
9326          }
9327          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
9328          DIP("pextrw $%d,%s,%s\n",
9329              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
9330                            nameIReg(4,gregOfRM(modrm)));
9331          delta += 4;
9332          goto decode_success;
9333       }
9334       /* else fall through */
9335    }
9336 
9337    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9338    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
9339       put it into the specified lane of mmx(G). */
9340    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xC4) {
9341       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
9342          mmx reg.  t4 is the new lane value.  t5 is the original
9343          mmx value. t6 is the new mmx value. */
9344       Int lane;
9345       t4 = newTemp(Ity_I16);
9346       t5 = newTemp(Ity_I64);
9347       t6 = newTemp(Ity_I64);
9348       modrm = insn[2];
9349       do_MMX_preamble();
9350 
9351       assign(t5, getMMXReg(gregOfRM(modrm)));
9352       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
9353 
9354       if (epartIsReg(modrm)) {
9355          assign(t4, getIReg(2, eregOfRM(modrm)));
9356          delta += 3+1;
         lane = insn[3+1-1];   /* == insn[3], the imm8 following the ModRM byte */
9358          DIP("pinsrw $%d,%s,%s\n", lane,
9359                                    nameIReg(2,eregOfRM(modrm)),
9360                                    nameMMXReg(gregOfRM(modrm)));
9361       } else {
9362          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9363          delta += 3+alen;
9364          lane = insn[3+alen-1];
9365          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
9366          DIP("pinsrw $%d,%s,%s\n", lane,
9367                                    dis_buf,
9368                                    nameMMXReg(gregOfRM(modrm)));
9369       }
9370 
9371       switch (lane & 3) {
9372          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
9373          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
9374          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
9375          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
9376          default: vassert(0); /*NOTREACHED*/
9377       }
9378       putMMXReg(gregOfRM(modrm), mkexpr(t6));
9379       goto decode_success;
9380    }
9381 
9382    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9383    /* 0F EE = PMAXSW -- 16x4 signed max */
9384    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEE) {
9385       do_MMX_preamble();
9386       delta = dis_MMXop_regmem_to_reg (
9387                 sorb, delta+2, insn[1], "pmaxsw", False );
9388       goto decode_success;
9389    }
9390 
9391    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9392    /* 0F DE = PMAXUB -- 8x8 unsigned max */
9393    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDE) {
9394       do_MMX_preamble();
9395       delta = dis_MMXop_regmem_to_reg (
9396                 sorb, delta+2, insn[1], "pmaxub", False );
9397       goto decode_success;
9398    }
9399 
9400    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9401    /* 0F EA = PMINSW -- 16x4 signed min */
9402    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEA) {
9403       do_MMX_preamble();
9404       delta = dis_MMXop_regmem_to_reg (
9405                 sorb, delta+2, insn[1], "pminsw", False );
9406       goto decode_success;
9407    }
9408 
9409    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9410    /* 0F DA = PMINUB -- 8x8 unsigned min */
9411    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDA) {
9412       do_MMX_preamble();
9413       delta = dis_MMXop_regmem_to_reg (
9414                 sorb, delta+2, insn[1], "pminub", False );
9415       goto decode_success;
9416    }
9417 
9418    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9419    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
9420       mmx(E), turn them into a byte, and put zero-extend of it in
9421       ireg(G). */
9422    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD7) {
9423       modrm = insn[2];
9424       if (epartIsReg(modrm)) {
9425          do_MMX_preamble();
9426          t0 = newTemp(Ity_I64);
9427          t1 = newTemp(Ity_I32);
9428          assign(t0, getMMXReg(eregOfRM(modrm)));
9429          assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
9430          putIReg(4, gregOfRM(modrm), mkexpr(t1));
9431          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9432                                  nameIReg(4,gregOfRM(modrm)));
9433          delta += 3;
9434          goto decode_success;
9435       }
9436       /* else fall through */
9437    }
9438 
9439    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
9441    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE4) {
9442       do_MMX_preamble();
9443       delta = dis_MMXop_regmem_to_reg (
9444                 sorb, delta+2, insn[1], "pmuluh", False );
9445       goto decode_success;
9446    }
9447 
9448    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   /* 0F 18 /2 = PREFETCHT1 */
   /* 0F 18 /3 = PREFETCHT2 */
9452    if (insn[0] == 0x0F && insn[1] == 0x18
9453        && !epartIsReg(insn[2])
9454        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
9455       const HChar* hintstr = "??";
9456 
9457       modrm = getIByte(delta+2);
9458       vassert(!epartIsReg(modrm));
9459 
9460       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9461       delta += 2+alen;
9462 
9463       switch (gregOfRM(modrm)) {
9464          case 0: hintstr = "nta"; break;
9465          case 1: hintstr = "t0"; break;
9466          case 2: hintstr = "t1"; break;
9467          case 3: hintstr = "t2"; break;
9468          default: vassert(0); /*NOTREACHED*/
9469       }
9470 
9471       DIP("prefetch%s %s\n", hintstr, dis_buf);
9472       goto decode_success;
9473    }
9474 
9475    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
9476    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
9477    if (insn[0] == 0x0F && insn[1] == 0x0D
9478        && !epartIsReg(insn[2])
9479        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
9480       const HChar* hintstr = "??";
9481 
9482       modrm = getIByte(delta+2);
9483       vassert(!epartIsReg(modrm));
9484 
9485       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9486       delta += 2+alen;
9487 
9488       switch (gregOfRM(modrm)) {
9489          case 0: hintstr = ""; break;
9490          case 1: hintstr = "w"; break;
9491          default: vassert(0); /*NOTREACHED*/
9492       }
9493 
9494       DIP("prefetch%s %s\n", hintstr, dis_buf);
9495       goto decode_success;
9496    }
9497 
9498    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9499    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
9500    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF6) {
9501       do_MMX_preamble();
9502       delta = dis_MMXop_regmem_to_reg (
9503                  sorb, delta+2, insn[1], "psadbw", False );
9504       goto decode_success;
9505    }
9506 
9507    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9508    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
9509    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x70) {
9510       Int order;
9511       IRTemp sV, dV, s3, s2, s1, s0;
9512       s3 = s2 = s1 = s0 = IRTemp_INVALID;
9513       sV = newTemp(Ity_I64);
9514       dV = newTemp(Ity_I64);
9515       do_MMX_preamble();
9516       modrm = insn[2];
9517       if (epartIsReg(modrm)) {
9518          assign( sV, getMMXReg(eregOfRM(modrm)) );
9519          order = (Int)insn[3];
9520          delta += 2+2;
9521          DIP("pshufw $%d,%s,%s\n", order,
9522                                    nameMMXReg(eregOfRM(modrm)),
9523                                    nameMMXReg(gregOfRM(modrm)));
9524       } else {
9525          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9526          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         order = (Int)insn[2+alen];
9528          delta += 3+alen;
9529          DIP("pshufw $%d,%s,%s\n", order,
9530                                    dis_buf,
9531                                    nameMMXReg(gregOfRM(modrm)));
9532       }
9533       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
9534 
9535 #     define SEL(n) \
9536                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
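      /* Each 2-bit field of the immediate selects the source lane for
         one destination lane: bits 1:0 for lane 0, up to bits 7:6 for
         lane 3. */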
9537       assign(dV,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
9539                           SEL((order>>2)&3), SEL((order>>0)&3) )
9540       );
9541       putMMXReg(gregOfRM(modrm), mkexpr(dV));
9542 #     undef SEL
9543       goto decode_success;
9544    }
9545 
9546    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
9547    if (insn[0] == 0x0F && insn[1] == 0xAE
9548        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
9549       vassert(!has_66_pfx);
9550       delta += 3;
9551       /* Insert a memory fence.  It's sometimes important that these
9552          are carried through to the generated code. */
9553       stmt( IRStmt_MBE(Imbe_Fence) );
9554       DIP("sfence\n");
9555       goto decode_success;
9556    }
9557 
   /* End of mmxext sse1 subset. No more sse parsing for mmxext-only arches. */
9559    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
9560       goto after_sse_decoders;
9561 
9562 
9563    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
9564    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5F) {
9565       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
9566       goto decode_success;
9567    }
9568 
9569    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
9570    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
9571       vassert(!has_66_pfx);
9572       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
9573       goto decode_success;
9574    }
9575 
9576    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
9577    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5D) {
9578       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
9579       goto decode_success;
9580    }
9581 
9582    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
9583    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
9584       vassert(!has_66_pfx);
9585       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
9586       goto decode_success;
9587    }
9588 
9589    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
9590    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
9591    if (!has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
9592       modrm = getIByte(delta+2);
9593       if (epartIsReg(modrm)) {
9594          putXMMReg( gregOfRM(modrm),
9595                     getXMMReg( eregOfRM(modrm) ));
9596          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9597                                   nameXMMReg(gregOfRM(modrm)));
9598          delta += 2+1;
9599       } else {
9600          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9601          if (insn[1] == 0x28/*movaps*/)
9602             gen_SEGV_if_not_16_aligned( addr );
9603          putXMMReg( gregOfRM(modrm),
9604                     loadLE(Ity_V128, mkexpr(addr)) );
9605          DIP("mov[ua]ps %s,%s\n", dis_buf,
9606                                   nameXMMReg(gregOfRM(modrm)));
9607          delta += 2+alen;
9608       }
9609       goto decode_success;
9610    }
9611 
9612    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
9613    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
9614    if (!has_66_pfx && insn[0] == 0x0F
9615        && (insn[1] == 0x29 || insn[1] == 0x11)) {
9616       modrm = getIByte(delta+2);
9617       if (epartIsReg(modrm)) {
9618          /* fall through; awaiting test case */
9619       } else {
9620          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9621          if (insn[1] == 0x29/*movaps*/)
9622             gen_SEGV_if_not_16_aligned( addr );
9623          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
9624          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
9625                                   dis_buf );
9626          delta += 2+alen;
9627          goto decode_success;
9628       }
9629    }
9630 
9631    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
9632    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
9633    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x16) {
9634       modrm = getIByte(delta+2);
9635       if (epartIsReg(modrm)) {
9636          delta += 2+1;
9637          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
9638                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
9639          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9640                                nameXMMReg(gregOfRM(modrm)));
9641       } else {
9642          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9643          delta += 2+alen;
9644          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
9645                           loadLE(Ity_I64, mkexpr(addr)) );
9646          DIP("movhps %s,%s\n", dis_buf,
9647                                nameXMMReg( gregOfRM(modrm) ));
9648       }
9649       goto decode_success;
9650    }
9651 
9652    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
9653    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x17) {
9654       if (!epartIsReg(insn[2])) {
9655          delta += 2;
9656          addr = disAMode ( &alen, sorb, delta, dis_buf );
9657          delta += alen;
9658          storeLE( mkexpr(addr),
9659                   getXMMRegLane64( gregOfRM(insn[2]),
9660                                    1/*upper lane*/ ) );
9661          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
9662                                dis_buf);
9663          goto decode_success;
9664       }
9665       /* else fall through */
9666    }
9667 
9668    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
9670    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x12) {
9671       modrm = getIByte(delta+2);
9672       if (epartIsReg(modrm)) {
9673          delta += 2+1;
9674          putXMMRegLane64( gregOfRM(modrm),
9675                           0/*lower lane*/,
9676                           getXMMRegLane64( eregOfRM(modrm), 1 ));
9677          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
9678                                  nameXMMReg(gregOfRM(modrm)));
9679       } else {
9680          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9681          delta += 2+alen;
9682          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
9683                           loadLE(Ity_I64, mkexpr(addr)) );
9684          DIP("movlps %s, %s\n",
9685              dis_buf, nameXMMReg( gregOfRM(modrm) ));
9686       }
9687       goto decode_success;
9688    }
9689 
9690    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
9691    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x13) {
9692       if (!epartIsReg(insn[2])) {
9693          delta += 2;
9694          addr = disAMode ( &alen, sorb, delta, dis_buf );
9695          delta += alen;
9696          storeLE( mkexpr(addr),
9697                   getXMMRegLane64( gregOfRM(insn[2]),
9698                                    0/*lower lane*/ ) );
9699          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
9700                                 dis_buf);
9701          goto decode_success;
9702       }
9703       /* else fall through */
9704    }
9705 
   /* 0F 50 = MOVMSKPS -- move 4 sign bits from 4 x F32 in xmm(E)
      to 4 lowest bits of ireg(G) */
9708    if (insn[0] == 0x0F && insn[1] == 0x50) {
9709       modrm = getIByte(delta+2);
9710       if (!has_66_pfx && epartIsReg(modrm)) {
9711          Int src;
9712          t0 = newTemp(Ity_I32);
9713          t1 = newTemp(Ity_I32);
9714          t2 = newTemp(Ity_I32);
9715          t3 = newTemp(Ity_I32);
9716          delta += 2+1;
9717          src = eregOfRM(modrm);
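         /* For each lane, shift the sign bit (bit 31) down so that it
            lands at bit i of the result, isolate it, and OR the four
            single-bit values together. */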
9718          assign( t0, binop( Iop_And32,
9719                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
9720                             mkU32(1) ));
9721          assign( t1, binop( Iop_And32,
9722                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
9723                             mkU32(2) ));
9724          assign( t2, binop( Iop_And32,
9725                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
9726                             mkU32(4) ));
9727          assign( t3, binop( Iop_And32,
9728                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
9729                             mkU32(8) ));
9730          putIReg(4, gregOfRM(modrm),
9731                     binop(Iop_Or32,
9732                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
9733                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
9734                          )
9735                  );
9736          DIP("movmskps %s,%s\n", nameXMMReg(src),
9737                                  nameIReg(4, gregOfRM(modrm)));
9738          goto decode_success;
9739       }
9740       /* else fall through */
9741    }
9742 
9743    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
9744    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
9745    if (insn[0] == 0x0F && insn[1] == 0x2B) {
9746       modrm = getIByte(delta+2);
9747       if (!epartIsReg(modrm)) {
9748          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9749          gen_SEGV_if_not_16_aligned( addr );
9750          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
9751          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
9752                                  dis_buf,
9753                                  nameXMMReg(gregOfRM(modrm)));
9754          delta += 2+alen;
9755          goto decode_success;
9756       }
9757       /* else fall through */
9758    }
9759 
9760    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
9761       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
9762    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
9763       vassert(!has_66_pfx);
9764       modrm = getIByte(delta+3);
9765       if (epartIsReg(modrm)) {
9766          putXMMRegLane32( gregOfRM(modrm), 0,
9767                           getXMMRegLane32( eregOfRM(modrm), 0 ));
9768          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9769                               nameXMMReg(gregOfRM(modrm)));
9770          delta += 3+1;
9771       } else {
9772          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9773          /* zero bits 127:64 */
9774          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
9775          /* zero bits 63:32 */
9776          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
9777          /* write bits 31:0 */
9778          putXMMRegLane32( gregOfRM(modrm), 0,
9779                           loadLE(Ity_I32, mkexpr(addr)) );
9780          DIP("movss %s,%s\n", dis_buf,
9781                               nameXMMReg(gregOfRM(modrm)));
9782          delta += 3+alen;
9783       }
9784       goto decode_success;
9785    }
9786 
9787    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
9788       or lo 1/4 xmm). */
9789    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
9790       vassert(!has_66_pfx);
9791       modrm = getIByte(delta+3);
9792       if (epartIsReg(modrm)) {
9793          /* fall through, we don't yet have a test case */
9794       } else {
9795          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9796          storeLE( mkexpr(addr),
9797                   getXMMRegLane32(gregOfRM(modrm), 0) );
9798          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
9799                               dis_buf);
9800          delta += 3+alen;
9801          goto decode_success;
9802       }
9803    }
9804 
9805    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
9806    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x59) {
9807       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
9808       goto decode_success;
9809    }
9810 
9811    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
9812    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
9813       vassert(!has_66_pfx);
9814       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
9815       goto decode_success;
9816    }
9817 
   /* 0F 56 = ORPS -- G = G or E */
9819    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x56) {
9820       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
9821       goto decode_success;
9822    }
9823 
9824    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
9825    if (insn[0] == 0x0F && insn[1] == 0x53) {
9826       vassert(!has_66_pfx);
9827       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9828                                         "rcpps", Iop_RecipEst32Fx4 );
9829       goto decode_success;
9830    }
9831 
9832    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
9833    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
9834       vassert(!has_66_pfx);
9835       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9836                                          "rcpss", Iop_RecipEst32F0x4 );
9837       goto decode_success;
9838    }
9839 
9840    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
9841    if (insn[0] == 0x0F && insn[1] == 0x52) {
9842       vassert(!has_66_pfx);
9843       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9844                                         "rsqrtps", Iop_RSqrtEst32Fx4 );
9845       goto decode_success;
9846    }
9847 
9848    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
9849    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
9850       vassert(!has_66_pfx);
9851       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9852                                          "rsqrtss", Iop_RSqrtEst32F0x4 );
9853       goto decode_success;
9854    }
9855 
9856    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
9857    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xC6) {
9858       Int    select;
9859       IRTemp sV, dV;
9860       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9861       sV = newTemp(Ity_V128);
9862       dV = newTemp(Ity_V128);
9863       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9864       modrm = insn[2];
9865       assign( dV, getXMMReg(gregOfRM(modrm)) );
9866 
9867       if (epartIsReg(modrm)) {
9868          assign( sV, getXMMReg(eregOfRM(modrm)) );
9869          select = (Int)insn[3];
9870          delta += 2+2;
9871          DIP("shufps $%d,%s,%s\n", select,
9872                                    nameXMMReg(eregOfRM(modrm)),
9873                                    nameXMMReg(gregOfRM(modrm)));
9874       } else {
9875          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9876          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9877          select = (Int)insn[2+alen];
9878          delta += 3+alen;
9879          DIP("shufps $%d,%s,%s\n", select,
9880                                    dis_buf,
9881                                    nameXMMReg(gregOfRM(modrm)));
9882       }
9883 
9884       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9885       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9886 
9887 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
9888 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
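      /* Result lanes 0 and 1 are selected from the destination (SELD)
         and lanes 2 and 3 from the source (SELS), each according to a
         2-bit field of the immediate, as SHUFPS requires. */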
9889 
9890       putXMMReg(
9891          gregOfRM(modrm),
9892          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
9893                        SELD((select>>2)&3), SELD((select>>0)&3) )
9894       );
9895 
9896 #     undef SELD
9897 #     undef SELS
9898 
9899       goto decode_success;
9900    }
9901 
   /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
9903    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x51) {
9904       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9905                                         "sqrtps", Iop_Sqrt32Fx4 );
9906       goto decode_success;
9907    }
9908 
   /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
9910    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
9911       vassert(!has_66_pfx);
9912       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9913                                          "sqrtss", Iop_Sqrt32F0x4 );
9914       goto decode_success;
9915    }
9916 
9917    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
9918    if (insn[0] == 0x0F && insn[1] == 0xAE
9919        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
9920       modrm = getIByte(delta+2);
9921       vassert(!has_66_pfx);
9922       vassert(!epartIsReg(modrm));
9923 
9924       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9925       delta += 2+alen;
9926 
9927       /* Fake up a native SSE mxcsr word.  The only thing it depends
9928          on is SSEROUND[1:0], so call a clean helper to cook it up.
9929       */
      /* UInt x86g_create_mxcsr ( UInt sseround ) */
9931       DIP("stmxcsr %s\n", dis_buf);
9932       storeLE( mkexpr(addr),
9933                mkIRExprCCall(
9934                   Ity_I32, 0/*regp*/,
9935                   "x86g_create_mxcsr", &x86g_create_mxcsr,
9936                   mkIRExprVec_1( get_sse_roundingmode() )
9937                )
9938              );
9939       goto decode_success;
9940    }
9941 
9942    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
9943    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5C) {
9944       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
9945       goto decode_success;
9946    }
9947 
9948    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
9949    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
9950       vassert(!has_66_pfx);
9951       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
9952       goto decode_success;
9953    }
9954 
9955    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
9956    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
9957    /* These just appear to be special cases of SHUFPS */
9958    if (!has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
9959       IRTemp sV, dV;
9960       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9961       Bool hi = toBool(insn[1] == 0x15);
9962       sV = newTemp(Ity_V128);
9963       dV = newTemp(Ity_V128);
9964       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9965       modrm = insn[2];
9966       assign( dV, getXMMReg(gregOfRM(modrm)) );
9967 
9968       if (epartIsReg(modrm)) {
9969          assign( sV, getXMMReg(eregOfRM(modrm)) );
9970          delta += 2+1;
9971          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9972                                   nameXMMReg(eregOfRM(modrm)),
9973                                   nameXMMReg(gregOfRM(modrm)));
9974       } else {
9975          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9976          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9977          delta += 2+alen;
9978          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9979                                   dis_buf,
9980                                   nameXMMReg(gregOfRM(modrm)));
9981       }
9982 
9983       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9984       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9985 
9986       if (hi) {
9987          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
9988       } else {
9989          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
9990       }
9991 
9992       goto decode_success;
9993    }
9994 
   /* 0F 57 = XORPS -- G = G xor E */
9996    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x57) {
9997       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
9998       goto decode_success;
9999    }
10000 
10001    /* ---------------------------------------------------- */
10002    /* --- end of the SSE decoder.                      --- */
10003    /* ---------------------------------------------------- */
10004 
10005    /* ---------------------------------------------------- */
10006    /* --- start of the SSE2 decoder.                   --- */
10007    /* ---------------------------------------------------- */
10008 
10009    /* Skip parts of the decoder which don't apply given the stated
10010       guest subarchitecture. */
10011    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
10012       goto after_sse_decoders; /* no SSE2 capabilities */
10013 
10014    insn = &guest_code[delta];
10015 
   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
10017    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x58) {
10018       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
10019       goto decode_success;
10020    }
10021 
10022    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
10023    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
10024       vassert(!has_66_pfx);
10025       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
10026       goto decode_success;
10027    }
10028 
10029    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
10030    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x55) {
10031       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
10032       goto decode_success;
10033    }
10034 
10035    /* 66 0F 54 = ANDPD -- G = G and E */
10036    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x54) {
10037       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
10038       goto decode_success;
10039    }
10040 
10041    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
10042    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xC2) {
10043       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
10044       goto decode_success;
10045    }
10046 
10047    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
10048    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
10049       vassert(!has_66_pfx);
10050       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
10051       goto decode_success;
10052    }
10053 
10054    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
10055    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
10056    if (has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
10057       IRTemp argL = newTemp(Ity_F64);
10058       IRTemp argR = newTemp(Ity_F64);
10059       modrm = getIByte(delta+2);
10060       if (epartIsReg(modrm)) {
10061          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
10062          delta += 2+1;
10063          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10064                                   nameXMMReg(gregOfRM(modrm)) );
10065       } else {
10066          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
10068          delta += 2+alen;
10069          DIP("[u]comisd %s,%s\n", dis_buf,
10070                                   nameXMMReg(gregOfRM(modrm)) );
10071       }
10072       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
10073 
10074       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
10075       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
10076       stmt( IRStmt_Put(
10077                OFFB_CC_DEP1,
10078                binop( Iop_And32,
10079                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
10080                       mkU32(0x45)
10081           )));
10082       /* Set NDEP even though it isn't used.  This makes redundant-PUT
10083          elimination of previous stores to this field work better. */
10084       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
10085       goto decode_success;
10086    }
10087 
10088    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
10089       F64 in xmm(G) */
10090    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
10091       IRTemp arg64 = newTemp(Ity_I64);
10092       vassert(!has_66_pfx);
10093 
10094       modrm = getIByte(delta+3);
10095       if (epartIsReg(modrm)) {
10096          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
10097          delta += 3+1;
10098          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10099                                  nameXMMReg(gregOfRM(modrm)));
10100       } else {
10101          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
10103          delta += 3+alen;
10104          DIP("cvtdq2pd %s,%s\n", dis_buf,
10105                                  nameXMMReg(gregOfRM(modrm)) );
10106       }
10107 
10108       putXMMRegLane64F(
10109          gregOfRM(modrm), 0,
10110          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
10111       );
10112 
10113       putXMMRegLane64F(
10114          gregOfRM(modrm), 1,
10115          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
10116       );
10117 
10118       goto decode_success;
10119    }
10120 
10121    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
10122       xmm(G) */
10123    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5B) {
10124       IRTemp argV  = newTemp(Ity_V128);
10125       IRTemp rmode = newTemp(Ity_I32);
10126 
10127       modrm = getIByte(delta+2);
10128       if (epartIsReg(modrm)) {
10129          assign( argV, getXMMReg(eregOfRM(modrm)) );
10130          delta += 2+1;
10131          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10132                                  nameXMMReg(gregOfRM(modrm)));
10133       } else {
10134          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10136          delta += 2+alen;
10137          DIP("cvtdq2ps %s,%s\n", dis_buf,
10138                                  nameXMMReg(gregOfRM(modrm)) );
10139       }
10140 
10141       assign( rmode, get_sse_roundingmode() );
10142       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
10143 
10144 #     define CVT(_t)  binop( Iop_F64toF32,                    \
10145                              mkexpr(rmode),                   \
10146                              unop(Iop_I32StoF64,mkexpr(_t)))
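      /* Widen each I32 exactly to F64 first, then narrow to F32 under
         the current SSE rounding mode, since that final step can lose
         precision. */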
10147 
10148       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
10149       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
10150       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
10151       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
10152 
10153 #     undef CVT
10154 
10155       goto decode_success;
10156    }
10157 
10158    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
10159       lo half xmm(G), and zero upper half */
10160    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
10161       IRTemp argV  = newTemp(Ity_V128);
10162       IRTemp rmode = newTemp(Ity_I32);
10163       vassert(!has_66_pfx);
10164 
10165       modrm = getIByte(delta+3);
10166       if (epartIsReg(modrm)) {
10167          assign( argV, getXMMReg(eregOfRM(modrm)) );
10168          delta += 3+1;
10169          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10170                                  nameXMMReg(gregOfRM(modrm)));
10171       } else {
10172          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10174          delta += 3+alen;
10175          DIP("cvtpd2dq %s,%s\n", dis_buf,
10176                                  nameXMMReg(gregOfRM(modrm)) );
10177       }
10178 
10179       assign( rmode, get_sse_roundingmode() );
10180       t0 = newTemp(Ity_F64);
10181       t1 = newTemp(Ity_F64);
10182       assign( t0, unop(Iop_ReinterpI64asF64,
10183                        unop(Iop_V128to64, mkexpr(argV))) );
10184       assign( t1, unop(Iop_ReinterpI64asF64,
10185                        unop(Iop_V128HIto64, mkexpr(argV))) );
10186 
10187 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
10188                              mkexpr(rmode),                   \
10189                              mkexpr(_t) )
10190 
10191       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
10192       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
10193       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
10194       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
10195 
10196 #     undef CVT
10197 
10198       goto decode_success;
10199    }
10200 
10201    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
10202       I32 in mmx, according to prevailing SSE rounding mode */
10203    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
10204       I32 in mmx, rounding towards zero */
10205    if (has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
10206       IRTemp dst64  = newTemp(Ity_I64);
10207       IRTemp rmode  = newTemp(Ity_I32);
10208       IRTemp f64lo  = newTemp(Ity_F64);
10209       IRTemp f64hi  = newTemp(Ity_F64);
10210       Bool   r2zero = toBool(insn[1] == 0x2C);
10211 
10212       do_MMX_preamble();
10213       modrm = getIByte(delta+2);
10214 
10215       if (epartIsReg(modrm)) {
10216          delta += 2+1;
10217 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
10218 	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
10219          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
10220                                    nameXMMReg(eregOfRM(modrm)),
10221                                    nameMMXReg(gregOfRM(modrm)));
10222       } else {
10223          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10224 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
10225 	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
10226                                               mkexpr(addr),
10227                                               mkU32(8) )));
10228          delta += 2+alen;
10229          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
10230                                    dis_buf,
10231                                    nameMMXReg(gregOfRM(modrm)));
10232       }
10233 
10234       if (r2zero) {
10235          assign(rmode, mkU32((UInt)Irrm_ZERO) );
10236       } else {
10237          assign( rmode, get_sse_roundingmode() );
10238       }
10239 
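      /* Convert each F64 with the chosen rounding mode and pack the two
         I32 results into one 64-bit value, the high half coming from
         f64hi. */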
10240       assign(
10241          dst64,
10242          binop( Iop_32HLto64,
10243                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
10244                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
10245               )
10246       );
10247 
10248       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
10249       goto decode_success;
10250    }
10251 
10252    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
10253       lo half xmm(G), and zero upper half */
10254    /* Note, this is practically identical to CVTPD2DQ.  It would have
10255       been nicer to merge them together, but the insn[] offsets differ
10256       by one. */
10257    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5A) {
10258       IRTemp argV  = newTemp(Ity_V128);
10259       IRTemp rmode = newTemp(Ity_I32);
10260 
10261       modrm = getIByte(delta+2);
10262       if (epartIsReg(modrm)) {
10263          assign( argV, getXMMReg(eregOfRM(modrm)) );
10264          delta += 2+1;
10265          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10266                                  nameXMMReg(gregOfRM(modrm)));
10267       } else {
10268          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10269 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10270          delta += 2+alen;
10271          DIP("cvtpd2ps %s,%s\n", dis_buf,
10272                                  nameXMMReg(gregOfRM(modrm)) );
10273       }
10274 
10275       assign( rmode, get_sse_roundingmode() );
10276       t0 = newTemp(Ity_F64);
10277       t1 = newTemp(Ity_F64);
10278       assign( t0, unop(Iop_ReinterpI64asF64,
10279                        unop(Iop_V128to64, mkexpr(argV))) );
10280       assign( t1, unop(Iop_ReinterpI64asF64,
10281                        unop(Iop_V128HIto64, mkexpr(argV))) );
10282 
10283 #     define CVT(_t)  binop( Iop_F64toF32,                    \
10284                              mkexpr(rmode),                   \
10285                              mkexpr(_t) )
10286 
10287       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
10288       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
10289       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
10290       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
10291 
10292 #     undef CVT
10293 
10294       goto decode_success;
10295    }
10296 
10297    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
10298       xmm(G) */
10299    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x2A) {
10300       IRTemp arg64 = newTemp(Ity_I64);
10301 
10302       modrm = getIByte(delta+2);
10303       if (epartIsReg(modrm)) {
10304          /* Only switch to MMX mode if the source is a MMX register.
10305             This is inconsistent with all other instructions which
10306             convert between XMM and (M64 or MMX), which always switch
10307             to MMX mode even if 64-bit operand is M64 and not MMX.  At
10308             least, that's what the Intel docs seem to me to say.
10309             Fixes #210264. */
10310          do_MMX_preamble();
10311          assign( arg64, getMMXReg(eregOfRM(modrm)) );
10312          delta += 2+1;
10313          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10314                                  nameXMMReg(gregOfRM(modrm)));
10315       } else {
10316          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10317 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
10318          delta += 2+alen;
10319          DIP("cvtpi2pd %s,%s\n", dis_buf,
10320                                  nameXMMReg(gregOfRM(modrm)) );
10321       }
10322 
10323       putXMMRegLane64F(
10324          gregOfRM(modrm), 0,
10325          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
10326       );
10327 
10328       putXMMRegLane64F(
10329          gregOfRM(modrm), 1,
10330          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
10331       );
10332 
10333       goto decode_success;
10334    }
10335 
10336    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
10337       xmm(G) */
10338    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5B) {
10339       IRTemp argV  = newTemp(Ity_V128);
10340       IRTemp rmode = newTemp(Ity_I32);
10341 
10342       modrm = getIByte(delta+2);
10343       if (epartIsReg(modrm)) {
10344          assign( argV, getXMMReg(eregOfRM(modrm)) );
10345          delta += 2+1;
10346          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10347                                  nameXMMReg(gregOfRM(modrm)));
10348       } else {
10349          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10350 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10351          delta += 2+alen;
10352          DIP("cvtps2dq %s,%s\n", dis_buf,
10353                                  nameXMMReg(gregOfRM(modrm)) );
10354       }
10355 
10356       assign( rmode, get_sse_roundingmode() );
10357       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
10358 
10359       /* This is less than ideal.  If it turns out to be a performance
10360 	 bottleneck it can be improved. */
10361 #     define CVT(_t)                            \
10362         binop( Iop_F64toI32S,                   \
10363                mkexpr(rmode),                   \
10364                unop( Iop_F32toF64,              \
10365                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
10366 
10367       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
10368       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
10369       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
10370       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
10371 
10372 #     undef CVT
10373 
10374       goto decode_success;
10375    }
10376 
10377    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
10378       F64 in xmm(G). */
10379    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5A) {
10380       IRTemp f32lo = newTemp(Ity_F32);
10381       IRTemp f32hi = newTemp(Ity_F32);
10382 
10383       modrm = getIByte(delta+2);
10384       if (epartIsReg(modrm)) {
10385          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
10386          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
10387          delta += 2+1;
10388          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10389                                  nameXMMReg(gregOfRM(modrm)));
10390       } else {
10391          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10392 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
10393 	 assign( f32hi, loadLE(Ity_F32,
10394                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
10395          delta += 2+alen;
10396          DIP("cvtps2pd %s,%s\n", dis_buf,
10397                                  nameXMMReg(gregOfRM(modrm)) );
10398       }
10399 
10400       putXMMRegLane64F( gregOfRM(modrm), 1,
10401                         unop(Iop_F32toF64, mkexpr(f32hi)) );
10402       putXMMRegLane64F( gregOfRM(modrm), 0,
10403                         unop(Iop_F32toF64, mkexpr(f32lo)) );
10404 
10405       goto decode_success;
10406    }
10407 
10408    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
10409       I32 in ireg, according to prevailing SSE rounding mode */
10410    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
10411       I32 in ireg, rounding towards zero */
10412    if (insn[0] == 0xF2 && insn[1] == 0x0F
10413        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
10414       IRTemp rmode = newTemp(Ity_I32);
10415       IRTemp f64lo = newTemp(Ity_F64);
10416       Bool   r2zero = toBool(insn[2] == 0x2C);
10417       vassert(!has_66_pfx);
10418 
10419       modrm = getIByte(delta+3);
10420       if (epartIsReg(modrm)) {
10421          delta += 3+1;
10422 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
10423          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
10424                                    nameXMMReg(eregOfRM(modrm)),
10425                                    nameIReg(4, gregOfRM(modrm)));
10426       } else {
10427          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10428 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
10429          delta += 3+alen;
10430          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
10431                                    dis_buf,
10432                                    nameIReg(4, gregOfRM(modrm)));
10433       }
10434 
10435       if (r2zero) {
10436          assign( rmode, mkU32((UInt)Irrm_ZERO) );
10437       } else {
10438          assign( rmode, get_sse_roundingmode() );
10439       }
10440 
10441       putIReg(4, gregOfRM(modrm),
10442                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
10443 
10444       goto decode_success;
10445    }
10446 
10447    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
10448       low 1/4 xmm(G), according to prevailing SSE rounding mode */
10449    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
10450       IRTemp rmode = newTemp(Ity_I32);
10451       IRTemp f64lo = newTemp(Ity_F64);
10452       vassert(!has_66_pfx);
10453 
10454       modrm = getIByte(delta+3);
10455       if (epartIsReg(modrm)) {
10456          delta += 3+1;
10457 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
10458          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10459                                  nameXMMReg(gregOfRM(modrm)));
10460       } else {
10461          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10462 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
10463          delta += 3+alen;
10464          DIP("cvtsd2ss %s,%s\n", dis_buf,
10465                                  nameXMMReg(gregOfRM(modrm)));
10466       }
10467 
10468       assign( rmode, get_sse_roundingmode() );
10469       putXMMRegLane32F(
10470          gregOfRM(modrm), 0,
10471          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
10472       );
10473 
10474       goto decode_success;
10475    }
10476 
10477    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
10478       half xmm */
10479    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
10480       IRTemp arg32 = newTemp(Ity_I32);
10481       vassert(!has_66_pfx);
10482 
10483       modrm = getIByte(delta+3);
10484       if (epartIsReg(modrm)) {
10485          assign( arg32, getIReg(4, eregOfRM(modrm)) );
10486          delta += 3+1;
10487          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
10488                                  nameXMMReg(gregOfRM(modrm)));
10489       } else {
10490          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10491 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
10492          delta += 3+alen;
10493          DIP("cvtsi2sd %s,%s\n", dis_buf,
10494                                  nameXMMReg(gregOfRM(modrm)) );
10495       }
10496 
10497       putXMMRegLane64F(
10498          gregOfRM(modrm), 0,
10499          unop(Iop_I32StoF64, mkexpr(arg32)) );
10500 
10501       goto decode_success;
10502    }
10503 
10504    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
10505       low half xmm(G) */
10506    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
10507       IRTemp f32lo = newTemp(Ity_F32);
10508       vassert(!has_66_pfx);
10509 
10510       modrm = getIByte(delta+3);
10511       if (epartIsReg(modrm)) {
10512          delta += 3+1;
10513 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
10514          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10515                                  nameXMMReg(gregOfRM(modrm)));
10516       } else {
10517          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10518 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
10519          delta += 3+alen;
10520          DIP("cvtss2sd %s,%s\n", dis_buf,
10521                                  nameXMMReg(gregOfRM(modrm)));
10522       }
10523 
10524       putXMMRegLane64F( gregOfRM(modrm), 0,
10525                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
10526 
10527       goto decode_success;
10528    }
10529 
10530    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
10531       lo half xmm(G), and zero upper half, rounding towards zero */
10532    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE6) {
10533       IRTemp argV  = newTemp(Ity_V128);
10534       IRTemp rmode = newTemp(Ity_I32);
10535 
10536       modrm = getIByte(delta+2);
10537       if (epartIsReg(modrm)) {
10538          assign( argV, getXMMReg(eregOfRM(modrm)) );
10539          delta += 2+1;
10540          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10541                                   nameXMMReg(gregOfRM(modrm)));
10542       } else {
10543          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10544 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10545          delta += 2+alen;
10546          DIP("cvttpd2dq %s,%s\n", dis_buf,
10547                                   nameXMMReg(gregOfRM(modrm)) );
10548       }
10549 
10550       assign( rmode, mkU32((UInt)Irrm_ZERO) );
10551 
10552       t0 = newTemp(Ity_F64);
10553       t1 = newTemp(Ity_F64);
10554       assign( t0, unop(Iop_ReinterpI64asF64,
10555                        unop(Iop_V128to64, mkexpr(argV))) );
10556       assign( t1, unop(Iop_ReinterpI64asF64,
10557                        unop(Iop_V128HIto64, mkexpr(argV))) );
10558 
10559 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
10560                              mkexpr(rmode),                   \
10561                              mkexpr(_t) )
10562 
10563       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
10564       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
10565       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
10566       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
10567 
10568 #     undef CVT
10569 
10570       goto decode_success;
10571    }
10572 
10573    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
10574       xmm(G), rounding towards zero */
10575    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
10576       IRTemp argV  = newTemp(Ity_V128);
10577       IRTemp rmode = newTemp(Ity_I32);
10578       vassert(!has_66_pfx);
10579 
10580       modrm = getIByte(delta+3);
10581       if (epartIsReg(modrm)) {
10582          assign( argV, getXMMReg(eregOfRM(modrm)) );
10583          delta += 3+1;
10584          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10585                                   nameXMMReg(gregOfRM(modrm)));
10586       } else {
10587          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10588 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10589          delta += 3+alen;
10590          DIP("cvttps2dq %s,%s\n", dis_buf,
10591                                   nameXMMReg(gregOfRM(modrm)) );
10592       }
10593 
10594       assign( rmode, mkU32((UInt)Irrm_ZERO) );
10595       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
10596 
10597       /* This is less than ideal.  If it turns out to be a performance
10598 	 bottleneck it can be improved. */
10599 #     define CVT(_t)                            \
10600         binop( Iop_F64toI32S,                   \
10601                mkexpr(rmode),                   \
10602                unop( Iop_F32toF64,              \
10603                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
10604 
10605       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
10606       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
10607       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
10608       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
10609 
10610 #     undef CVT
10611 
10612       goto decode_success;
10613    }
10614 
10615    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
10616    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5E) {
10617       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
10618       goto decode_success;
10619    }
10620 
10621    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
10622    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
10623       vassert(!has_66_pfx);
10624       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
10625       goto decode_success;
10626    }
10627 
10628    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
10629    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
10630    if (insn[0] == 0x0F && insn[1] == 0xAE
10631        && epartIsReg(insn[2])
10632        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
10633       vassert(!has_66_pfx);
10634       delta += 3;
10635       /* Insert a memory fence.  It's sometimes important that these
10636          are carried through to the generated code. */
10637       stmt( IRStmt_MBE(Imbe_Fence) );
10638       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
10639       goto decode_success;
10640    }
10641 
10642    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
10643    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5F) {
10644       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
10645       goto decode_success;
10646    }
10647 
10648    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
10649    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
10650       vassert(!has_66_pfx);
10651       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
10652       goto decode_success;
10653    }
10654 
10655    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
10656    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5D) {
10657       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
10658       goto decode_success;
10659    }
10660 
10661    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
10662    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
10663       vassert(!has_66_pfx);
10664       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
10665       goto decode_success;
10666    }
10667 
10668    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
10669    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
10670    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
10671    if (has_66_pfx && insn[0] == 0x0F
10672        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
10673       const HChar* wot = insn[1]==0x28 ? "apd" :
10674                          insn[1]==0x10 ? "upd" : "dqa";
10675       modrm = getIByte(delta+2);
10676       if (epartIsReg(modrm)) {
10677          putXMMReg( gregOfRM(modrm),
10678                     getXMMReg( eregOfRM(modrm) ));
10679          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
10680                                    nameXMMReg(gregOfRM(modrm)));
10681          delta += 2+1;
10682       } else {
10683          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
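         /* Only the aligned forms (movapd, movdqa) fault on a
            misaligned address; movupd does not. */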
10684          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
10685             gen_SEGV_if_not_16_aligned( addr );
10686          putXMMReg( gregOfRM(modrm),
10687                     loadLE(Ity_V128, mkexpr(addr)) );
10688          DIP("mov%s %s,%s\n", wot, dis_buf,
10689                                    nameXMMReg(gregOfRM(modrm)));
10690          delta += 2+alen;
10691       }
10692       goto decode_success;
10693    }
10694 
10695    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
10696    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
10697    if (has_66_pfx && insn[0] == 0x0F
10698        && (insn[1] == 0x29 || insn[1] == 0x11)) {
10699       const HChar* wot = insn[1]==0x29 ? "apd" : "upd";
10700       modrm = getIByte(delta+2);
10701       if (epartIsReg(modrm)) {
10702          /* fall through; awaiting test case */
10703       } else {
10704          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10705          if (insn[1] == 0x29/*movapd*/)
10706             gen_SEGV_if_not_16_aligned( addr );
10707          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10708          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
10709                                    dis_buf );
10710          delta += 2+alen;
10711          goto decode_success;
10712       }
10713    }
10714 
10715    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
10716    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x6E) {
10717       modrm = getIByte(delta+2);
10718       if (epartIsReg(modrm)) {
10719          delta += 2+1;
10720          putXMMReg(
10721             gregOfRM(modrm),
10722             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
10723          );
10724          DIP("movd %s, %s\n",
10725              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
10726       } else {
10727          addr = disAMode( &alen, sorb, delta+2, dis_buf );
10728          delta += 2+alen;
10729          putXMMReg(
10730             gregOfRM(modrm),
10731             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
10732          );
10733          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
10734       }
10735       goto decode_success;
10736    }
10737 
10738    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
10739    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x7E) {
10740       modrm = getIByte(delta+2);
10741       if (epartIsReg(modrm)) {
10742          delta += 2+1;
10743          putIReg( 4, eregOfRM(modrm),
10744                   getXMMRegLane32(gregOfRM(modrm), 0) );
10745          DIP("movd %s, %s\n",
10746              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
10747       } else {
10748          addr = disAMode( &alen, sorb, delta+2, dis_buf );
10749          delta += 2+alen;
10750          storeLE( mkexpr(addr),
10751                   getXMMRegLane32(gregOfRM(modrm), 0) );
10752          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10753       }
10754       goto decode_success;
10755    }
10756 
10757    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
10758    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x7F) {
10759       modrm = getIByte(delta+2);
10760       if (epartIsReg(modrm)) {
10761          delta += 2+1;
10762          putXMMReg( eregOfRM(modrm),
10763                     getXMMReg(gregOfRM(modrm)) );
10764          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10765                                 nameXMMReg(eregOfRM(modrm)));
10766       } else {
10767          addr = disAMode( &alen, sorb, delta+2, dis_buf );
10768          delta += 2+alen;
10769          gen_SEGV_if_not_16_aligned( addr );
10770          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10771          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10772       }
10773       goto decode_success;
10774    }
10775 
10776    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
10777    /* Unfortunately can't simply use the MOVDQA case since the
10778       prefix lengths are different (66 vs F3) */
10779    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
10780       vassert(!has_66_pfx);
10781       modrm = getIByte(delta+3);
10782       if (epartIsReg(modrm)) {
10783          putXMMReg( gregOfRM(modrm),
10784                     getXMMReg( eregOfRM(modrm) ));
10785          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10786                                nameXMMReg(gregOfRM(modrm)));
10787          delta += 3+1;
10788       } else {
10789          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10790          putXMMReg( gregOfRM(modrm),
10791                     loadLE(Ity_V128, mkexpr(addr)) );
10792          DIP("movdqu %s,%s\n", dis_buf,
10793                                nameXMMReg(gregOfRM(modrm)));
10794          delta += 3+alen;
10795       }
10796       goto decode_success;
10797    }
10798 
10799    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
10800    /* Unfortunately can't simply use the MOVDQA case since the
10801       prefix lengths are different (66 vs F3) */
10802    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
10803       vassert(!has_66_pfx);
10804       modrm = getIByte(delta+3);
10805       if (epartIsReg(modrm)) {
10806          delta += 3+1;
10807          putXMMReg( eregOfRM(modrm),
10808                     getXMMReg(gregOfRM(modrm)) );
10809          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10810                                 nameXMMReg(eregOfRM(modrm)));
10811       } else {
10812          addr = disAMode( &alen, sorb, delta+3, dis_buf );
10813          delta += 3+alen;
10814          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10815          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10816       }
10817       goto decode_success;
10818    }
10819 
10820    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
10821    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
10822       vassert(!has_66_pfx);
10823       modrm = getIByte(delta+3);
10824       if (epartIsReg(modrm)) {
10825          do_MMX_preamble();
10826          putMMXReg( gregOfRM(modrm),
10827                     getXMMRegLane64( eregOfRM(modrm), 0 ));
10828          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10829                                 nameMMXReg(gregOfRM(modrm)));
10830          delta += 3+1;
10831          goto decode_success;
10832       } else {
10833          /* fall through, apparently no mem case for this insn */
10834       }
10835    }
10836 
10837    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
10838    /* This seems identical to MOVHPS.  This instruction encoding is
10839       completely crazy. */
10840    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x16) {
10841       modrm = getIByte(delta+2);
10842       if (epartIsReg(modrm)) {
10843          /* fall through; apparently reg-reg is not possible */
10844       } else {
10845          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10846          delta += 2+alen;
10847          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
10848                           loadLE(Ity_I64, mkexpr(addr)) );
10849          DIP("movhpd %s,%s\n", dis_buf,
10850                                nameXMMReg( gregOfRM(modrm) ));
10851          goto decode_success;
10852       }
10853    }
10854 
10855    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
10856    /* Again, this seems identical to MOVHPS. */
10857    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x17) {
10858       if (!epartIsReg(insn[2])) {
10859          delta += 2;
10860          addr = disAMode ( &alen, sorb, delta, dis_buf );
10861          delta += alen;
10862          storeLE( mkexpr(addr),
10863                   getXMMRegLane64( gregOfRM(insn[2]),
10864                                    1/*upper lane*/ ) );
10865          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
10866                                dis_buf);
10867          goto decode_success;
10868       }
10869       /* else fall through */
10870    }
10871 
10872    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
10873    /* Identical to MOVLPS ? */
10874    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x12) {
10875       modrm = getIByte(delta+2);
10876       if (epartIsReg(modrm)) {
10877          /* fall through; apparently reg-reg is not possible */
10878       } else {
10879          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10880          delta += 2+alen;
10881          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
10882                           loadLE(Ity_I64, mkexpr(addr)) );
10883          DIP("movlpd %s, %s\n",
10884              dis_buf, nameXMMReg( gregOfRM(modrm) ));
10885          goto decode_success;
10886       }
10887    }
10888 
10889    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
10890    /* Identical to MOVLPS ? */
10891    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x13) {
10892       if (!epartIsReg(insn[2])) {
10893          delta += 2;
10894          addr = disAMode ( &alen, sorb, delta, dis_buf );
10895          delta += alen;
10896          storeLE( mkexpr(addr),
10897                   getXMMRegLane64( gregOfRM(insn[2]),
10898                                    0/*lower lane*/ ) );
10899          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
10900                                 dis_buf);
10901          goto decode_success;
10902       }
10903       /* else fall through */
10904    }
10905 
10906    /* 66 0F 50 = MOVMSKPD -- move 2 sign bits from 2 x F64 in xmm(E) to
10907       2 lowest bits of ireg(G) */
10908    if (insn[0] == 0x0F && insn[1] == 0x50) {
10909       modrm = getIByte(delta+2);
10910       if (has_66_pfx && epartIsReg(modrm)) {
10911          Int src;
10912          t0 = newTemp(Ity_I32);
10913          t1 = newTemp(Ity_I32);
10914          delta += 2+1;
10915          src = eregOfRM(modrm);
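         /* Bit 31 of lane 1 is the sign of the low F64 and bit 31 of
            lane 3 is the sign of the high F64; shift them down to bit
            positions 0 and 1 respectively, then OR them together. */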
10916          assign( t0, binop( Iop_And32,
10917                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
10918                             mkU32(1) ));
10919          assign( t1, binop( Iop_And32,
10920                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
10921                             mkU32(2) ));
10922          putIReg(4, gregOfRM(modrm),
10923                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
10924                  );
10925          DIP("movmskpd %s,%s\n", nameXMMReg(src),
10926                                  nameIReg(4, gregOfRM(modrm)));
10927          goto decode_success;
10928       }
10929       /* else fall through */
10930    }
10931 
10932    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
10933    if (insn[0] == 0x0F && insn[1] == 0xF7) {
10934       modrm = getIByte(delta+2);
10935       if (has_66_pfx && epartIsReg(modrm)) {
10936          IRTemp regD    = newTemp(Ity_V128);
10937          IRTemp mask    = newTemp(Ity_V128);
10938          IRTemp olddata = newTemp(Ity_V128);
10939          IRTemp newdata = newTemp(Ity_V128);
10940                 addr    = newTemp(Ity_I32);
10941 
10942          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
10943          assign( regD, getXMMReg( gregOfRM(modrm) ));
10944 
10945          /* Unfortunately can't do the obvious thing with SarN8x16
10946             here since that can't be re-emitted as SSE2 code - no such
10947             insn. */
10948 	 assign(
10949             mask,
10950             binop(Iop_64HLtoV128,
10951                   binop(Iop_SarN8x8,
10952                         getXMMRegLane64( eregOfRM(modrm), 1 ),
10953                         mkU8(7) ),
10954                   binop(Iop_SarN8x8,
10955                         getXMMRegLane64( eregOfRM(modrm), 0 ),
10956                         mkU8(7) ) ));
10957          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
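         /* Byte-wise blend: where a mask byte is all ones (the source
            byte's MSB was set) take the byte from regD, otherwise keep
            the old memory byte:
            newdata = (regD & mask) | (olddata & ~mask). */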
10958          assign( newdata,
10959                  binop(Iop_OrV128,
10960                        binop(Iop_AndV128,
10961                              mkexpr(regD),
10962                              mkexpr(mask) ),
10963                        binop(Iop_AndV128,
10964                              mkexpr(olddata),
10965                              unop(Iop_NotV128, mkexpr(mask)))) );
10966          storeLE( mkexpr(addr), mkexpr(newdata) );
10967 
10968          delta += 2+1;
10969          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
10970                                    nameXMMReg( gregOfRM(modrm) ) );
10971          goto decode_success;
10972       }
10973       /* else fall through */
10974    }
10975 
10976    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
10977    if (insn[0] == 0x0F && insn[1] == 0xE7) {
10978       modrm = getIByte(delta+2);
10979       if (has_66_pfx && !epartIsReg(modrm)) {
10980          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10981          gen_SEGV_if_not_16_aligned( addr );
10982          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10983          DIP("movntdq %s,%s\n", dis_buf,
10984                                 nameXMMReg(gregOfRM(modrm)));
10985          delta += 2+alen;
10986          goto decode_success;
10987       }
10988       /* else fall through */
10989    }
10990 
10991    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
10992    if (insn[0] == 0x0F && insn[1] == 0xC3) {
10993       vassert(!has_66_pfx);
10994       modrm = getIByte(delta+2);
10995       if (!epartIsReg(modrm)) {
10996          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10997          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
10998          DIP("movnti %s,%s\n", dis_buf,
10999                                nameIReg(4, gregOfRM(modrm)));
11000          delta += 2+alen;
11001          goto decode_success;
11002       }
11003       /* else fall through */
11004    }
11005 
11006    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
11007       or lo half xmm).  */
11008    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD6) {
11009       modrm = getIByte(delta+2);
11010       if (epartIsReg(modrm)) {
11011          /* fall through, awaiting test case */
11012          /* dst: lo half copied, hi half zeroed */
11013       } else {
11014          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11015          storeLE( mkexpr(addr),
11016                   getXMMRegLane64( gregOfRM(modrm), 0 ));
11017          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
11018          delta += 2+alen;
11019          goto decode_success;
11020       }
11021    }
11022 
11023    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
11024       hi half). */
11025    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
11026       vassert(!has_66_pfx);
11027       modrm = getIByte(delta+3);
11028       if (epartIsReg(modrm)) {
11029          do_MMX_preamble();
11030          putXMMReg( gregOfRM(modrm),
11031                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
11032          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
11033                                 nameXMMReg(gregOfRM(modrm)));
11034          delta += 3+1;
11035          goto decode_success;
11036       } else {
11037          /* fall through, apparently no mem case for this insn */
11038       }
11039    }
11040 
11041    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
11042       G (lo half xmm).  Upper half of G is zeroed out. */
11043    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
11044       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
11045       If E is reg, upper half of G is unchanged. */
11046    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
11047        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
11048       vassert(!has_66_pfx);
11049       modrm = getIByte(delta+3);
11050       if (epartIsReg(modrm)) {
11051          putXMMRegLane64( gregOfRM(modrm), 0,
11052                           getXMMRegLane64( eregOfRM(modrm), 0 ));
11053          if (insn[0] == 0xF3/*MOVQ*/) {
11054             /* zero bits 127:64 */
11055             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
11056          }
11057          DIP("%s %s,%s\n", insn[0]==0xF3 ? "movq" : "movsd",
11058              nameXMMReg(eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
11059          delta += 3+1;
11060       } else {
11061          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11062          /* zero bits 127:64 */
11063          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
11064          /* write bits 63:0 */
11065          putXMMRegLane64( gregOfRM(modrm), 0,
11066                           loadLE(Ity_I64, mkexpr(addr)) );
11067          DIP("%s %s,%s\n", insn[0]==0xF3 ? "movq" : "movsd",
11068              dis_buf, nameXMMReg(gregOfRM(modrm)));
11069          delta += 3+alen;
11070       }
11071       goto decode_success;
11072    }
11073 
11074    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
11075       or lo half xmm). */
11076    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
11077       vassert(!has_66_pfx);
11078       modrm = getIByte(delta+3);
11079       if (epartIsReg(modrm)) {
11080          putXMMRegLane64( eregOfRM(modrm), 0,
11081                           getXMMRegLane64( gregOfRM(modrm), 0 ));
11082          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
11083                               nameXMMReg(eregOfRM(modrm)));
11084          delta += 3+1;
11085       } else {
11086          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11087          storeLE( mkexpr(addr),
11088                   getXMMRegLane64(gregOfRM(modrm), 0) );
11089          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
11090                               dis_buf);
11091          delta += 3+alen;
11092       }
11093       goto decode_success;
11094    }
11095 
11096    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
11097    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x59) {
11098       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
11099       goto decode_success;
11100    }
11101 
11102    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
11103    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
11104       vassert(!has_66_pfx);
11105       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
11106       goto decode_success;
11107    }
11108 
11109    /* 66 0F 56 = ORPD -- G = G or E */
11110    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x56) {
11111       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
11112       goto decode_success;
11113    }
11114 
11115    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
11116    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xC6) {
11117       Int    select;
11118       IRTemp sV = newTemp(Ity_V128);
11119       IRTemp dV = newTemp(Ity_V128);
11120       IRTemp s1 = newTemp(Ity_I64);
11121       IRTemp s0 = newTemp(Ity_I64);
11122       IRTemp d1 = newTemp(Ity_I64);
11123       IRTemp d0 = newTemp(Ity_I64);
11124 
11125       modrm = insn[2];
11126       assign( dV, getXMMReg(gregOfRM(modrm)) );
11127 
11128       if (epartIsReg(modrm)) {
11129          assign( sV, getXMMReg(eregOfRM(modrm)) );
11130          select = (Int)insn[3];
11131          delta += 2+2;
11132          DIP("shufpd $%d,%s,%s\n", select,
11133                                    nameXMMReg(eregOfRM(modrm)),
11134                                    nameXMMReg(gregOfRM(modrm)));
11135       } else {
11136          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11137          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11138          select = (Int)insn[2+alen];
11139          delta += 3+alen;
11140          DIP("shufpd $%d,%s,%s\n", select,
11141                                    dis_buf,
11142                                    nameXMMReg(gregOfRM(modrm)));
11143       }
11144 
11145       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11146       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
11147       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11148       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
11149 
11150 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
11151 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
11152 
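      /* The low result lane is selected from dV by imm bit 0; the high
         result lane is selected from sV by imm bit 1. */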
11153       putXMMReg(
11154          gregOfRM(modrm),
11155          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
11156       );
11157 
11158 #     undef SELD
11159 #     undef SELS
11160 
11161       goto decode_success;
11162    }
11163 
11164    /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
11165    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x51) {
11166       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
11167                                         "sqrtpd", Iop_Sqrt64Fx2 );
11168       goto decode_success;
11169    }
11170 
11171    /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
11172    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
11173       vassert(!has_66_pfx);
11174       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
11175                                          "sqrtsd", Iop_Sqrt64F0x2 );
11176       goto decode_success;
11177    }
11178 
11179    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
11180    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x5C) {
11181       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
11182       goto decode_success;
11183    }
11184 
11185    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
11186    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
11187       vassert(!has_66_pfx);
11188       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
11189       goto decode_success;
11190    }
11191 
11192    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
11193    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
11194    /* These just appear to be special cases of SHUFPD */
11195    if (has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
11196       IRTemp s1 = newTemp(Ity_I64);
11197       IRTemp s0 = newTemp(Ity_I64);
11198       IRTemp d1 = newTemp(Ity_I64);
11199       IRTemp d0 = newTemp(Ity_I64);
11200       IRTemp sV = newTemp(Ity_V128);
11201       IRTemp dV = newTemp(Ity_V128);
11202       Bool   hi = toBool(insn[1] == 0x15);
11203 
11204       modrm = insn[2];
11205       assign( dV, getXMMReg(gregOfRM(modrm)) );
11206 
11207       if (epartIsReg(modrm)) {
11208          assign( sV, getXMMReg(eregOfRM(modrm)) );
11209          delta += 2+1;
11210          DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
11211                                   nameXMMReg(eregOfRM(modrm)),
11212                                   nameXMMReg(gregOfRM(modrm)));
11213       } else {
11214          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11215          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11216          delta += 2+alen;
11217          DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
11218                                   dis_buf,
11219                                   nameXMMReg(gregOfRM(modrm)));
11220       }
11221 
11222       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11223       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
11224       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11225       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
11226 
11227       if (hi) {
11228          putXMMReg( gregOfRM(modrm),
11229                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
11230       } else {
11231          putXMMReg( gregOfRM(modrm),
11232                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
11233       }
11234 
11235       goto decode_success;
11236    }
11237 
11238    /* 66 0F 57 = XORPD -- G = G xor E */
11239    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x57) {
11240       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
11241       goto decode_success;
11242    }
11243 
11244    /* 66 0F 6B = PACKSSDW */
11245    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x6B) {
11246       delta = dis_SSEint_E_to_G( sorb, delta+2,
11247                                  "packssdw",
11248                                  Iop_QNarrowBin32Sto16Sx8, True );
11249       goto decode_success;
11250    }
11251 
11252    /* 66 0F 63 = PACKSSWB */
11253    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x63) {
11254       delta = dis_SSEint_E_to_G( sorb, delta+2,
11255                                  "packsswb",
11256                                  Iop_QNarrowBin16Sto8Sx16, True );
11257       goto decode_success;
11258    }
11259 
11260    /* 66 0F 67 = PACKUSWB */
11261    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x67) {
11262       delta = dis_SSEint_E_to_G( sorb, delta+2,
11263                                  "packuswb",
11264                                  Iop_QNarrowBin16Sto8Ux16, True );
11265       goto decode_success;
11266    }
11267 
11268    /* 66 0F FC = PADDB */
11269    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xFC) {
11270       delta = dis_SSEint_E_to_G( sorb, delta+2,
11271                                  "paddb", Iop_Add8x16, False );
11272       goto decode_success;
11273    }
11274 
11275    /* 66 0F FE = PADDD */
11276    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xFE) {
11277       delta = dis_SSEint_E_to_G( sorb, delta+2,
11278                                  "paddd", Iop_Add32x4, False );
11279       goto decode_success;
11280    }
11281 
11282    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11283    /* 0F D4 = PADDQ -- add 64x1 */
11284    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD4) {
11285       do_MMX_preamble();
11286       delta = dis_MMXop_regmem_to_reg (
11287                 sorb, delta+2, insn[1], "paddq", False );
11288       goto decode_success;
11289    }
11290 
11291    /* 66 0F D4 = PADDQ */
11292    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD4) {
11293       delta = dis_SSEint_E_to_G( sorb, delta+2,
11294                                  "paddq", Iop_Add64x2, False );
11295       goto decode_success;
11296    }
11297 
11298    /* 66 0F FD = PADDW */
11299    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xFD) {
11300       delta = dis_SSEint_E_to_G( sorb, delta+2,
11301                                  "paddw", Iop_Add16x8, False );
11302       goto decode_success;
11303    }
11304 
11305    /* 66 0F EC = PADDSB */
11306    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEC) {
11307       delta = dis_SSEint_E_to_G( sorb, delta+2,
11308                                  "paddsb", Iop_QAdd8Sx16, False );
11309       goto decode_success;
11310    }
11311 
11312    /* 66 0F ED = PADDSW */
11313    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xED) {
11314       delta = dis_SSEint_E_to_G( sorb, delta+2,
11315                                  "paddsw", Iop_QAdd16Sx8, False );
11316       goto decode_success;
11317    }
11318 
11319    /* 66 0F DC = PADDUSB */
11320    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDC) {
11321       delta = dis_SSEint_E_to_G( sorb, delta+2,
11322                                  "paddusb", Iop_QAdd8Ux16, False );
11323       goto decode_success;
11324    }
11325 
11326    /* 66 0F DD = PADDUSW */
11327    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDD) {
11328       delta = dis_SSEint_E_to_G( sorb, delta+2,
11329                                  "paddusw", Iop_QAdd16Ux8, False );
11330       goto decode_success;
11331    }
11332 
11333    /* 66 0F DB = PAND */
11334    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDB) {
11335       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
11336       goto decode_success;
11337    }
11338 
11339    /* 66 0F DF = PANDN */
11340    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDF) {
11341       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
11342       goto decode_success;
11343    }
11344 
11345    /* 66 0F E0 = PAVGB */
11346    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE0) {
11347       delta = dis_SSEint_E_to_G( sorb, delta+2,
11348                                  "pavgb", Iop_Avg8Ux16, False );
11349       goto decode_success;
11350    }
11351 
11352    /* 66 0F E3 = PAVGW */
11353    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE3) {
11354       delta = dis_SSEint_E_to_G( sorb, delta+2,
11355                                  "pavgw", Iop_Avg16Ux8, False );
11356       goto decode_success;
11357    }
11358 
11359    /* 66 0F 74 = PCMPEQB */
11360    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x74) {
11361       delta = dis_SSEint_E_to_G( sorb, delta+2,
11362                                  "pcmpeqb", Iop_CmpEQ8x16, False );
11363       goto decode_success;
11364    }
11365 
11366    /* 66 0F 76 = PCMPEQD */
11367    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x76) {
11368       delta = dis_SSEint_E_to_G( sorb, delta+2,
11369                                  "pcmpeqd", Iop_CmpEQ32x4, False );
11370       goto decode_success;
11371    }
11372 
11373    /* 66 0F 75 = PCMPEQW */
11374    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x75) {
11375       delta = dis_SSEint_E_to_G( sorb, delta+2,
11376                                  "pcmpeqw", Iop_CmpEQ16x8, False );
11377       goto decode_success;
11378    }
11379 
11380    /* 66 0F 64 = PCMPGTB */
11381    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x64) {
11382       delta = dis_SSEint_E_to_G( sorb, delta+2,
11383                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
11384       goto decode_success;
11385    }
11386 
11387    /* 66 0F 66 = PCMPGTD */
11388    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x66) {
11389       delta = dis_SSEint_E_to_G( sorb, delta+2,
11390                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
11391       goto decode_success;
11392    }
11393 
11394    /* 66 0F 65 = PCMPGTW */
11395    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x65) {
11396       delta = dis_SSEint_E_to_G( sorb, delta+2,
11397                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
11398       goto decode_success;
11399    }
11400 
11401    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
11402       zero-extend of it in ireg(G). */
11403    if (insn[0] == 0x0F && insn[1] == 0xC5) {
11404       modrm = insn[2];
11405       if (has_66_pfx && epartIsReg(modrm)) {
11406          t5 = newTemp(Ity_V128);
11407          t4 = newTemp(Ity_I16);
11408          assign(t5, getXMMReg(eregOfRM(modrm)));
11409          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
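         /* imm & 7 selects one of the eight 16-bit lanes: even values
            take the low half, odd values the high half, of the
            corresponding 32-bit piece. */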
11410          switch (insn[3] & 7) {
11411             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
11412             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
11413             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
11414             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
11415             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
11416             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
11417             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
11418             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
11419             default: vassert(0); /*NOTREACHED*/
11420          }
11421          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
11422          DIP("pextrw $%d,%s,%s\n",
11423              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
11424                            nameIReg(4,gregOfRM(modrm)));
11425          delta += 4;
11426          goto decode_success;
11427       }
11428       /* else fall through */
11429    }
11430 
11431    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
11432       put it into the specified lane of xmm(G). */
11433    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xC4) {
11434       Int lane;
11435       t4 = newTemp(Ity_I16);
11436       modrm = insn[2];
11437 
11438       if (epartIsReg(modrm)) {
11439          assign(t4, getIReg(2, eregOfRM(modrm)));
11440          delta += 3+1;
11441          lane = insn[3+1-1];
11442          DIP("pinsrw $%d,%s,%s\n", lane,
11443                                    nameIReg(2,eregOfRM(modrm)),
11444                                    nameXMMReg(gregOfRM(modrm)));
11445       } else {
11446          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11447          delta += 3+alen;
11448          lane = insn[3+alen-1];
11449          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
11450          DIP("pinsrw $%d,%s,%s\n", lane,
11451                                    dis_buf,
11452                                    nameXMMReg(gregOfRM(modrm)));
11453       }
11454 
11455       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
11456       goto decode_success;
11457    }
11458 
11459    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
11460       E(xmm or mem) to G(xmm) */
11461    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF5) {
11462       IRTemp s1V  = newTemp(Ity_V128);
11463       IRTemp s2V  = newTemp(Ity_V128);
11464       IRTemp dV   = newTemp(Ity_V128);
11465       IRTemp s1Hi = newTemp(Ity_I64);
11466       IRTemp s1Lo = newTemp(Ity_I64);
11467       IRTemp s2Hi = newTemp(Ity_I64);
11468       IRTemp s2Lo = newTemp(Ity_I64);
11469       IRTemp dHi  = newTemp(Ity_I64);
11470       IRTemp dLo  = newTemp(Ity_I64);
11471       modrm = insn[2];
11472       if (epartIsReg(modrm)) {
11473          assign( s1V, getXMMReg(eregOfRM(modrm)) );
11474          delta += 2+1;
11475          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11476                                 nameXMMReg(gregOfRM(modrm)));
11477       } else {
11478          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11479          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
11480          delta += 2+alen;
11481          DIP("pmaddwd %s,%s\n", dis_buf,
11482                                 nameXMMReg(gregOfRM(modrm)));
11483       }
11484       assign( s2V, getXMMReg(gregOfRM(modrm)) );
11485       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
11486       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
11487       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
11488       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
11489       assign( dHi, mkIRExprCCall(
11490                       Ity_I64, 0/*regparms*/,
11491                       "x86g_calculate_mmx_pmaddwd",
11492                       &x86g_calculate_mmx_pmaddwd,
11493                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
11494                    ));
11495       assign( dLo, mkIRExprCCall(
11496                       Ity_I64, 0/*regparms*/,
11497                       "x86g_calculate_mmx_pmaddwd",
11498                       &x86g_calculate_mmx_pmaddwd,
11499                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
11500                    ));
11501       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
11502       putXMMReg(gregOfRM(modrm), mkexpr(dV));
11503       goto decode_success;
11504    }
11505 
11506    /* 66 0F EE = PMAXSW -- 16x8 signed max */
11507    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEE) {
11508       delta = dis_SSEint_E_to_G( sorb, delta+2,
11509                                  "pmaxsw", Iop_Max16Sx8, False );
11510       goto decode_success;
11511    }
11512 
11513    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
11514    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDE) {
11515       delta = dis_SSEint_E_to_G( sorb, delta+2,
11516                                  "pmaxub", Iop_Max8Ux16, False );
11517       goto decode_success;
11518    }
11519 
11520    /* 66 0F EA = PMINSW -- 16x8 signed min */
11521    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEA) {
11522       delta = dis_SSEint_E_to_G( sorb, delta+2,
11523                                  "pminsw", Iop_Min16Sx8, False );
11524       goto decode_success;
11525    }
11526 
11527    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
11528    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xDA) {
11529       delta = dis_SSEint_E_to_G( sorb, delta+2,
11530                                  "pminub", Iop_Min8Ux16, False );
11531       goto decode_success;
11532    }
11533 
   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
      in xmm(E), turn them into a 16-bit value, and put the
      zero-extension of that in ireg(G). */
11537    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD7) {
11538       modrm = insn[2];
11539       if (epartIsReg(modrm)) {
11540          t0 = newTemp(Ity_I64);
11541          t1 = newTemp(Ity_I64);
11542          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
11543          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
11544          t5 = newTemp(Ity_I32);
11545          assign(t5,
11546                 unop(Iop_16Uto32,
11547                      binop(Iop_8HLto16,
11548                            unop(Iop_GetMSBs8x8, mkexpr(t1)),
11549                            unop(Iop_GetMSBs8x8, mkexpr(t0)))));
11550          putIReg(4, gregOfRM(modrm), mkexpr(t5));
11551          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11552                                  nameIReg(4,gregOfRM(modrm)));
11553          delta += 3;
11554          goto decode_success;
11555       }
11556       /* else fall through */
11557    }
11558 
11559    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
11560    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE4) {
11561       delta = dis_SSEint_E_to_G( sorb, delta+2,
11562                                  "pmulhuw", Iop_MulHi16Ux8, False );
11563       goto decode_success;
11564    }
11565 
11566    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
11567    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE5) {
11568       delta = dis_SSEint_E_to_G( sorb, delta+2,
11569                                  "pmulhw", Iop_MulHi16Sx8, False );
11570       goto decode_success;
11571    }
11572 
   /* 66 0F D5 = PMULLW -- 16x8 multiply, low half of result */
11574    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD5) {
11575       delta = dis_SSEint_E_to_G( sorb, delta+2,
11576                                  "pmullw", Iop_Mul16x8, False );
11577       goto decode_success;
11578    }
11579 
11580    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11581    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
11582       0 to form 64-bit result */
11583    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF4) {
11584       IRTemp sV = newTemp(Ity_I64);
11585       IRTemp dV = newTemp(Ity_I64);
11586       t1 = newTemp(Ity_I32);
11587       t0 = newTemp(Ity_I32);
11588       modrm = insn[2];
11589 
11590       do_MMX_preamble();
11591       assign( dV, getMMXReg(gregOfRM(modrm)) );
11592 
11593       if (epartIsReg(modrm)) {
11594          assign( sV, getMMXReg(eregOfRM(modrm)) );
11595          delta += 2+1;
11596          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
11597                                 nameMMXReg(gregOfRM(modrm)));
11598       } else {
11599          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11600          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11601          delta += 2+alen;
11602          DIP("pmuludq %s,%s\n", dis_buf,
11603                                 nameMMXReg(gregOfRM(modrm)));
11604       }
11605 
11606       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
11607       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
11608       putMMXReg( gregOfRM(modrm),
11609                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
11610       goto decode_success;
11611    }
11612 
11613    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
11614       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
11615       half */
11616    /* This is a really poor translation -- could be improved if
11617       performance critical */
11618    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF4) {
11619       IRTemp sV, dV;
11620       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11621       sV = newTemp(Ity_V128);
11622       dV = newTemp(Ity_V128);
11623       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11624       t1 = newTemp(Ity_I64);
11625       t0 = newTemp(Ity_I64);
11626       modrm = insn[2];
11627       assign( dV, getXMMReg(gregOfRM(modrm)) );
11628 
11629       if (epartIsReg(modrm)) {
11630          assign( sV, getXMMReg(eregOfRM(modrm)) );
11631          delta += 2+1;
11632          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11633                                 nameXMMReg(gregOfRM(modrm)));
11634       } else {
11635          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11636          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11637          delta += 2+alen;
11638          DIP("pmuludq %s,%s\n", dis_buf,
11639                                 nameXMMReg(gregOfRM(modrm)));
11640       }
11641 
11642       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
11643       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11644 
11645       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
11646       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
11647       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
11648       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
11649       goto decode_success;
11650    }
11651 
11652    /* 66 0F EB = POR */
11653    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEB) {
11654       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
11655       goto decode_success;
11656    }
11657 
11658    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
11659       from E(xmm or mem) to G(xmm) */
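   /* For each 64-bit half, the sum of absolute differences of the
      eight unsigned bytes of E and G is a 16-bit value, zero-extended
      to fill that half.  As with PMADDWD above, the work is delegated
      to the corresponding MMX helper, once per half. */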
11660    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF6) {
11661       IRTemp s1V  = newTemp(Ity_V128);
11662       IRTemp s2V  = newTemp(Ity_V128);
11663       IRTemp dV   = newTemp(Ity_V128);
11664       IRTemp s1Hi = newTemp(Ity_I64);
11665       IRTemp s1Lo = newTemp(Ity_I64);
11666       IRTemp s2Hi = newTemp(Ity_I64);
11667       IRTemp s2Lo = newTemp(Ity_I64);
11668       IRTemp dHi  = newTemp(Ity_I64);
11669       IRTemp dLo  = newTemp(Ity_I64);
11670       modrm = insn[2];
11671       if (epartIsReg(modrm)) {
11672          assign( s1V, getXMMReg(eregOfRM(modrm)) );
11673          delta += 2+1;
11674          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11675                                nameXMMReg(gregOfRM(modrm)));
11676       } else {
11677          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11678          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
11679          delta += 2+alen;
11680          DIP("psadbw %s,%s\n", dis_buf,
11681                                nameXMMReg(gregOfRM(modrm)));
11682       }
11683       assign( s2V, getXMMReg(gregOfRM(modrm)) );
11684       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
11685       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
11686       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
11687       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
11688       assign( dHi, mkIRExprCCall(
11689                       Ity_I64, 0/*regparms*/,
11690                       "x86g_calculate_mmx_psadbw",
11691                       &x86g_calculate_mmx_psadbw,
11692                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
11693                    ));
11694       assign( dLo, mkIRExprCCall(
11695                       Ity_I64, 0/*regparms*/,
11696                       "x86g_calculate_mmx_psadbw",
11697                       &x86g_calculate_mmx_psadbw,
11698                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
11699                    ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
11701       putXMMReg(gregOfRM(modrm), mkexpr(dV));
11702       goto decode_success;
11703    }
11704 
11705    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
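   /* The immediate supplies one 2-bit source-lane selector per
      destination lane, low bits first: dest lane 0 uses bits 1:0 and
      dest lane 3 uses bits 7:6.  For example, order 0x1B (0b00011011)
      reverses the four 32-bit lanes. */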
11706    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x70) {
11707       Int order;
11708       IRTemp sV, dV, s3, s2, s1, s0;
11709       s3 = s2 = s1 = s0 = IRTemp_INVALID;
11710       sV = newTemp(Ity_V128);
11711       dV = newTemp(Ity_V128);
11712       modrm = insn[2];
11713       if (epartIsReg(modrm)) {
11714          assign( sV, getXMMReg(eregOfRM(modrm)) );
11715          order = (Int)insn[3];
11716          delta += 2+2;
11717          DIP("pshufd $%d,%s,%s\n", order,
11718                                    nameXMMReg(eregOfRM(modrm)),
11719                                    nameXMMReg(gregOfRM(modrm)));
11720       } else {
11721          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11722          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[2+alen];
11724          delta += 3+alen;
11725          DIP("pshufd $%d,%s,%s\n", order,
11726                                    dis_buf,
11727                                    nameXMMReg(gregOfRM(modrm)));
11728       }
11729       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11730 
11731 #     define SEL(n) \
11732                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
             mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                           SEL((order>>2)&3), SEL((order>>0)&3) )
      );
11737       putXMMReg(gregOfRM(modrm), mkexpr(dV));
11738 #     undef SEL
11739       goto decode_success;
11740    }
11741 
11742    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
11743       mem) to G(xmm), and copy lower half */
11744    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
11745       Int order;
11746       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
11747       s3 = s2 = s1 = s0 = IRTemp_INVALID;
11748       sV   = newTemp(Ity_V128);
11749       dV   = newTemp(Ity_V128);
11750       sVhi = newTemp(Ity_I64);
11751       dVhi = newTemp(Ity_I64);
11752       modrm = insn[3];
11753       if (epartIsReg(modrm)) {
11754          assign( sV, getXMMReg(eregOfRM(modrm)) );
11755          order = (Int)insn[4];
11756          delta += 4+1;
11757          DIP("pshufhw $%d,%s,%s\n", order,
11758                                     nameXMMReg(eregOfRM(modrm)),
11759                                     nameXMMReg(gregOfRM(modrm)));
11760       } else {
11761          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11762          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[3+alen];
11764          delta += 4+alen;
11765          DIP("pshufhw $%d,%s,%s\n", order,
11766                                     dis_buf,
11767                                     nameXMMReg(gregOfRM(modrm)));
11768       }
11769       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
11770       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
11771 
11772 #     define SEL(n) \
11773                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dVhi,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
11778       assign(dV, binop( Iop_64HLtoV128,
11779                         mkexpr(dVhi),
11780                         unop(Iop_V128to64, mkexpr(sV))) );
11781       putXMMReg(gregOfRM(modrm), mkexpr(dV));
11782 #     undef SEL
11783       goto decode_success;
11784    }
11785 
11786    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
11787       mem) to G(xmm), and copy upper half */
11788    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
11789       Int order;
11790       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
11791       s3 = s2 = s1 = s0 = IRTemp_INVALID;
11792       sV   = newTemp(Ity_V128);
11793       dV   = newTemp(Ity_V128);
11794       sVlo = newTemp(Ity_I64);
11795       dVlo = newTemp(Ity_I64);
11796       modrm = insn[3];
11797       if (epartIsReg(modrm)) {
11798          assign( sV, getXMMReg(eregOfRM(modrm)) );
11799          order = (Int)insn[4];
11800          delta += 4+1;
11801          DIP("pshuflw $%d,%s,%s\n", order,
11802                                     nameXMMReg(eregOfRM(modrm)),
11803                                     nameXMMReg(gregOfRM(modrm)));
11804       } else {
11805          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11806          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[3+alen];
11808          delta += 4+alen;
11809          DIP("pshuflw $%d,%s,%s\n", order,
11810                                     dis_buf,
11811                                     nameXMMReg(gregOfRM(modrm)));
11812       }
11813       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
11814       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
11815 
11816 #     define SEL(n) \
11817                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dVlo,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
11822       assign(dV, binop( Iop_64HLtoV128,
11823                         unop(Iop_V128HIto64, mkexpr(sV)),
11824                         mkexpr(dVlo) ) );
11825       putXMMReg(gregOfRM(modrm), mkexpr(dV));
11826 #     undef SEL
11827       goto decode_success;
11828    }
11829 
11830    /* 66 0F 72 /6 ib = PSLLD by immediate */
11831    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x72
11832        && epartIsReg(insn[2])
11833        && gregOfRM(insn[2]) == 6) {
11834       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
11835       goto decode_success;
11836    }
11837 
11838    /* 66 0F F2 = PSLLD by E */
11839    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF2) {
11840       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
11841       goto decode_success;
11842    }
11843 
11844    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
11845    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x73
11846        && epartIsReg(insn[2])
11847        && gregOfRM(insn[2]) == 7) {
11848       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11849       Int    imm = (Int)insn[3];
11850       Int    reg = eregOfRM(insn[2]);
11851       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
11852       vassert(imm >= 0 && imm <= 255);
11853       delta += 4;
11854 
11855       sV    = newTemp(Ity_V128);
11856       dV    = newTemp(Ity_V128);
11857       hi64  = newTemp(Ity_I64);
11858       lo64  = newTemp(Ity_I64);
11859       hi64r = newTemp(Ity_I64);
11860       lo64r = newTemp(Ity_I64);
11861 
11862       if (imm >= 16) {
11863          putXMMReg(reg, mkV128(0x0000));
11864          goto decode_success;
11865       }
11866 
11867       assign( sV, getXMMReg(reg) );
11868       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11869       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11870 
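      /* The immediate is a byte count.  Shift the two 64-bit halves
         separately and OR the bits that cross the half boundary into
         the high half; e.g. for imm==3 the intended result is
         hi' = (hi << 24) | (lo >> 40),  lo' = lo << 24. */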
11871       if (imm == 0) {
11872          assign( lo64r, mkexpr(lo64) );
11873          assign( hi64r, mkexpr(hi64) );
11874       }
11875       else
11876       if (imm == 8) {
11877          assign( lo64r, mkU64(0) );
11878          assign( hi64r, mkexpr(lo64) );
11879       }
11880       else
11881       if (imm > 8) {
11882          assign( lo64r, mkU64(0) );
11883          assign( hi64r, binop( Iop_Shl64,
11884                                mkexpr(lo64),
11885                                mkU8( 8*(imm-8) ) ));
11886       } else {
11887          assign( lo64r, binop( Iop_Shl64,
11888                                mkexpr(lo64),
11889                                mkU8(8 * imm) ));
11890          assign( hi64r,
11891                  binop( Iop_Or64,
11892                         binop(Iop_Shl64, mkexpr(hi64),
11893                                          mkU8(8 * imm)),
11894                         binop(Iop_Shr64, mkexpr(lo64),
11895                                          mkU8(8 * (8 - imm)) )
11896                       )
11897                );
11898       }
11899       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11900       putXMMReg(reg, mkexpr(dV));
11901       goto decode_success;
11902    }
11903 
11904    /* 66 0F 73 /6 ib = PSLLQ by immediate */
11905    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x73
11906        && epartIsReg(insn[2])
11907        && gregOfRM(insn[2]) == 6) {
11908       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
11909       goto decode_success;
11910    }
11911 
11912    /* 66 0F F3 = PSLLQ by E */
11913    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF3) {
11914       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
11915       goto decode_success;
11916    }
11917 
11918    /* 66 0F 71 /6 ib = PSLLW by immediate */
11919    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x71
11920        && epartIsReg(insn[2])
11921        && gregOfRM(insn[2]) == 6) {
11922       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
11923       goto decode_success;
11924    }
11925 
11926    /* 66 0F F1 = PSLLW by E */
11927    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF1) {
11928       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
11929       goto decode_success;
11930    }
11931 
11932    /* 66 0F 72 /4 ib = PSRAD by immediate */
11933    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x72
11934        && epartIsReg(insn[2])
11935        && gregOfRM(insn[2]) == 4) {
11936       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
11937       goto decode_success;
11938    }
11939 
11940    /* 66 0F E2 = PSRAD by E */
11941    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE2) {
11942       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
11943       goto decode_success;
11944    }
11945 
11946    /* 66 0F 71 /4 ib = PSRAW by immediate */
11947    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x71
11948        && epartIsReg(insn[2])
11949        && gregOfRM(insn[2]) == 4) {
11950       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
11951       goto decode_success;
11952    }
11953 
11954    /* 66 0F E1 = PSRAW by E */
11955    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE1) {
11956       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
11957       goto decode_success;
11958    }
11959 
11960    /* 66 0F 72 /2 ib = PSRLD by immediate */
11961    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x72
11962        && epartIsReg(insn[2])
11963        && gregOfRM(insn[2]) == 2) {
11964       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
11965       goto decode_success;
11966    }
11967 
11968    /* 66 0F D2 = PSRLD by E */
11969    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD2) {
11970       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
11971       goto decode_success;
11972    }
11973 
11974    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
11975    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x73
11976        && epartIsReg(insn[2])
11977        && gregOfRM(insn[2]) == 3) {
11978       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11979       Int    imm = (Int)insn[3];
11980       Int    reg = eregOfRM(insn[2]);
11981       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
11982       vassert(imm >= 0 && imm <= 255);
11983       delta += 4;
11984 
11985       sV    = newTemp(Ity_V128);
11986       dV    = newTemp(Ity_V128);
11987       hi64  = newTemp(Ity_I64);
11988       lo64  = newTemp(Ity_I64);
11989       hi64r = newTemp(Ity_I64);
11990       lo64r = newTemp(Ity_I64);
11991 
11992       if (imm >= 16) {
11993          putXMMReg(reg, mkV128(0x0000));
11994          goto decode_success;
11995       }
11996 
11997       assign( sV, getXMMReg(reg) );
11998       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11999       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
12000 
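      /* Mirror image of PSLLDQ above: shift the two 64-bit halves
         right and OR the bits that cross the half boundary into the
         low half; e.g. for imm==3 the intended result is
         lo' = (lo >> 24) | (hi << 40),  hi' = hi >> 24. */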
12001       if (imm == 0) {
12002          assign( lo64r, mkexpr(lo64) );
12003          assign( hi64r, mkexpr(hi64) );
12004       }
12005       else
12006       if (imm == 8) {
12007          assign( hi64r, mkU64(0) );
12008          assign( lo64r, mkexpr(hi64) );
12009       }
12010       else
12011       if (imm > 8) {
12012          assign( hi64r, mkU64(0) );
12013          assign( lo64r, binop( Iop_Shr64,
12014                                mkexpr(hi64),
12015                                mkU8( 8*(imm-8) ) ));
12016       } else {
12017          assign( hi64r, binop( Iop_Shr64,
12018                                mkexpr(hi64),
12019                                mkU8(8 * imm) ));
12020          assign( lo64r,
12021                  binop( Iop_Or64,
12022                         binop(Iop_Shr64, mkexpr(lo64),
12023                                          mkU8(8 * imm)),
12024                         binop(Iop_Shl64, mkexpr(hi64),
12025                                          mkU8(8 * (8 - imm)) )
12026                       )
12027                );
12028       }
12029 
12030       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
12031       putXMMReg(reg, mkexpr(dV));
12032       goto decode_success;
12033    }
12034 
12035    /* 66 0F 73 /2 ib = PSRLQ by immediate */
12036    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x73
12037        && epartIsReg(insn[2])
12038        && gregOfRM(insn[2]) == 2) {
12039       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
12040       goto decode_success;
12041    }
12042 
12043    /* 66 0F D3 = PSRLQ by E */
12044    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD3) {
12045       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
12046       goto decode_success;
12047    }
12048 
12049    /* 66 0F 71 /2 ib = PSRLW by immediate */
12050    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x71
12051        && epartIsReg(insn[2])
12052        && gregOfRM(insn[2]) == 2) {
12053       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
12054       goto decode_success;
12055    }
12056 
12057    /* 66 0F D1 = PSRLW by E */
12058    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD1) {
12059       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
12060       goto decode_success;
12061    }
12062 
12063    /* 66 0F F8 = PSUBB */
12064    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF8) {
12065       delta = dis_SSEint_E_to_G( sorb, delta+2,
12066                                  "psubb", Iop_Sub8x16, False );
12067       goto decode_success;
12068    }
12069 
12070    /* 66 0F FA = PSUBD */
12071    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xFA) {
12072       delta = dis_SSEint_E_to_G( sorb, delta+2,
12073                                  "psubd", Iop_Sub32x4, False );
12074       goto decode_success;
12075    }
12076 
12077    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
12078    /* 0F FB = PSUBQ -- sub 64x1 */
12079    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xFB) {
12080       do_MMX_preamble();
12081       delta = dis_MMXop_regmem_to_reg (
12082                 sorb, delta+2, insn[1], "psubq", False );
12083       goto decode_success;
12084    }
12085 
12086    /* 66 0F FB = PSUBQ */
12087    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xFB) {
12088       delta = dis_SSEint_E_to_G( sorb, delta+2,
12089                                  "psubq", Iop_Sub64x2, False );
12090       goto decode_success;
12091    }
12092 
12093    /* 66 0F F9 = PSUBW */
12094    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xF9) {
12095       delta = dis_SSEint_E_to_G( sorb, delta+2,
12096                                  "psubw", Iop_Sub16x8, False );
12097       goto decode_success;
12098    }
12099 
12100    /* 66 0F E8 = PSUBSB */
12101    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE8) {
12102       delta = dis_SSEint_E_to_G( sorb, delta+2,
12103                                  "psubsb", Iop_QSub8Sx16, False );
12104       goto decode_success;
12105    }
12106 
12107    /* 66 0F E9 = PSUBSW */
12108    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xE9) {
12109       delta = dis_SSEint_E_to_G( sorb, delta+2,
12110                                  "psubsw", Iop_QSub16Sx8, False );
12111       goto decode_success;
12112    }
12113 
   /* 66 0F D8 = PSUBUSB */
12115    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD8) {
12116       delta = dis_SSEint_E_to_G( sorb, delta+2,
12117                                  "psubusb", Iop_QSub8Ux16, False );
12118       goto decode_success;
12119    }
12120 
   /* 66 0F D9 = PSUBUSW */
12122    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD9) {
12123       delta = dis_SSEint_E_to_G( sorb, delta+2,
12124                                  "psubusw", Iop_QSub16Ux8, False );
12125       goto decode_success;
12126    }
12127 
12128    /* 66 0F 68 = PUNPCKHBW */
12129    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x68) {
12130       delta = dis_SSEint_E_to_G( sorb, delta+2,
12131                                  "punpckhbw",
12132                                  Iop_InterleaveHI8x16, True );
12133       goto decode_success;
12134    }
12135 
12136    /* 66 0F 6A = PUNPCKHDQ */
12137    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x6A) {
12138       delta = dis_SSEint_E_to_G( sorb, delta+2,
12139                                  "punpckhdq",
12140                                  Iop_InterleaveHI32x4, True );
12141       goto decode_success;
12142    }
12143 
12144    /* 66 0F 6D = PUNPCKHQDQ */
12145    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x6D) {
12146       delta = dis_SSEint_E_to_G( sorb, delta+2,
12147                                  "punpckhqdq",
12148                                  Iop_InterleaveHI64x2, True );
12149       goto decode_success;
12150    }
12151 
12152    /* 66 0F 69 = PUNPCKHWD */
12153    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x69) {
12154       delta = dis_SSEint_E_to_G( sorb, delta+2,
12155                                  "punpckhwd",
12156                                  Iop_InterleaveHI16x8, True );
12157       goto decode_success;
12158    }
12159 
12160    /* 66 0F 60 = PUNPCKLBW */
12161    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x60) {
12162       delta = dis_SSEint_E_to_G( sorb, delta+2,
12163                                  "punpcklbw",
12164                                  Iop_InterleaveLO8x16, True );
12165       goto decode_success;
12166    }
12167 
12168    /* 66 0F 62 = PUNPCKLDQ */
12169    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x62) {
12170       delta = dis_SSEint_E_to_G( sorb, delta+2,
12171                                  "punpckldq",
12172                                  Iop_InterleaveLO32x4, True );
12173       goto decode_success;
12174    }
12175 
12176    /* 66 0F 6C = PUNPCKLQDQ */
12177    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x6C) {
12178       delta = dis_SSEint_E_to_G( sorb, delta+2,
12179                                  "punpcklqdq",
12180                                  Iop_InterleaveLO64x2, True );
12181       goto decode_success;
12182    }
12183 
12184    /* 66 0F 61 = PUNPCKLWD */
12185    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0x61) {
12186       delta = dis_SSEint_E_to_G( sorb, delta+2,
12187                                  "punpcklwd",
12188                                  Iop_InterleaveLO16x8, True );
12189       goto decode_success;
12190    }
12191 
12192    /* 66 0F EF = PXOR */
12193    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xEF) {
12194       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
12195       goto decode_success;
12196    }
12197 
12198 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
12199 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
12200 //--        && (!epartIsReg(insn[2]))
12201 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
12202 //--       Bool store = gregOfRM(insn[2]) == 0;
12203 //--       vg_assert(!has_66_pfx);
12204 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
12205 //--       t1   = LOW24(pair);
12206 //--       eip += 2+HI8(pair);
12207 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
12208 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
12209 //--                   Lit16, (UShort)insn[2],
12210 //--                   TempReg, t1 );
12211 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
12212 //--       goto decode_success;
12213 //--    }
12214 
12215    /* 0F AE /7 = CLFLUSH -- flush cache line */
12216    if (!has_66_pfx && insn[0] == 0x0F && insn[1] == 0xAE
12217        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
12218 
12219       /* This is something of a hack.  We need to know the size of the
12220          cache line containing addr.  Since we don't (easily), assume
12221          256 on the basis that no real cache would have a line that
12222          big.  It's safe to invalidate more stuff than we need, just
12223          inefficient. */
12224       UInt lineszB = 256;
12225 
12226       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
12227       delta += 2+alen;
12228 
12229       /* Round addr down to the start of the containing block. */
12230       stmt( IRStmt_Put(
12231                OFFB_CMSTART,
12232                binop( Iop_And32,
12233                       mkexpr(addr),
12234                       mkU32( ~(lineszB-1) ))) );
12235 
12236       stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) );
12237 
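      /* The Ijk_InvalICache exit hands control back to the scheduler,
         which is expected to discard any cached translations
         intersecting [CMSTART, CMSTART+CMLEN) before resuming at the
         next instruction. */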
12238       jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta));
12239 
12240       DIP("clflush %s\n", dis_buf);
12241       goto decode_success;
12242    }
12243 
12244    /* ---------------------------------------------------- */
12245    /* --- end of the SSE2 decoder.                     --- */
12246    /* ---------------------------------------------------- */
12247 
12248    /* ---------------------------------------------------- */
12249    /* --- start of the SSE3 decoder.                   --- */
12250    /* ---------------------------------------------------- */
12251 
12252    /* Skip parts of the decoder which don't apply given the stated
12253       guest subarchitecture. */
12254    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3))
12255       goto after_sse_decoders; /* no SSE3 capabilities */
12256 
12257    insn = &guest_code[delta];
12258 
12259    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
12260       duplicating some lanes (2:2:0:0). */
12261    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
12262       duplicating some lanes (3:3:1:1). */
12263    if (!has_66_pfx && insn[0] == 0xF3 && insn[1] == 0x0F
12264        && (insn[2] == 0x12 || insn[2] == 0x16)) {
12265       IRTemp s3, s2, s1, s0;
12266       IRTemp sV  = newTemp(Ity_V128);
12267       Bool   isH = insn[2] == 0x16;
12268       s3 = s2 = s1 = s0 = IRTemp_INVALID;
12269 
12270       modrm = insn[3];
12271       if (epartIsReg(modrm)) {
12272          assign( sV, getXMMReg( eregOfRM(modrm)) );
12273          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
12274                                   nameXMMReg(eregOfRM(modrm)),
12275                                   nameXMMReg(gregOfRM(modrm)));
12276          delta += 3+1;
12277       } else {
12278          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12279          gen_SEGV_if_not_16_aligned( addr );
12280          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
             dis_buf,
             nameXMMReg(gregOfRM(modrm)));
12284          delta += 3+alen;
12285       }
12286 
12287       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
12288       putXMMReg( gregOfRM(modrm),
12289                  isH ? mk128from32s( s3, s3, s1, s1 )
12290                      : mk128from32s( s2, s2, s0, s0 ) );
12291       goto decode_success;
12292    }
12293 
   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (1:0:1:0). */
12296    if (!has_66_pfx && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
12297       IRTemp sV = newTemp(Ity_V128);
12298       IRTemp d0 = newTemp(Ity_I64);
12299 
12300       modrm = insn[3];
12301       if (epartIsReg(modrm)) {
12302          assign( sV, getXMMReg( eregOfRM(modrm)) );
12303          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12304                                 nameXMMReg(gregOfRM(modrm)));
12305          delta += 3+1;
12306          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
12307       } else {
12308          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12309          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
12310          DIP("movddup %s,%s\n", dis_buf,
12311                                 nameXMMReg(gregOfRM(modrm)));
12312          delta += 3+alen;
12313       }
12314 
12315       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
12316       goto decode_success;
12317    }
12318 
12319    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
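   /* From high lane to low, the result is (g3+e3, g2-e2, g1+e1, g0-e0):
      even-numbered lanes subtract, odd-numbered lanes add. */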
12320    if (!has_66_pfx && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
12321       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
12322       IRTemp eV   = newTemp(Ity_V128);
12323       IRTemp gV   = newTemp(Ity_V128);
12324       IRTemp addV = newTemp(Ity_V128);
12325       IRTemp subV = newTemp(Ity_V128);
12326       IRTemp rm     = newTemp(Ity_I32);
12327       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
12328 
12329       modrm = insn[3];
12330       if (epartIsReg(modrm)) {
12331          assign( eV, getXMMReg( eregOfRM(modrm)) );
12332          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12333                                  nameXMMReg(gregOfRM(modrm)));
12334          delta += 3+1;
12335       } else {
12336          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12337          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
12338          DIP("addsubps %s,%s\n", dis_buf,
12339                                  nameXMMReg(gregOfRM(modrm)));
12340          delta += 3+alen;
12341       }
12342 
12343       assign( gV, getXMMReg(gregOfRM(modrm)) );
12344 
12345       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
12346       assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
12347       assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
12348 
12349       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
12350       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
12351 
12352       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
12353       goto decode_success;
12354    }
12355 
   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
12357    if (has_66_pfx && insn[0] == 0x0F && insn[1] == 0xD0) {
12358       IRTemp eV   = newTemp(Ity_V128);
12359       IRTemp gV   = newTemp(Ity_V128);
12360       IRTemp addV = newTemp(Ity_V128);
12361       IRTemp subV = newTemp(Ity_V128);
12362       IRTemp a1     = newTemp(Ity_I64);
12363       IRTemp s0     = newTemp(Ity_I64);
12364       IRTemp rm     = newTemp(Ity_I32);
12365 
12366       modrm = insn[2];
12367       if (epartIsReg(modrm)) {
12368          assign( eV, getXMMReg( eregOfRM(modrm)) );
12369          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12370                                  nameXMMReg(gregOfRM(modrm)));
12371          delta += 2+1;
12372       } else {
12373          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
12374          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
12375          DIP("addsubpd %s,%s\n", dis_buf,
12376                                  nameXMMReg(gregOfRM(modrm)));
12377          delta += 2+alen;
12378       }
12379 
12380       assign( gV, getXMMReg(gregOfRM(modrm)) );
12381 
12382       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
12383       assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
12384       assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
12385 
12386       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
12387       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
12388 
12389       putXMMReg( gregOfRM(modrm),
12390                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
12391       goto decode_success;
12392    }
12393 
12394    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
12395    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
12396    if (!has_66_pfx && insn[0] == 0xF2 && insn[1] == 0x0F
12397        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
12398       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
12399       IRTemp eV     = newTemp(Ity_V128);
12400       IRTemp gV     = newTemp(Ity_V128);
12401       IRTemp leftV  = newTemp(Ity_V128);
12402       IRTemp rightV = newTemp(Ity_V128);
12403       IRTemp rm     = newTemp(Ity_I32);
12404       Bool   isAdd  = insn[2] == 0x7C;
12405       const HChar* str = isAdd ? "add" : "sub";
12406       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
12407 
12408       modrm = insn[3];
12409       if (epartIsReg(modrm)) {
12410          assign( eV, getXMMReg( eregOfRM(modrm)) );
12411          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12412                                    nameXMMReg(gregOfRM(modrm)));
12413          delta += 3+1;
12414       } else {
12415          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12416          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
12417          DIP("h%sps %s,%s\n", str, dis_buf,
12418                                    nameXMMReg(gregOfRM(modrm)));
12419          delta += 3+alen;
12420       }
12421 
12422       assign( gV, getXMMReg(gregOfRM(modrm)) );
12423 
12424       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
12425       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
12426 
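      /* Re-express the horizontal op as a vertical one: leftV gathers
         the even-numbered lanes (e2,e0,g2,g0) and rightV the odd ones
         (e3,e1,g3,g1), so a single lane-wise add/sub produces each
         adjacent pair's result in the lane the architecture specifies. */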
12427       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
12428       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
12429 
12430       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
12431       putXMMReg( gregOfRM(modrm),
12432                  triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
12433                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
12434       goto decode_success;
12435    }
12436 
12437    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
12438    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
12439    if (has_66_pfx && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
12440       IRTemp e1     = newTemp(Ity_I64);
12441       IRTemp e0     = newTemp(Ity_I64);
12442       IRTemp g1     = newTemp(Ity_I64);
12443       IRTemp g0     = newTemp(Ity_I64);
12444       IRTemp eV     = newTemp(Ity_V128);
12445       IRTemp gV     = newTemp(Ity_V128);
12446       IRTemp leftV  = newTemp(Ity_V128);
12447       IRTemp rightV = newTemp(Ity_V128);
12448       IRTemp rm     = newTemp(Ity_I32);
12449       Bool   isAdd  = insn[1] == 0x7C;
12450       const HChar* str = isAdd ? "add" : "sub";
12451 
12452       modrm = insn[2];
12453       if (epartIsReg(modrm)) {
12454          assign( eV, getXMMReg( eregOfRM(modrm)) );
12455          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12456                                    nameXMMReg(gregOfRM(modrm)));
12457          delta += 2+1;
12458       } else {
12459          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
12460          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
12461          DIP("h%spd %s,%s\n", str, dis_buf,
12462                               nameXMMReg(gregOfRM(modrm)));
12463          delta += 2+alen;
12464       }
12465 
12466       assign( gV, getXMMReg(gregOfRM(modrm)) );
12467 
12468       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
12469       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
12470       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
12471       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
12472 
12473       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
12474       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
12475 
12476       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
12477       putXMMReg( gregOfRM(modrm),
12478                  triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
12479                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
12480       goto decode_success;
12481    }
12482 
12483    /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
12484    if (!has_66_pfx && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
12485       modrm = getIByte(delta+3);
12486       if (epartIsReg(modrm)) {
12487          goto decode_failure;
12488       } else {
12489          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12490          putXMMReg( gregOfRM(modrm),
12491                     loadLE(Ity_V128, mkexpr(addr)) );
12492          DIP("lddqu %s,%s\n", dis_buf,
12493                               nameXMMReg(gregOfRM(modrm)));
12494          delta += 3+alen;
12495       }
12496       goto decode_success;
12497    }
12498 
12499    /* ---------------------------------------------------- */
12500    /* --- end of the SSE3 decoder.                     --- */
12501    /* ---------------------------------------------------- */
12502 
12503    /* ---------------------------------------------------- */
12504    /* --- start of the SSSE3 decoder.                  --- */
12505    /* ---------------------------------------------------- */
12506 
12507    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
12508       Unsigned Bytes (MMX) */
12509    if (!has_66_pfx
12510        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
12511       IRTemp sV        = newTemp(Ity_I64);
12512       IRTemp dV        = newTemp(Ity_I64);
12513       IRTemp sVoddsSX  = newTemp(Ity_I64);
12514       IRTemp sVevensSX = newTemp(Ity_I64);
12515       IRTemp dVoddsZX  = newTemp(Ity_I64);
12516       IRTemp dVevensZX = newTemp(Ity_I64);
12517 
12518       modrm = insn[3];
12519       do_MMX_preamble();
12520       assign( dV, getMMXReg(gregOfRM(modrm)) );
12521 
12522       if (epartIsReg(modrm)) {
12523          assign( sV, getMMXReg(eregOfRM(modrm)) );
12524          delta += 3+1;
12525          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12526                                   nameMMXReg(gregOfRM(modrm)));
12527       } else {
12528          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12529          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12530          delta += 3+alen;
12531          DIP("pmaddubsw %s,%s\n", dis_buf,
12532                                   nameMMXReg(gregOfRM(modrm)));
12533       }
12534 
12535       /* compute dV unsigned x sV signed */
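      /* sVoddsSX / sVevensSX isolate the odd- and even-numbered bytes
         of sV as sign-extended 16-bit lanes (arithmetic shift right by
         8, with a prior left shift for the even bytes); dVoddsZX /
         dVevensZX do the same for dV but zero-extended.  The widened
         products are then combined with a signed saturating add, which
         matches pmaddubsw's definition. */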
12536       assign( sVoddsSX,
12537               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
12538       assign( sVevensSX,
12539               binop(Iop_SarN16x4,
12540                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
12541                     mkU8(8)) );
12542       assign( dVoddsZX,
12543               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
12544       assign( dVevensZX,
12545               binop(Iop_ShrN16x4,
12546                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
12547                     mkU8(8)) );
12548 
12549       putMMXReg(
12550          gregOfRM(modrm),
12551          binop(Iop_QAdd16Sx4,
12552                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
12553                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
12554          )
12555       );
12556       goto decode_success;
12557    }
12558 
12559    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
12560       Unsigned Bytes (XMM) */
12561    if (has_66_pfx
12562        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
12563       IRTemp sV        = newTemp(Ity_V128);
12564       IRTemp dV        = newTemp(Ity_V128);
12565       IRTemp sVoddsSX  = newTemp(Ity_V128);
12566       IRTemp sVevensSX = newTemp(Ity_V128);
12567       IRTemp dVoddsZX  = newTemp(Ity_V128);
12568       IRTemp dVevensZX = newTemp(Ity_V128);
12569 
12570       modrm = insn[3];
12571       assign( dV, getXMMReg(gregOfRM(modrm)) );
12572 
12573       if (epartIsReg(modrm)) {
12574          assign( sV, getXMMReg(eregOfRM(modrm)) );
12575          delta += 3+1;
12576          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12577                                   nameXMMReg(gregOfRM(modrm)));
12578       } else {
12579          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12580          gen_SEGV_if_not_16_aligned( addr );
12581          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12582          delta += 3+alen;
12583          DIP("pmaddubsw %s,%s\n", dis_buf,
12584                                   nameXMMReg(gregOfRM(modrm)));
12585       }
12586 
12587       /* compute dV unsigned x sV signed */
12588       assign( sVoddsSX,
12589               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
12590       assign( sVevensSX,
12591               binop(Iop_SarN16x8,
12592                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
12593                     mkU8(8)) );
12594       assign( dVoddsZX,
12595               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
12596       assign( dVevensZX,
12597               binop(Iop_ShrN16x8,
12598                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
12599                     mkU8(8)) );
12600 
12601       putXMMReg(
12602          gregOfRM(modrm),
12603          binop(Iop_QAdd16Sx8,
12604                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
12605                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
12606          )
12607       );
12608       goto decode_success;
12609    }
12610 
12611    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
12612    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
12613       mmx) and G to G (mmx). */
12614    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
12615       mmx) and G to G (mmx). */
12616    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
12617       to G (mmx). */
12618    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
12619       to G (mmx). */
12620    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
12621       to G (mmx). */
12622    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
12623       to G (mmx). */
12624 
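   /* The horizontal (pairwise) op is done as one vertical op: the
      even-numbered lanes of E and G are gathered into one 64-bit value
      and the odd-numbered lanes into another, so a single lane-wise
      add/sub yields each adjacent pair's result in the expected
      position. */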
12625    if (!has_66_pfx
12626        && insn[0] == 0x0F && insn[1] == 0x38
12627        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
12628            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
12629       const HChar* str = "???";
12630       IROp   opV64  = Iop_INVALID;
12631       IROp   opCatO = Iop_CatOddLanes16x4;
12632       IROp   opCatE = Iop_CatEvenLanes16x4;
12633       IRTemp sV     = newTemp(Ity_I64);
12634       IRTemp dV     = newTemp(Ity_I64);
12635 
12636       modrm = insn[3];
12637 
12638       switch (insn[2]) {
12639          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
12640          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
12641          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
12642          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
12643          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
12644          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
12645          default: vassert(0);
12646       }
12647       if (insn[2] == 0x02 || insn[2] == 0x06) {
12648          opCatO = Iop_InterleaveHI32x2;
12649          opCatE = Iop_InterleaveLO32x2;
12650       }
12651 
12652       do_MMX_preamble();
12653       assign( dV, getMMXReg(gregOfRM(modrm)) );
12654 
12655       if (epartIsReg(modrm)) {
12656          assign( sV, getMMXReg(eregOfRM(modrm)) );
12657          delta += 3+1;
12658          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12659                                   nameMMXReg(gregOfRM(modrm)));
12660       } else {
12661          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12662          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12663          delta += 3+alen;
12664          DIP("ph%s %s,%s\n", str, dis_buf,
12665                                   nameMMXReg(gregOfRM(modrm)));
12666       }
12667 
12668       putMMXReg(
12669          gregOfRM(modrm),
12670          binop(opV64,
12671                binop(opCatE,mkexpr(sV),mkexpr(dV)),
12672                binop(opCatO,mkexpr(sV),mkexpr(dV))
12673          )
12674       );
12675       goto decode_success;
12676    }
12677 
12678    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
12679       xmm) and G to G (xmm). */
12680    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
12681       xmm) and G to G (xmm). */
12682    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
12683       G to G (xmm). */
12684    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
12685       G to G (xmm). */
12686    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
12687       G to G (xmm). */
12688    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
12689       G to G (xmm). */
12690 
12691    if (has_66_pfx
12692        && insn[0] == 0x0F && insn[1] == 0x38
12693        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
12694            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
12695       const HChar* str = "???";
12696       IROp   opV64  = Iop_INVALID;
12697       IROp   opCatO = Iop_CatOddLanes16x4;
12698       IROp   opCatE = Iop_CatEvenLanes16x4;
12699       IRTemp sV     = newTemp(Ity_V128);
12700       IRTemp dV     = newTemp(Ity_V128);
12701       IRTemp sHi    = newTemp(Ity_I64);
12702       IRTemp sLo    = newTemp(Ity_I64);
12703       IRTemp dHi    = newTemp(Ity_I64);
12704       IRTemp dLo    = newTemp(Ity_I64);
12705 
12706       modrm = insn[3];
12707 
12708       switch (insn[2]) {
12709          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
12710          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
12711          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
12712          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
12713          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
12714          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
12715          default: vassert(0);
12716       }
12717       if (insn[2] == 0x02 || insn[2] == 0x06) {
12718          opCatO = Iop_InterleaveHI32x2;
12719          opCatE = Iop_InterleaveLO32x2;
12720       }
12721 
12722       assign( dV, getXMMReg(gregOfRM(modrm)) );
12723 
12724       if (epartIsReg(modrm)) {
12725          assign( sV, getXMMReg( eregOfRM(modrm)) );
12726          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12727                                   nameXMMReg(gregOfRM(modrm)));
12728          delta += 3+1;
12729       } else {
12730          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12731          gen_SEGV_if_not_16_aligned( addr );
12732          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12733          DIP("ph%s %s,%s\n", str, dis_buf,
12734                              nameXMMReg(gregOfRM(modrm)));
12735          delta += 3+alen;
12736       }
12737 
12738       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12739       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12740       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12741       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12742 
      /* This isn't a particularly efficient way to compute the
         result, but at least it avoids a proliferation of IROps,
         hence avoids complicating all the backends. */
12746       putXMMReg(
12747          gregOfRM(modrm),
12748          binop(Iop_64HLtoV128,
12749                binop(opV64,
12750                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
12751                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
12752                ),
12753                binop(opV64,
12754                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
12755                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
12756                )
12757          )
12758       );
12759       goto decode_success;
12760    }
12761 
12762    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
12763       (MMX) */
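   /* dis_PMULHRSW_helper (defined earlier in this file) is expected to
      implement the architectural per-lane rounding: form the signed
      32-bit product, add 0x4000 and keep bits 30:15, i.e. the high
      half of the product rounded to nearest. */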
12764    if (!has_66_pfx
12765        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12766       IRTemp sV = newTemp(Ity_I64);
12767       IRTemp dV = newTemp(Ity_I64);
12768 
12769       modrm = insn[3];
12770       do_MMX_preamble();
12771       assign( dV, getMMXReg(gregOfRM(modrm)) );
12772 
12773       if (epartIsReg(modrm)) {
12774          assign( sV, getMMXReg(eregOfRM(modrm)) );
12775          delta += 3+1;
12776          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12777                                  nameMMXReg(gregOfRM(modrm)));
12778       } else {
12779          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12780          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12781          delta += 3+alen;
12782          DIP("pmulhrsw %s,%s\n", dis_buf,
12783                                  nameMMXReg(gregOfRM(modrm)));
12784       }
12785 
12786       putMMXReg(
12787          gregOfRM(modrm),
12788          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
12789       );
12790       goto decode_success;
12791    }
12792 
12793    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
12794       Scale (XMM) */
12795    if (has_66_pfx
12796        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12797       IRTemp sV  = newTemp(Ity_V128);
12798       IRTemp dV  = newTemp(Ity_V128);
12799       IRTemp sHi = newTemp(Ity_I64);
12800       IRTemp sLo = newTemp(Ity_I64);
12801       IRTemp dHi = newTemp(Ity_I64);
12802       IRTemp dLo = newTemp(Ity_I64);
12803 
12804       modrm = insn[3];
12805       assign( dV, getXMMReg(gregOfRM(modrm)) );
12806 
12807       if (epartIsReg(modrm)) {
12808          assign( sV, getXMMReg(eregOfRM(modrm)) );
12809          delta += 3+1;
12810          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12811                                  nameXMMReg(gregOfRM(modrm)));
12812       } else {
12813          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12814          gen_SEGV_if_not_16_aligned( addr );
12815          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12816          delta += 3+alen;
12817          DIP("pmulhrsw %s,%s\n", dis_buf,
12818                                  nameXMMReg(gregOfRM(modrm)));
12819       }
12820 
12821       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12822       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12823       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12824       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12825 
12826       putXMMReg(
12827          gregOfRM(modrm),
12828          binop(Iop_64HLtoV128,
12829                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
12830                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
12831          )
12832       );
12833       goto decode_success;
12834    }
12835 
   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
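   /* Per lane: if the E lane is negative, the corresponding G lane is
      negated; if it is zero, the result lane is zero; otherwise the G
      lane passes through unchanged (dis_PSIGN_helper encodes this). */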
12839    if (!has_66_pfx
12840        && insn[0] == 0x0F && insn[1] == 0x38
12841        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12842       IRTemp sV      = newTemp(Ity_I64);
12843       IRTemp dV      = newTemp(Ity_I64);
12844       const HChar* str = "???";
12845       Int    laneszB = 0;
12846 
12847       switch (insn[2]) {
12848          case 0x08: laneszB = 1; str = "b"; break;
12849          case 0x09: laneszB = 2; str = "w"; break;
12850          case 0x0A: laneszB = 4; str = "d"; break;
12851          default: vassert(0);
12852       }
12853 
12854       modrm = insn[3];
12855       do_MMX_preamble();
12856       assign( dV, getMMXReg(gregOfRM(modrm)) );
12857 
12858       if (epartIsReg(modrm)) {
12859          assign( sV, getMMXReg(eregOfRM(modrm)) );
12860          delta += 3+1;
12861          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12862                                      nameMMXReg(gregOfRM(modrm)));
12863       } else {
12864          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12865          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12866          delta += 3+alen;
12867          DIP("psign%s %s,%s\n", str, dis_buf,
12868                                      nameMMXReg(gregOfRM(modrm)));
12869       }
12870 
12871       putMMXReg(
12872          gregOfRM(modrm),
12873          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
12874       );
12875       goto decode_success;
12876    }
12877 
12878    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
12879    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
12881    if (has_66_pfx
12882        && insn[0] == 0x0F && insn[1] == 0x38
12883        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12884       IRTemp sV      = newTemp(Ity_V128);
12885       IRTemp dV      = newTemp(Ity_V128);
12886       IRTemp sHi     = newTemp(Ity_I64);
12887       IRTemp sLo     = newTemp(Ity_I64);
12888       IRTemp dHi     = newTemp(Ity_I64);
12889       IRTemp dLo     = newTemp(Ity_I64);
12890       const HChar* str = "???";
12891       Int    laneszB = 0;
12892 
12893       switch (insn[2]) {
12894          case 0x08: laneszB = 1; str = "b"; break;
12895          case 0x09: laneszB = 2; str = "w"; break;
12896          case 0x0A: laneszB = 4; str = "d"; break;
12897          default: vassert(0);
12898       }
12899 
12900       modrm = insn[3];
12901       assign( dV, getXMMReg(gregOfRM(modrm)) );
12902 
12903       if (epartIsReg(modrm)) {
12904          assign( sV, getXMMReg(eregOfRM(modrm)) );
12905          delta += 3+1;
12906          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12907                                      nameXMMReg(gregOfRM(modrm)));
12908       } else {
12909          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12910          gen_SEGV_if_not_16_aligned( addr );
12911          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12912          delta += 3+alen;
12913          DIP("psign%s %s,%s\n", str, dis_buf,
12914                                      nameXMMReg(gregOfRM(modrm)));
12915       }
12916 
12917       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12918       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12919       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12920       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12921 
12922       putXMMReg(
12923          gregOfRM(modrm),
12924          binop(Iop_64HLtoV128,
12925                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
12926                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
12927          )
12928       );
12929       goto decode_success;
12930    }
12931 
12932    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
12933    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
12934    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
12935    if (!has_66_pfx
12936        && insn[0] == 0x0F && insn[1] == 0x38
12937        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12938       IRTemp sV      = newTemp(Ity_I64);
12939       const HChar* str = "???";
12940       Int    laneszB = 0;
12941 
12942       switch (insn[2]) {
12943          case 0x1C: laneszB = 1; str = "b"; break;
12944          case 0x1D: laneszB = 2; str = "w"; break;
12945          case 0x1E: laneszB = 4; str = "d"; break;
12946          default: vassert(0);
12947       }
12948 
12949       modrm = insn[3];
12950       do_MMX_preamble();
12951 
12952       if (epartIsReg(modrm)) {
12953          assign( sV, getMMXReg(eregOfRM(modrm)) );
12954          delta += 3+1;
12955          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12956                                     nameMMXReg(gregOfRM(modrm)));
12957       } else {
12958          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12959          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12960          delta += 3+alen;
12961          DIP("pabs%s %s,%s\n", str, dis_buf,
12962                                     nameMMXReg(gregOfRM(modrm)));
12963       }
12964 
12965       putMMXReg(
12966          gregOfRM(modrm),
12967          dis_PABS_helper( mkexpr(sV), laneszB )
12968       );
12969       goto decode_success;
12970    }
12971 
12972    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
12973    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
12974    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
12975    if (has_66_pfx
12976        && insn[0] == 0x0F && insn[1] == 0x38
12977        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12978       IRTemp sV      = newTemp(Ity_V128);
12979       IRTemp sHi     = newTemp(Ity_I64);
12980       IRTemp sLo     = newTemp(Ity_I64);
12981       const HChar* str = "???";
12982       Int    laneszB = 0;
12983 
12984       switch (insn[2]) {
12985          case 0x1C: laneszB = 1; str = "b"; break;
12986          case 0x1D: laneszB = 2; str = "w"; break;
12987          case 0x1E: laneszB = 4; str = "d"; break;
12988          default: vassert(0);
12989       }
12990 
12991       modrm = insn[3];
12992 
12993       if (epartIsReg(modrm)) {
12994          assign( sV, getXMMReg(eregOfRM(modrm)) );
12995          delta += 3+1;
12996          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12997                                     nameXMMReg(gregOfRM(modrm)));
12998       } else {
12999          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
13000          gen_SEGV_if_not_16_aligned( addr );
13001          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13002          delta += 3+alen;
13003          DIP("pabs%s %s,%s\n", str, dis_buf,
13004                                     nameXMMReg(gregOfRM(modrm)));
13005       }
13006 
13007       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
13008       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
13009 
13010       putXMMReg(
13011          gregOfRM(modrm),
13012          binop(Iop_64HLtoV128,
13013                dis_PABS_helper( mkexpr(sHi), laneszB ),
13014                dis_PABS_helper( mkexpr(sLo), laneszB )
13015          )
13016       );
13017       goto decode_success;
13018    }
13019 
13020    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
13021    if (!has_66_pfx
13022        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
13023       IRTemp sV  = newTemp(Ity_I64);
13024       IRTemp dV  = newTemp(Ity_I64);
13025       IRTemp res = newTemp(Ity_I64);
13026 
13027       modrm = insn[3];
13028       do_MMX_preamble();
13029       assign( dV, getMMXReg(gregOfRM(modrm)) );
13030 
13031       if (epartIsReg(modrm)) {
13032          assign( sV, getMMXReg(eregOfRM(modrm)) );
13033          d32 = (UInt)insn[3+1];
13034          delta += 3+1+1;
13035          DIP("palignr $%u,%s,%s\n",  d32,
13036                                      nameMMXReg(eregOfRM(modrm)),
13037                                      nameMMXReg(gregOfRM(modrm)));
13038       } else {
13039          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
13040          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
13041          d32 = (UInt)insn[3+alen];
13042          delta += 3+alen+1;
         DIP("palignr $%u,%s,%s\n", d32,
13044                                    dis_buf,
13045                                    nameMMXReg(gregOfRM(modrm)));
13046       }
13047 
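      /* PALIGNR concatenates dst:src into a 128-bit value, shifts it
         right by imm8 bytes and keeps the low 64 bits.  imm8 can be
         anything up to 255, so build the result case by case from
         64-bit shifts and ORs. */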
13048       if (d32 == 0) {
13049          assign( res, mkexpr(sV) );
13050       }
13051       else if (d32 >= 1 && d32 <= 7) {
13052          assign(res,
13053                 binop(Iop_Or64,
13054                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
13055                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
13056                      )));
13057       }
13058       else if (d32 == 8) {
13059         assign( res, mkexpr(dV) );
13060       }
13061       else if (d32 >= 9 && d32 <= 15) {
13062          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
13063       }
13064       else if (d32 >= 16 && d32 <= 255) {
13065          assign( res, mkU64(0) );
13066       }
13067       else
13068          vassert(0);
13069 
13070       putMMXReg( gregOfRM(modrm), mkexpr(res) );
13071       goto decode_success;
13072    }
13073 
13074    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
13075    if (has_66_pfx
13076        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
13077       IRTemp sV  = newTemp(Ity_V128);
13078       IRTemp dV  = newTemp(Ity_V128);
13079       IRTemp sHi = newTemp(Ity_I64);
13080       IRTemp sLo = newTemp(Ity_I64);
13081       IRTemp dHi = newTemp(Ity_I64);
13082       IRTemp dLo = newTemp(Ity_I64);
13083       IRTemp rHi = newTemp(Ity_I64);
13084       IRTemp rLo = newTemp(Ity_I64);
13085 
13086       modrm = insn[3];
13087       assign( dV, getXMMReg(gregOfRM(modrm)) );
13088 
13089       if (epartIsReg(modrm)) {
13090          assign( sV, getXMMReg(eregOfRM(modrm)) );
13091          d32 = (UInt)insn[3+1];
13092          delta += 3+1+1;
13093          DIP("palignr $%u,%s,%s\n", d32,
13094                                     nameXMMReg(eregOfRM(modrm)),
13095                                     nameXMMReg(gregOfRM(modrm)));
13096       } else {
13097          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
13098          gen_SEGV_if_not_16_aligned( addr );
13099          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13100          d32 = (UInt)insn[3+alen];
13101          delta += 3+alen+1;
13102          DIP("palignr $%u,%s,%s\n", d32,
13103                                     dis_buf,
13104                                     nameXMMReg(gregOfRM(modrm)));
13105       }
13106 
13107       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
13108       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
13109       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
13110       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
13111 
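      /* Here the concatenation dHi:dLo:sHi:sLo is 256 bits wide, so
         compute the two 64-bit halves (rHi:rLo) of the byte-shifted
         result case by case; dis_PALIGNR_XMM_helper merges two
         adjacent 64-bit chunks for the intra-qword shift amounts. */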
13112       if (d32 == 0) {
13113          assign( rHi, mkexpr(sHi) );
13114          assign( rLo, mkexpr(sLo) );
13115       }
13116       else if (d32 >= 1 && d32 <= 7) {
13117          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
13118          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
13119       }
13120       else if (d32 == 8) {
13121          assign( rHi, mkexpr(dLo) );
13122          assign( rLo, mkexpr(sHi) );
13123       }
13124       else if (d32 >= 9 && d32 <= 15) {
13125          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
13126          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
13127       }
13128       else if (d32 == 16) {
13129          assign( rHi, mkexpr(dHi) );
13130          assign( rLo, mkexpr(dLo) );
13131       }
13132       else if (d32 >= 17 && d32 <= 23) {
13133          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
13134          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
13135       }
13136       else if (d32 == 24) {
13137          assign( rHi, mkU64(0) );
13138          assign( rLo, mkexpr(dHi) );
13139       }
13140       else if (d32 >= 25 && d32 <= 31) {
13141          assign( rHi, mkU64(0) );
13142          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
13143       }
13144       else if (d32 >= 32 && d32 <= 255) {
13145          assign( rHi, mkU64(0) );
13146          assign( rLo, mkU64(0) );
13147       }
13148       else
13149          vassert(0);
13150 
13151       putXMMReg(
13152          gregOfRM(modrm),
13153          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
13154       );
13155       goto decode_success;
13156    }
13157 
13158    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
13159    if (!has_66_pfx
13160        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
13161       IRTemp sV      = newTemp(Ity_I64);
13162       IRTemp dV      = newTemp(Ity_I64);
13163 
13164       modrm = insn[3];
13165       do_MMX_preamble();
13166       assign( dV, getMMXReg(gregOfRM(modrm)) );
13167 
13168       if (epartIsReg(modrm)) {
13169          assign( sV, getMMXReg(eregOfRM(modrm)) );
13170          delta += 3+1;
13171          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
13172                                nameMMXReg(gregOfRM(modrm)));
13173       } else {
13174          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
13175          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
13176          delta += 3+alen;
13177          DIP("pshufb %s,%s\n", dis_buf,
13178                                nameMMXReg(gregOfRM(modrm)));
13179       }
13180 
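      /* For the 64-bit form only bits 2:0 of each index byte matter
         (hence the AND with 0x07s); a set bit 7 forces the result
         lane to zero. */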
13181       putMMXReg(
13182          gregOfRM(modrm),
13183          binop(
13184             Iop_And64,
13185             /* permute the lanes */
13186             binop(
13187                Iop_Perm8x8,
13188                mkexpr(dV),
13189                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
13190             ),
13191             /* mask off lanes which have (index & 0x80) == 0x80 */
13192             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
13193          )
13194       );
13195       goto decode_success;
13196    }
13197 
13198    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
13199    if (has_66_pfx
13200        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
13201       IRTemp sV         = newTemp(Ity_V128);
13202       IRTemp dV         = newTemp(Ity_V128);
13203       IRTemp sHi        = newTemp(Ity_I64);
13204       IRTemp sLo        = newTemp(Ity_I64);
13205       IRTemp dHi        = newTemp(Ity_I64);
13206       IRTemp dLo        = newTemp(Ity_I64);
13207       IRTemp rHi        = newTemp(Ity_I64);
13208       IRTemp rLo        = newTemp(Ity_I64);
13209       IRTemp sevens     = newTemp(Ity_I64);
13210       IRTemp mask0x80hi = newTemp(Ity_I64);
13211       IRTemp mask0x80lo = newTemp(Ity_I64);
13212       IRTemp maskBit3hi = newTemp(Ity_I64);
13213       IRTemp maskBit3lo = newTemp(Ity_I64);
13214       IRTemp sAnd7hi    = newTemp(Ity_I64);
13215       IRTemp sAnd7lo    = newTemp(Ity_I64);
13216       IRTemp permdHi    = newTemp(Ity_I64);
13217       IRTemp permdLo    = newTemp(Ity_I64);
13218 
13219       modrm = insn[3];
13220       assign( dV, getXMMReg(gregOfRM(modrm)) );
13221 
13222       if (epartIsReg(modrm)) {
13223          assign( sV, getXMMReg(eregOfRM(modrm)) );
13224          delta += 3+1;
13225          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
13226                                nameXMMReg(gregOfRM(modrm)));
13227       } else {
13228          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
13229          gen_SEGV_if_not_16_aligned( addr );
13230          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13231          delta += 3+alen;
13232          DIP("pshufb %s,%s\n", dis_buf,
13233                                nameXMMReg(gregOfRM(modrm)));
13234       }
13235 
13236       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
13237       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
13238       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
13239       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
13240 
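      /* Iop_Perm8x8 can only permute within a 64-bit value, so the
         128-bit shuffle is built from two 8x8 permutes per result
         half: bits 2:0 of each index byte pick a byte within a qword,
         bit 3 selects the high (dHi) or low (dLo) qword, and bit 7
         forces the result lane to zero. */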
13241       assign( sevens, mkU64(0x0707070707070707ULL) );
13242 
13243       /*
13244       mask0x80hi = Not(SarN8x8(sHi,7))
13245       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
13246       sAnd7hi    = And(sHi,sevens)
13247       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
13248                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
13249       rHi        = And(permdHi,mask0x80hi)
13250       */
13251       assign(
13252          mask0x80hi,
13253          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
13254 
13255       assign(
13256          maskBit3hi,
13257          binop(Iop_SarN8x8,
13258                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
13259                mkU8(7)));
13260 
13261       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
13262 
13263       assign(
13264          permdHi,
13265          binop(
13266             Iop_Or64,
13267             binop(Iop_And64,
13268                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
13269                   mkexpr(maskBit3hi)),
13270             binop(Iop_And64,
13271                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
13272                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
13273 
13274       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
13275 
13276       /* And the same for the lower half of the result.  What fun. */
13277 
13278       assign(
13279          mask0x80lo,
13280          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
13281 
13282       assign(
13283          maskBit3lo,
13284          binop(Iop_SarN8x8,
13285                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
13286                mkU8(7)));
13287 
13288       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
13289 
13290       assign(
13291          permdLo,
13292          binop(
13293             Iop_Or64,
13294             binop(Iop_And64,
13295                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
13296                   mkexpr(maskBit3lo)),
13297             binop(Iop_And64,
13298                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
13299                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
13300 
13301       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
13302 
13303       putXMMReg(
13304          gregOfRM(modrm),
13305          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
13306       );
13307       goto decode_success;
13308    }
13309 
13310    /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
13311    /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
13312    if ((sz == 2 || sz == 4)
13313        && insn[0] == 0x0F && insn[1] == 0x38
13314        && (insn[2] == 0xF0 || insn[2] == 0xF1)
13315        && !epartIsReg(insn[3])) {
13316 
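      /* MOVBE moves data between a register and memory, reversing the
         byte order in transit; there is no register-to-register form,
         hence the !epartIsReg requirement above.  math_BSWAP supplies
         the IR for the byte swap. */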
13317       modrm = insn[3];
13318       addr = disAMode(&alen, sorb, delta + 3, dis_buf);
13319       delta += 3 + alen;
13320       ty = szToITy(sz);
13321       IRTemp src = newTemp(ty);
13322 
13323       if (insn[2] == 0xF0) { /* LOAD */
13324          assign(src, loadLE(ty, mkexpr(addr)));
13325          IRTemp dst = math_BSWAP(src, ty);
13326          putIReg(sz, gregOfRM(modrm), mkexpr(dst));
13327          DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
13328       } else { /* STORE */
13329          assign(src, getIReg(sz, gregOfRM(modrm)));
13330          IRTemp dst = math_BSWAP(src, ty);
13331          storeLE(mkexpr(addr), mkexpr(dst));
13332          DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
13333       }
13334       goto decode_success;
13335    }
13336 
13337    /* ---------------------------------------------------- */
13338    /* --- end of the SSSE3 decoder.                    --- */
13339    /* ---------------------------------------------------- */
13340 
13341    /* ---------------------------------------------------- */
13342    /* --- start of the SSE4 decoder                    --- */
13343    /* ---------------------------------------------------- */
13344 
13345    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
13346       (Partial implementation only -- only deal with cases where
13347       the rounding mode is specified directly by the immediate byte.)
13348       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
13349       (Limitations ditto)
13350    */
13351    if (has_66_pfx
13352        && insn[0] == 0x0F && insn[1] == 0x3A
13353        && (insn[2] == 0x0B || insn[2] == 0x0A)) {
13354 
13355       Bool   isD = insn[2] == 0x0B;
13356       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
13357       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
13358       Int    imm = 0;
13359 
13360       modrm = insn[3];
13361 
13362       if (epartIsReg(modrm)) {
13363          assign( src,
13364                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
13365                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
13366          imm = insn[3+1];
13367          if (imm & ~3) goto decode_failure;
13368          delta += 3+1+1;
13369          DIP( "rounds%c $%d,%s,%s\n",
13370               isD ? 'd' : 's',
13371               imm, nameXMMReg( eregOfRM(modrm) ),
13372                    nameXMMReg( gregOfRM(modrm) ) );
13373       } else {
13374          addr = disAMode( &alen, sorb, delta+3, dis_buf );
13375          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
13376          imm = insn[3+alen];
13377          if (imm & ~3) goto decode_failure;
13378          delta += 3+alen+1;
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
13381       }
13382 
13383       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
13384          that encoding is the same as the encoding for IRRoundingMode,
13385          we can use that value directly in the IR as a rounding
13386          mode. */
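      /* In both encodings: 0 = round to nearest (ties to even),
         1 = round down (towards -inf), 2 = round up (towards +inf),
         3 = round towards zero (truncate). */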
13387       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
13388                   mkU32(imm & 3), mkexpr(src)) );
13389 
13390       if (isD)
13391          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
13392       else
13393          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
13394 
13395       goto decode_success;
13396    }
13397 
   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
13399       which we can only decode if we're sure this is an AMD cpu that
13400       supports LZCNT, since otherwise it's BSR, which behaves
13401       differently. */
13402    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
13403        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
13404       vassert(sz == 2 || sz == 4);
13405       /*IRType*/ ty  = szToITy(sz);
13406       IRTemp     src = newTemp(ty);
13407       modrm = insn[3];
13408       if (epartIsReg(modrm)) {
13409          assign(src, getIReg(sz, eregOfRM(modrm)));
13410          delta += 3+1;
13411          DIP("lzcnt%c %s, %s\n", nameISize(sz),
13412              nameIReg(sz, eregOfRM(modrm)),
13413              nameIReg(sz, gregOfRM(modrm)));
13414       } else {
13415          addr = disAMode( &alen, sorb, delta+3, dis_buf );
13416          assign(src, loadLE(ty, mkexpr(addr)));
13417          delta += 3+alen;
13418          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
13419              nameIReg(sz, gregOfRM(modrm)));
13420       }
13421 
13422       IRTemp res = gen_LZCNT(ty, src);
13423       putIReg(sz, gregOfRM(modrm), mkexpr(res));
13424 
13425       // Update flags.  This is pretty lame .. perhaps can do better
13426       // if this turns out to be performance critical.
13427       // O S A P are cleared.  Z is set if RESULT == 0.
13428       // C is set if SRC is zero.
13429       IRTemp src32 = newTemp(Ity_I32);
13430       IRTemp res32 = newTemp(Ity_I32);
13431       assign(src32, widenUto32(mkexpr(src)));
13432       assign(res32, widenUto32(mkexpr(res)));
13433 
13434       IRTemp oszacp = newTemp(Ity_I32);
13435       assign(
13436          oszacp,
13437          binop(Iop_Or32,
13438                binop(Iop_Shl32,
13439                      unop(Iop_1Uto32,
13440                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
13441                      mkU8(X86G_CC_SHIFT_Z)),
13442                binop(Iop_Shl32,
13443                      unop(Iop_1Uto32,
13444                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
13445                      mkU8(X86G_CC_SHIFT_C))
13446          )
13447       );
13448 
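      /* With CC_OP_COPY the flags thunk treats DEP1 as holding the
         literal %eflags bits, so writing the Z/C mask computed above
         into DEP1 sets the flags directly. */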
13449       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13450       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13451       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13452       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
13453 
13454       goto decode_success;
13455    }
13456 
13457    /* ---------------------------------------------------- */
13458    /* --- end of the SSE4 decoder                      --- */
13459    /* ---------------------------------------------------- */
13460 
13461    after_sse_decoders:
13462 
13463    /* ---------------------------------------------------- */
13464    /* --- deal with misc 0x67 pfxs (addr size override) -- */
13465    /* ---------------------------------------------------- */
13466 
13467    /* 67 E3 = JCXZ (for JECXZ see below) */
13468    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
13469       delta += 2;
13470       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13471       delta ++;
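      /* The 0x67 prefix means the count register is CX, not ECX,
         hence the 16-bit compare. */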
13472       stmt( IRStmt_Exit(
13473                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
13474                Ijk_Boring,
13475                IRConst_U32(d32),
13476                OFFB_EIP
13477             ));
13478        if (vex_control.strict_block_end) {
13479           jmp_lit(&dres, Ijk_Boring, ((Addr32)guest_EIP_bbstart)+delta);
13480        }
13481        DIP("jcxz 0x%x\n", d32);
13482        goto decode_success;
13483    }
13484 
13485    /* ---------------------------------------------------- */
13486    /* --- start of the baseline insn decoder            -- */
13487    /* ---------------------------------------------------- */
13488 
13489    /* Get the primary opcode. */
13490    opc = getIByte(delta); delta++;
13491 
13492    /* We get here if the current insn isn't SSE, or this CPU doesn't
13493       support SSE. */
13494 
13495    switch (opc) {
13496 
13497    /* ------------------------ Control flow --------------- */
13498 
13499    case 0xC2: /* RET imm16 */
13500       d32 = getUDisp16(delta);
13501       delta += 2;
13502       dis_ret(&dres, d32);
13503       DIP("ret %u\n", d32);
13504       break;
13505    case 0xC3: /* RET */
13506       dis_ret(&dres, 0);
13507       DIP("ret\n");
13508       break;
13509 
13510    case 0xCF: /* IRET */
13511       /* Note, this is an extremely kludgey and limited implementation
13512          of iret.  All it really does is:
13513             popl %EIP; popl %CS; popl %EFLAGS.
         %CS is set but ignored (as it is in (eg) popw %cs). */
13515       t1 = newTemp(Ity_I32); /* ESP */
13516       t2 = newTemp(Ity_I32); /* new EIP */
13517       t3 = newTemp(Ity_I32); /* new CS */
13518       t4 = newTemp(Ity_I32); /* new EFLAGS */
13519       assign(t1, getIReg(4,R_ESP));
13520       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
13521       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
13522       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
13523       /* Get stuff off stack */
13524       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
13525       /* set %CS (which is ignored anyway) */
13526       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
13527       /* set %EFLAGS */
13528       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
13529       /* goto new EIP value */
13530       jmp_treg(&dres, Ijk_Ret, t2);
13531       vassert(dres.whatNext == Dis_StopHere);
13532       DIP("iret (very kludgey)\n");
13533       break;
13534 
13535    case 0xE8: /* CALL J4 */
13536       d32 = getUDisp32(delta); delta += 4;
13537       d32 += (guest_EIP_bbstart+delta);
      /* (guest_EIP_bbstart+delta) == return-to addr, d32 == call-to addr */
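      /* Detect the 'call next-insn ; popl %reg' idiom: the call
         target equals the fall-through address and the next opcode
         byte is a POP of a 32-bit register (0x58 .. 0x5F). */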
13539       Bool isPICIdiom = d32 == guest_EIP_bbstart+delta
13540                             && getIByte(delta) >= 0x58
13541                             && getIByte(delta) <= 0x5F;
13542       if (isPICIdiom && vex_control.x86_optimize_callpop_idiom) {
13543          /* Specially treat the position-independent-code idiom
13544                  call X
13545               X: popl %reg
13546             as
13547                  movl %eip, %reg.
13548             since this generates better code, but for no other reason. */
13549          Int archReg = getIByte(delta) - 0x58;
13550          /* vex_printf("-- fPIC thingy\n"); */
13551          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
13552          delta++; /* Step over the POP */
13553          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
13554       } else {
13555          /* The normal sequence for a call. */
13556          t1 = newTemp(Ity_I32);
13557          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
13558          putIReg(4, R_ESP, mkexpr(t1));
13559          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
13560          if (resteerOkFn( callback_opaque, (Addr32)d32 )) {
13561             /* follow into the call target. */
13562             dres.whatNext   = Dis_ResteerU;
13563             dres.continueAt = (Addr32)d32;
13564          } else {
13565             jmp_lit(&dres, isPICIdiom ? Ijk_Boring : Ijk_Call, d32);
13566             vassert(dres.whatNext == Dis_StopHere);
13567          }
13568          DIP("call 0x%x\n",d32);
13569       }
13570       break;
13571 
13572 //--    case 0xC8: /* ENTER */
13573 //--       d32 = getUDisp16(eip); eip += 2;
13574 //--       abyte = getIByte(delta); delta++;
13575 //--
13576 //--       vg_assert(sz == 4);
13577 //--       vg_assert(abyte == 0);
13578 //--
13579 //--       t1 = newTemp(cb); t2 = newTemp(cb);
13580 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
13581 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
13582 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
13583 //--       uLiteral(cb, sz);
13584 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
13585 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
13586 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
13587 //--       if (d32) {
13588 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
13589 //--          uLiteral(cb, d32);
13590 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
13591 //--       }
13592 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
13593 //--       break;
13594 
13595    case 0xC9: /* LEAVE */
13596       vassert(sz == 4);
13597       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13598       assign(t1, getIReg(4,R_EBP));
13599       /* First PUT ESP looks redundant, but need it because ESP must
13600          always be up-to-date for Memcheck to work... */
13601       putIReg(4, R_ESP, mkexpr(t1));
13602       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
13603       putIReg(4, R_EBP, mkexpr(t2));
13604       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
13605       DIP("leave\n");
13606       break;
13607 
13608    /* ---------------- Misc weird-ass insns --------------- */
13609 
13610    case 0x27: /* DAA */
13611    case 0x2F: /* DAS */
13612    case 0x37: /* AAA */
13613    case 0x3F: /* AAS */
      /* An ugly implementation for some ugly instructions.  Oh
         well. */
13616       if (sz != 4) goto decode_failure;
13617       t1 = newTemp(Ity_I32);
13618       t2 = newTemp(Ity_I32);
13619       /* Make up a 32-bit value (t1), with the old value of AX in the
13620          bottom 16 bits, and the old OSZACP bitmask in the upper 16
13621          bits. */
13622       assign(t1,
13623              binop(Iop_16HLto32,
13624                    unop(Iop_32to16,
13625                         mk_x86g_calculate_eflags_all()),
13626                    getIReg(2, R_EAX)
13627             ));
13628       /* Call the helper fn, to get a new AX and OSZACP value, and
13629          poke both back into the guest state.  Also pass the helper
13630          the actual opcode so it knows which of the 4 instructions it
13631          is doing the computation for. */
13632       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
13633       assign(t2,
13634               mkIRExprCCall(
13635                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
13636                  &x86g_calculate_daa_das_aaa_aas,
13637                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13638             ));
13639      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13640 
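     /* The helper returns the new AX in the low 16 bits of t2 (stored
        just above) and the new OSZACP bits in the upper 16, which are
        copied into the flags thunk below via the OP_COPY scheme. */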
13641      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13642      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13643      stmt( IRStmt_Put( OFFB_CC_DEP1,
13644                        binop(Iop_And32,
13645                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13646                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13647                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
13648                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
13649                             )
13650                       )
13651          );
13652      /* Set NDEP even though it isn't used.  This makes redundant-PUT
13653         elimination of previous stores to this field work better. */
13654      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13655      switch (opc) {
13656         case 0x27: DIP("daa\n"); break;
13657         case 0x2F: DIP("das\n"); break;
13658         case 0x37: DIP("aaa\n"); break;
13659         case 0x3F: DIP("aas\n"); break;
13660         default: vassert(0);
13661      }
13662      break;
13663 
13664    case 0xD4: /* AAM */
13665    case 0xD5: /* AAD */
13666       d32 = getIByte(delta); delta++;
13667       if (sz != 4 || d32 != 10) goto decode_failure;
13668       t1 = newTemp(Ity_I32);
13669       t2 = newTemp(Ity_I32);
13670       /* Make up a 32-bit value (t1), with the old value of AX in the
13671          bottom 16 bits, and the old OSZACP bitmask in the upper 16
13672          bits. */
13673       assign(t1,
13674              binop(Iop_16HLto32,
13675                    unop(Iop_32to16,
13676                         mk_x86g_calculate_eflags_all()),
13677                    getIReg(2, R_EAX)
13678             ));
13679       /* Call the helper fn, to get a new AX and OSZACP value, and
13680          poke both back into the guest state.  Also pass the helper
13681          the actual opcode so it knows which of the 2 instructions it
13682          is doing the computation for. */
13683       assign(t2,
13684               mkIRExprCCall(
13685                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
13686                  &x86g_calculate_aad_aam,
13687                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13688             ));
13689       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13690 
13691       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13692       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13693       stmt( IRStmt_Put( OFFB_CC_DEP1,
13694                         binop(Iop_And32,
13695                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13696                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13697                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
13698                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
13699                              )
13700                        )
13701           );
13702       /* Set NDEP even though it isn't used.  This makes
13703          redundant-PUT elimination of previous stores to this field
13704          work better. */
13705       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13706 
13707       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
13708       break;
13709 
13710    /* ------------------------ CWD/CDQ -------------------- */
13711 
   case 0x98: /* CBW/CWDE */
13713       if (sz == 4) {
13714          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
13715          DIP("cwde\n");
13716       } else {
13717          vassert(sz == 2);
13718          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
13719          DIP("cbw\n");
13720       }
13721       break;
13722 
13723    case 0x99: /* CWD/CDQ */
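      /* Sign-extend eAX into eDX:eAX by filling eDX with copies of
         eAX's sign bit, i.e. an arithmetic shift right by 15 (CWD)
         or 31 (CDQ). */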
13724       ty = szToITy(sz);
13725       putIReg(sz, R_EDX,
13726                   binop(mkSizedOp(ty,Iop_Sar8),
13727                         getIReg(sz, R_EAX),
13728                         mkU8(sz == 2 ? 15 : 31)) );
      DIP(sz == 2 ? "cwd\n" : "cdq\n");
13730       break;
13731 
13732    /* ------------------------ FPU ops -------------------- */
13733 
13734    case 0x9E: /* SAHF */
13735       codegen_SAHF();
13736       DIP("sahf\n");
13737       break;
13738 
13739    case 0x9F: /* LAHF */
13740       codegen_LAHF();
13741       DIP("lahf\n");
13742       break;
13743 
13744    case 0x9B: /* FWAIT */
13745       /* ignore? */
13746       DIP("fwait\n");
13747       break;
13748 
13749    case 0xD8:
13750    case 0xD9:
13751    case 0xDA:
13752    case 0xDB:
13753    case 0xDC:
13754    case 0xDD:
13755    case 0xDE:
13756    case 0xDF: {
13757       Int  delta0    = delta;
13758       Bool decode_OK = False;
13759       delta = dis_FPU ( &decode_OK, sorb, delta );
13760       if (!decode_OK) {
13761          delta = delta0;
13762          goto decode_failure;
13763       }
13764       break;
13765    }
13766 
13767    /* ------------------------ INC & DEC ------------------ */
13768 
13769    case 0x40: /* INC eAX */
13770    case 0x41: /* INC eCX */
13771    case 0x42: /* INC eDX */
13772    case 0x43: /* INC eBX */
13773    case 0x44: /* INC eSP */
13774    case 0x45: /* INC eBP */
13775    case 0x46: /* INC eSI */
13776    case 0x47: /* INC eDI */
13777       vassert(sz == 2 || sz == 4);
13778       ty = szToITy(sz);
13779       t1 = newTemp(ty);
13780       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
13781                         getIReg(sz, (UInt)(opc - 0x40)),
13782                         mkU(ty,1)) );
13783       setFlags_INC_DEC( True, t1, ty );
13784       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
13785       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
13786       break;
13787 
13788    case 0x48: /* DEC eAX */
13789    case 0x49: /* DEC eCX */
13790    case 0x4A: /* DEC eDX */
13791    case 0x4B: /* DEC eBX */
13792    case 0x4C: /* DEC eSP */
13793    case 0x4D: /* DEC eBP */
13794    case 0x4E: /* DEC eSI */
13795    case 0x4F: /* DEC eDI */
13796       vassert(sz == 2 || sz == 4);
13797       ty = szToITy(sz);
13798       t1 = newTemp(ty);
13799       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
13800                         getIReg(sz, (UInt)(opc - 0x48)),
13801                         mkU(ty,1)) );
13802       setFlags_INC_DEC( False, t1, ty );
13803       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
13804       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
13805       break;
13806 
13807    /* ------------------------ INT ------------------------ */
13808 
13809    case 0xCC: /* INT 3 */
13810       jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
13811       vassert(dres.whatNext == Dis_StopHere);
13812       DIP("int $0x3\n");
13813       break;
13814 
13815    case 0xCD: /* INT imm8 */
13816       d32 = getIByte(delta); delta++;
13817 
13818       /* For any of the cases where we emit a jump (that is, for all
13819          currently handled cases), it's important that all ArchRegs
13820          carry their up-to-date value at this point.  So we declare an
13821          end-of-block here, which forces any TempRegs caching ArchRegs
13822          to be flushed. */
13823 
13824       /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
13825          restart of this instruction (hence the "-2" two lines below,
         to get the restart EIP to be this instruction).  This is
13827          probably Linux-specific and it would be more correct to only
13828          do this if the VexAbiInfo says that is what we should do.
13829          This used to handle just 0x40-0x43; Jikes RVM uses a larger
13830          range (0x3F-0x49), and this allows some slack as well. */
13831       if (d32 >= 0x3F && d32 <= 0x4F) {
13832          jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
13833          vassert(dres.whatNext == Dis_StopHere);
13834          DIP("int $0x%x\n", d32);
13835          break;
13836       }
13837 
13838       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
13839          (darwin syscalls), int $0x91 (Solaris syscalls) and int $0xD2
13840          (Solaris fasttrap syscalls).  As part of this, note where we are, so we
13841          can back up the guest to this point if the syscall needs to
13842          be restarted. */
13843       IRJumpKind jump_kind;
13844       switch (d32) {
13845       case 0x80:
13846          jump_kind = Ijk_Sys_int128;
13847          break;
13848       case 0x81:
13849          jump_kind = Ijk_Sys_int129;
13850          break;
13851       case 0x82:
13852          jump_kind = Ijk_Sys_int130;
13853          break;
13854       case 0x91:
13855          jump_kind = Ijk_Sys_int145;
13856          break;
13857       case 0xD2:
13858          jump_kind = Ijk_Sys_int210;
13859          break;
13860       default:
13861          /* none of the above */
13862          goto decode_failure;
13863       }
13864 
13865       stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13866                         mkU32(guest_EIP_curr_instr) ) );
13867       jmp_lit(&dres, jump_kind, ((Addr32)guest_EIP_bbstart)+delta);
13868       vassert(dres.whatNext == Dis_StopHere);
13869       DIP("int $0x%x\n", d32);
13870       break;
13871 
13872    /* ------------------------ Jcond, byte offset --------- */
13873 
13874    case 0xEB: /* Jb (jump, byte offset) */
13875       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13876       delta++;
13877       if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
13878          dres.whatNext   = Dis_ResteerU;
13879          dres.continueAt = (Addr32)d32;
13880       } else {
13881          jmp_lit(&dres, Ijk_Boring, d32);
13882          vassert(dres.whatNext == Dis_StopHere);
13883       }
13884       DIP("jmp-8 0x%x\n", d32);
13885       break;
13886 
13887    case 0xEA: {/* jump far, 16/32 address */
13888       vassert(sz == 4 || sz == 2);
13889       UInt addr_offset = getUDisp(sz, delta);
13890       delta += sz;
13891       UInt selector = getUDisp16(delta);
13892       delta += 2;
13893 
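      /* The target linear address is computed from the 16-bit
         selector and the 16/32-bit offset by handleSegOverrideAux;
         note that %cs itself is not updated. */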
13894       ty = szToITy(sz);
13895       IRTemp final_addr = newTemp(Ity_I32);
13896       IRTemp tmp_selector = newTemp(Ity_I32);
13897       IRTemp tmp_addr_offset = newTemp(ty);
13898       assign(tmp_selector, mkU32(selector));
13899       assign(tmp_addr_offset, sz == 4 ? mkU32(addr_offset) : mkU16(addr_offset));
13900       assign(final_addr, handleSegOverrideAux(tmp_selector, mkexpr(tmp_addr_offset)));
13901 
13902       jmp_treg(&dres, Ijk_Boring, final_addr);
13903       vassert(dres.whatNext == Dis_StopHere);
13904       break;
13905    }
13906    case 0xE9: /* Jv (jump, 16/32 offset) */
13907       vassert(sz == 4 || sz == 2);
13908       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
13909       delta += sz;
13910       if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
13911          dres.whatNext   = Dis_ResteerU;
13912          dres.continueAt = (Addr32)d32;
13913       } else {
13914          jmp_lit(&dres, Ijk_Boring, d32);
13915          vassert(dres.whatNext == Dis_StopHere);
13916       }
13917       DIP("jmp 0x%x\n", d32);
13918       break;
13919 
   case 0x70: /* JOb (jump overflow) */
   case 0x71: /* JNOb (jump no overflow) */
13922    case 0x72: /* JBb/JNAEb (jump below) */
13923    case 0x73: /* JNBb/JAEb (jump not below) */
13924    case 0x74: /* JZb/JEb (jump zero) */
13925    case 0x75: /* JNZb/JNEb (jump not zero) */
13926    case 0x76: /* JBEb/JNAb (jump below or equal) */
13927    case 0x77: /* JNBEb/JAb (jump not below or equal) */
13928    case 0x78: /* JSb (jump negative) */
   case 0x79: /* JNSb (jump not negative) */
   case 0x7A: /* JPb/JPEb (jump parity even) */
13931    case 0x7B: /* JNP/JPO (jump parity odd) */
13932    case 0x7C: /* JLb/JNGEb (jump less) */
13933    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
13934    case 0x7E: /* JLEb/JNGb (jump less or equal) */
13935    case 0x7F: /* JGb/JNLEb (jump greater) */
13936     { Int    jmpDelta;
13937       const HChar* comment  = "";
13938       jmpDelta = (Int)getSDisp8(delta);
13939       vassert(-128 <= jmpDelta && jmpDelta < 128);
13940       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
13941       delta++;
13942       if (resteerCisOk
13943           && vex_control.guest_chase_cond
13944           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13945           && jmpDelta < 0
13946           && resteerOkFn( callback_opaque, (Addr32)d32) ) {
13947          /* Speculation: assume this backward branch is taken.  So we
13948             need to emit a side-exit to the insn following this one,
13949             on the negation of the condition, and continue at the
13950             branch target address (d32).  If we wind up back at the
13951             first instruction of the trace, just stop; it's better to
13952             let the IR loop unroller handle that case. */
13953          stmt( IRStmt_Exit(
13954                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
13955                   Ijk_Boring,
13956                   IRConst_U32(guest_EIP_bbstart+delta),
13957                   OFFB_EIP ) );
13958          dres.whatNext   = Dis_ResteerC;
13959          dres.continueAt = (Addr32)d32;
13960          comment = "(assumed taken)";
13961       }
13962       else
13963       if (resteerCisOk
13964           && vex_control.guest_chase_cond
13965           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13966           && jmpDelta >= 0
13967           && resteerOkFn( callback_opaque,
13968                           (Addr32)(guest_EIP_bbstart+delta)) ) {
13969          /* Speculation: assume this forward branch is not taken.  So
13970             we need to emit a side-exit to d32 (the dest) and continue
13971             disassembling at the insn immediately following this
13972             one. */
13973          stmt( IRStmt_Exit(
13974                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
13975                   Ijk_Boring,
13976                   IRConst_U32(d32),
13977                   OFFB_EIP ) );
13978          dres.whatNext   = Dis_ResteerC;
13979          dres.continueAt = guest_EIP_bbstart + delta;
13980          comment = "(assumed not taken)";
13981       }
13982       else {
13983          /* Conservative default translation - end the block at this
13984             point. */
13985          jcc_01( &dres, (X86Condcode)(opc - 0x70),
13986                  (Addr32)(guest_EIP_bbstart+delta), d32);
13987          vassert(dres.whatNext == Dis_StopHere);
13988       }
13989       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
13990       break;
13991     }
13992 
13993    case 0xE3: /* JECXZ (for JCXZ see above) */
13994       if (sz != 4) goto decode_failure;
13995       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13996       delta ++;
13997       stmt( IRStmt_Exit(
13998                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
13999             Ijk_Boring,
14000             IRConst_U32(d32),
14001             OFFB_EIP
14002           ));
14003       if (vex_control.strict_block_end) {
14004          jmp_lit(&dres, Ijk_Boring, ((Addr32)guest_EIP_bbstart)+delta);
14005       }
14006       DIP("jecxz 0x%x\n", d32);
14007       break;
14008 
14009    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
14010    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
14011    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
14012     { /* Again, the docs say this uses ECX/CX as a count depending on
14013          the address size override, not the operand one.  Since we
14014          don't handle address size overrides, I guess that means
14015          ECX. */
14016       IRExpr* zbit  = NULL;
14017       IRExpr* count = NULL;
14018       IRExpr* cond  = NULL;
14019       const HChar* xtra = NULL;
14020 
14021       if (sz != 4) goto decode_failure;
14022       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
14023       delta++;
14024       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
14025 
14026       count = getIReg(4,R_ECX);
14027       cond = binop(Iop_CmpNE32, count, mkU32(0));
14028       switch (opc) {
14029          case 0xE2:
14030             xtra = "";
14031             break;
14032          case 0xE1:
14033             xtra = "e";
14034             zbit = mk_x86g_calculate_condition( X86CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_x86g_calculate_condition( X86CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
14044       }
14045       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
14046 
14047       if (vex_control.strict_block_end) {
14048          jmp_lit(&dres, Ijk_Boring, ((Addr32)guest_EIP_bbstart)+delta);
14049       }
14050 
14051       DIP("loop%s 0x%x\n", xtra, d32);
14052       break;
14053     }
14054 
14055    /* ------------------------ IMUL ----------------------- */
14056 
14057    case 0x69: /* IMUL Iv, Ev, Gv */
14058       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
14059       break;
14060    case 0x6B: /* IMUL Ib, Ev, Gv */
14061       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
14062       break;
14063 
14064    /* ------------------------ MOV ------------------------ */
14065 
14066    case 0x88: /* MOV Gb,Eb */
14067       delta = dis_mov_G_E(sorb, 1, delta);
14068       break;
14069 
14070    case 0x89: /* MOV Gv,Ev */
14071       delta = dis_mov_G_E(sorb, sz, delta);
14072       break;
14073 
14074    case 0x8A: /* MOV Eb,Gb */
14075       delta = dis_mov_E_G(sorb, 1, delta);
14076       break;
14077 
14078    case 0x8B: /* MOV Ev,Gv */
14079       delta = dis_mov_E_G(sorb, sz, delta);
14080       break;
14081 
14082    case 0x8D: /* LEA M,Gv */
14083       if (sz != 4)
14084          goto decode_failure;
14085       modrm = getIByte(delta);
14086       if (epartIsReg(modrm))
14087          goto decode_failure;
14088       /* NOTE!  this is the one place where a segment override prefix
14089          has no effect on the address calculation.  Therefore we pass
14090          zero instead of sorb here. */
14091       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
14092       delta += alen;
14093       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
14094       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
14095                             nameIReg(sz,gregOfRM(modrm)));
14096       break;
14097 
14098    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
14099       delta = dis_mov_Sw_Ew(sorb, sz, delta);
14100       break;
14101 
14102    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
14103       delta = dis_mov_Ew_Sw(sorb, delta);
14104       break;
14105 
14106    case 0xA0: /* MOV Ob,AL */
14107       sz = 1;
14108       /* Fall through ... */
14109    case 0xA1: /* MOV Ov,eAX */
14110       d32 = getUDisp32(delta); delta += 4;
14111       ty = szToITy(sz);
14112       addr = newTemp(Ity_I32);
14113       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
14114       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
14115       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
14116                                 d32, nameIReg(sz,R_EAX));
14117       break;
14118 
   case 0xA2: /* MOV AL,Ob */
14120       sz = 1;
14121       /* Fall through ... */
14122    case 0xA3: /* MOV eAX,Ov */
14123       d32 = getUDisp32(delta); delta += 4;
14124       ty = szToITy(sz);
14125       addr = newTemp(Ity_I32);
14126       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
14127       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
14128       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
14129                                 sorbTxt(sorb), d32);
14130       break;
14131 
14132    case 0xB0: /* MOV imm,AL */
14133    case 0xB1: /* MOV imm,CL */
14134    case 0xB2: /* MOV imm,DL */
14135    case 0xB3: /* MOV imm,BL */
14136    case 0xB4: /* MOV imm,AH */
14137    case 0xB5: /* MOV imm,CH */
14138    case 0xB6: /* MOV imm,DH */
14139    case 0xB7: /* MOV imm,BH */
14140       d32 = getIByte(delta); delta += 1;
14141       putIReg(1, opc-0xB0, mkU8(d32));
14142       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
14143       break;
14144 
14145    case 0xB8: /* MOV imm,eAX */
14146    case 0xB9: /* MOV imm,eCX */
14147    case 0xBA: /* MOV imm,eDX */
14148    case 0xBB: /* MOV imm,eBX */
14149    case 0xBC: /* MOV imm,eSP */
14150    case 0xBD: /* MOV imm,eBP */
14151    case 0xBE: /* MOV imm,eSI */
14152    case 0xBF: /* MOV imm,eDI */
14153       d32 = getUDisp(sz,delta); delta += sz;
14154       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
14155       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
14156       break;
14157 
14158    case 0xC6: /* C6 /0 = MOV Ib,Eb */
14159       sz = 1;
14160       goto maybe_do_Mov_I_E;
14161    case 0xC7: /* C7 /0 = MOV Iv,Ev */
14162       goto maybe_do_Mov_I_E;
14163 
14164    maybe_do_Mov_I_E:
14165       modrm = getIByte(delta);
14166       if (gregOfRM(modrm) == 0) {
14167          if (epartIsReg(modrm)) {
14168             delta++; /* mod/rm byte */
14169             d32 = getUDisp(sz,delta); delta += sz;
14170             putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
14171             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
14172                                      nameIReg(sz,eregOfRM(modrm)));
14173          } else {
14174             addr = disAMode ( &alen, sorb, delta, dis_buf );
14175             delta += alen;
14176             d32 = getUDisp(sz,delta); delta += sz;
14177             storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
14178             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
14179          }
14180          break;
14181       }
14182       goto decode_failure;
14183 
14184    /* ------------------------ opl imm, A ----------------- */
14185 
14186    case 0x04: /* ADD Ib, AL */
14187       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
14188       break;
14189    case 0x05: /* ADD Iv, eAX */
14190       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
14191       break;
14192 
14193    case 0x0C: /* OR Ib, AL */
14194       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
14195       break;
14196    case 0x0D: /* OR Iv, eAX */
14197       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
14198       break;
14199 
14200    case 0x14: /* ADC Ib, AL */
14201       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
14202       break;
14203    case 0x15: /* ADC Iv, eAX */
14204       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
14205       break;
14206 
14207    case 0x1C: /* SBB Ib, AL */
14208       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
14209       break;
14210    case 0x1D: /* SBB Iv, eAX */
14211       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
14212       break;
14213 
14214    case 0x24: /* AND Ib, AL */
14215       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
14216       break;
14217    case 0x25: /* AND Iv, eAX */
14218       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
14219       break;
14220 
14221    case 0x2C: /* SUB Ib, AL */
14222       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
14223       break;
14224    case 0x2D: /* SUB Iv, eAX */
14225       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
14226       break;
14227 
14228    case 0x34: /* XOR Ib, AL */
14229       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
14230       break;
14231    case 0x35: /* XOR Iv, eAX */
14232       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
14233       break;
14234 
14235    case 0x3C: /* CMP Ib, AL */
14236       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
14237       break;
14238    case 0x3D: /* CMP Iv, eAX */
14239       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
14240       break;
14241 
14242    case 0xA8: /* TEST Ib, AL */
14243       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
14244       break;
14245    case 0xA9: /* TEST Iv, eAX */
14246       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
14247       break;
14248 
14249    /* ------------------------ opl Ev, Gv ----------------- */
14250 
14251    case 0x02: /* ADD Eb,Gb */
14252       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
14253       break;
14254    case 0x03: /* ADD Ev,Gv */
14255       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
14256       break;
14257 
14258    case 0x0A: /* OR Eb,Gb */
14259       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
14260       break;
14261    case 0x0B: /* OR Ev,Gv */
14262       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
14263       break;
14264 
14265    case 0x12: /* ADC Eb,Gb */
14266       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
14267       break;
14268    case 0x13: /* ADC Ev,Gv */
14269       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
14270       break;
14271 
14272    case 0x1A: /* SBB Eb,Gb */
14273       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
14274       break;
14275    case 0x1B: /* SBB Ev,Gv */
14276       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
14277       break;
14278 
14279    case 0x22: /* AND Eb,Gb */
14280       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
14281       break;
14282    case 0x23: /* AND Ev,Gv */
14283       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
14284       break;
14285 
14286    case 0x2A: /* SUB Eb,Gb */
14287       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
14288       break;
14289    case 0x2B: /* SUB Ev,Gv */
14290       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
14291       break;
14292 
14293    case 0x32: /* XOR Eb,Gb */
14294       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
14295       break;
14296    case 0x33: /* XOR Ev,Gv */
14297       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
14298       break;
14299 
14300    case 0x3A: /* CMP Eb,Gb */
14301       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
14302       break;
14303    case 0x3B: /* CMP Ev,Gv */
14304       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
14305       break;
14306 
14307    case 0x84: /* TEST Eb,Gb */
14308       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
14309       break;
14310    case 0x85: /* TEST Ev,Gv */
14311       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
14312       break;
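   /* Naming convention (judging by the opcode assignments): the
      dis_op2_E_G cases above compute G := G <op> E, so the destination
      is always a register, whereas the dis_op2_G_E cases below compute
      E := E <op> G, where E may be a memory operand -- which is why
      only the G_E variant is told about a LOCK prefix. */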
14313 
14314    /* ------------------------ opl Gv, Ev ----------------- */
14315 
14316    case 0x00: /* ADD Gb,Eb */
14317       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14318                             Iop_Add8, True, 1, delta, "add" );
14319       break;
14320    case 0x01: /* ADD Gv,Ev */
14321       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14322                             Iop_Add8, True, sz, delta, "add" );
14323       break;
14324 
14325    case 0x08: /* OR Gb,Eb */
14326       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14327                             Iop_Or8, True, 1, delta, "or" );
14328       break;
14329    case 0x09: /* OR Gv,Ev */
14330       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14331                             Iop_Or8, True, sz, delta, "or" );
14332       break;
14333 
14334    case 0x10: /* ADC Gb,Eb */
14335       delta = dis_op2_G_E ( sorb, pfx_lock, True,
14336                             Iop_Add8, True, 1, delta, "adc" );
14337       break;
14338    case 0x11: /* ADC Gv,Ev */
14339       delta = dis_op2_G_E ( sorb, pfx_lock, True,
14340                             Iop_Add8, True, sz, delta, "adc" );
14341       break;
14342 
14343    case 0x18: /* SBB Gb,Eb */
14344       delta = dis_op2_G_E ( sorb, pfx_lock, True,
14345                             Iop_Sub8, True, 1, delta, "sbb" );
14346       break;
14347    case 0x19: /* SBB Gv,Ev */
14348       delta = dis_op2_G_E ( sorb, pfx_lock, True,
14349                             Iop_Sub8, True, sz, delta, "sbb" );
14350       break;
14351 
14352    case 0x20: /* AND Gb,Eb */
14353       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14354                             Iop_And8, True, 1, delta, "and" );
14355       break;
14356    case 0x21: /* AND Gv,Ev */
14357       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14358                             Iop_And8, True, sz, delta, "and" );
14359       break;
14360 
14361    case 0x28: /* SUB Gb,Eb */
14362       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14363                             Iop_Sub8, True, 1, delta, "sub" );
14364       break;
14365    case 0x29: /* SUB Gv,Ev */
14366       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14367                             Iop_Sub8, True, sz, delta, "sub" );
14368       break;
14369 
14370    case 0x30: /* XOR Gb,Eb */
14371       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14372                             Iop_Xor8, True, 1, delta, "xor" );
14373       break;
14374    case 0x31: /* XOR Gv,Ev */
14375       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14376                             Iop_Xor8, True, sz, delta, "xor" );
14377       break;
14378 
14379    case 0x38: /* CMP Gb,Eb */
14380       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14381                             Iop_Sub8, False, 1, delta, "cmp" );
14382       break;
14383    case 0x39: /* CMP Gv,Ev */
14384       delta = dis_op2_G_E ( sorb, pfx_lock, False,
14385                             Iop_Sub8, False, sz, delta, "cmp" );
14386       break;
14387 
14388    /* ------------------------ POP ------------------------ */
14389 
14390    case 0x58: /* POP eAX */
14391    case 0x59: /* POP eCX */
14392    case 0x5A: /* POP eDX */
14393    case 0x5B: /* POP eBX */
14394    case 0x5D: /* POP eBP */
14395    case 0x5E: /* POP eSI */
14396    case 0x5F: /* POP eDI */
14397    case 0x5C: /* POP eSP */
14398       vassert(sz == 2 || sz == 4);
14399       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
14400       assign(t2, getIReg(4, R_ESP));
14401       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
14402       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
14403       putIReg(sz, opc-0x58, mkexpr(t1));
14404       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
14405       break;
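      /* Note: opcodes 0x58..0x5F encode the destination register in
         their low three bits, so (opc - 0x58) is directly the guest
         register number (R_EAX == 0 .. R_EDI == 7). */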
14406 
14407    case 0x9D: /* POPF */
14408       vassert(sz == 2 || sz == 4);
14409       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
14410       assign(t2, getIReg(4, R_ESP));
14411       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
14412       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
14413 
      /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
         value in t1. */
14416       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
14417                                  ((Addr32)guest_EIP_bbstart)+delta );
14418 
14419       DIP("popf%c\n", nameISize(sz));
14420       break;
14421 
14422    case 0x61: /* POPA */
14423       /* This is almost certainly wrong for sz==2.  So ... */
14424       if (sz != 4) goto decode_failure;
14425 
14426       /* t5 is the old %ESP value. */
14427       t5 = newTemp(Ity_I32);
14428       assign( t5, getIReg(4, R_ESP) );
14429 
14430       /* Reload all the registers, except %esp. */
14431       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
14432       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
14433       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
14434       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
14435       /* ignore saved %ESP */
14436       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
14437       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
14438       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
14439 
14440       /* and move %ESP back up */
14441       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
14442 
14443       DIP("popa%c\n", nameISize(sz));
14444       break;
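      /* For reference, the 8-word frame popped above (and pushed by
         PUSHA below) is laid out, relative to the pre-POPA %esp, as:
            +0  EDI   +4  ESI   +8  EBP   +12 ESP (ignored on pop)
            +16 EBX   +20 EDX   +24 ECX   +28 EAX */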
14445 
14446    case 0x8F: /* POPL/POPW m32 */
14447      { Int    len;
14448        UChar  rm = getIByte(delta);
14449 
       /* make sure this instruction is a valid POP */
       if (epartIsReg(rm) || gregOfRM(rm) != 0)
          goto decode_failure;
       /* and that the operand size is valid */
       if (sz != 4 && sz != 2)
          goto decode_failure;
14456        ty = szToITy(sz);
14457 
14458        t1 = newTemp(Ity_I32); /* stack address */
14459        t3 = newTemp(ty); /* data */
14460        /* set t1 to ESP: t1 = ESP */
14461        assign( t1, getIReg(4, R_ESP) );
14462        /* load M[ESP] to virtual register t3: t3 = M[t1] */
14463        assign( t3, loadLE(ty, mkexpr(t1)) );
14464 
14465        /* increase ESP; must be done before the STORE.  Intel manual says:
14466             If the ESP register is used as a base register for addressing
14467             a destination operand in memory, the POP instruction computes
14468             the effective address of the operand after it increments the
14469             ESP register.
14470        */
14471        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
14472 
14473        /* resolve MODR/M */
14474        addr = disAMode ( &len, sorb, delta, dis_buf);
14475        storeLE( mkexpr(addr), mkexpr(t3) );
14476 
14477        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
14478 
14479        delta += len;
14480        break;
14481      }
14482 
14483    case 0x1F: /* POP %DS */
14484       dis_pop_segreg( R_DS, sz ); break;
14485    case 0x07: /* POP %ES */
14486       dis_pop_segreg( R_ES, sz ); break;
14487    case 0x17: /* POP %SS */
14488       dis_pop_segreg( R_SS, sz ); break;
14489 
14490    /* ------------------------ PUSH ----------------------- */
14491 
14492    case 0x50: /* PUSH eAX */
14493    case 0x51: /* PUSH eCX */
14494    case 0x52: /* PUSH eDX */
14495    case 0x53: /* PUSH eBX */
14496    case 0x55: /* PUSH eBP */
14497    case 0x56: /* PUSH eSI */
14498    case 0x57: /* PUSH eDI */
14499    case 0x54: /* PUSH eSP */
14500       /* This is the Right Way, in that the value to be pushed is
14501          established before %esp is changed, so that pushl %esp
14502          correctly pushes the old value. */
14503       vassert(sz == 2 || sz == 4);
14504       ty = sz==2 ? Ity_I16 : Ity_I32;
14505       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
14506       assign(t1, getIReg(sz, opc-0x50));
14507       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
14508       putIReg(4, R_ESP, mkexpr(t2) );
14509       storeLE(mkexpr(t2),mkexpr(t1));
14510       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
14511       break;
14512 
14513 
14514    case 0x68: /* PUSH Iv */
14515       d32 = getUDisp(sz,delta); delta += sz;
14516       goto do_push_I;
14517    case 0x6A: /* PUSH Ib, sign-extended to sz */
14518       d32 = getSDisp8(delta); delta += 1;
14519       goto do_push_I;
14520    do_push_I:
14521       ty = szToITy(sz);
14522       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
14523       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
14524       putIReg(4, R_ESP, mkexpr(t1) );
14525       /* stop mkU16 asserting if d32 is a negative 16-bit number
14526          (bug #132813) */
14527       if (ty == Ity_I16)
14528          d32 &= 0xFFFF;
14529       storeLE( mkexpr(t1), mkU(ty,d32) );
14530       DIP("push%c $0x%x\n", nameISize(sz), d32);
14531       break;
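      /* Example of why the masking above matters: "66 6A FF" is
         pushw $-1; getSDisp8 sign-extends the byte to 0xFFFFFFFF, and
         without the mask mkU16 would assert on a value that does not
         fit in 16 bits. */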
14532 
14533    case 0x9C: /* PUSHF */ {
14534       vassert(sz == 2 || sz == 4);
14535 
14536       t1 = newTemp(Ity_I32);
14537       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
14538       putIReg(4, R_ESP, mkexpr(t1) );
14539 
14540       /* Calculate OSZACP, and patch in fixed fields as per
14541          Intel docs.
14542          - bit 1 is always 1
14543          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
14544       */
14545       t2 = newTemp(Ity_I32);
14546       assign( t2, binop(Iop_Or32,
14547                         mk_x86g_calculate_eflags_all(),
14548                         mkU32( (1<<1)|(1<<9) ) ));
14549 
14550       /* Patch in the D flag.  This can simply be a copy of bit 10 of
14551          baseBlock[OFFB_DFLAG]. */
14552       t3 = newTemp(Ity_I32);
14553       assign( t3, binop(Iop_Or32,
14554                         mkexpr(t2),
14555                         binop(Iop_And32,
14556                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
14557                               mkU32(1<<10)))
14558             );
14559 
14560       /* And patch in the ID flag. */
14561       t4 = newTemp(Ity_I32);
14562       assign( t4, binop(Iop_Or32,
14563                         mkexpr(t3),
14564                         binop(Iop_And32,
14565                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
14566                                                mkU8(21)),
14567                               mkU32(1<<21)))
14568             );
14569 
14570       /* And patch in the AC flag. */
14571       t5 = newTemp(Ity_I32);
14572       assign( t5, binop(Iop_Or32,
14573                         mkexpr(t4),
14574                         binop(Iop_And32,
14575                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
14576                                                mkU8(18)),
14577                               mkU32(1<<18)))
14578             );
14579 
14580       /* if sz==2, the stored value needs to be narrowed. */
14581       if (sz == 2)
14582         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
14583       else
14584         storeLE( mkexpr(t1), mkexpr(t5) );
14585 
14586       DIP("pushf%c\n", nameISize(sz));
14587       break;
14588    }
14589 
14590    case 0x60: /* PUSHA */
14591       /* This is almost certainly wrong for sz==2.  So ... */
14592       if (sz != 4) goto decode_failure;
14593 
      /* This is the Right Way, in that the value to be pushed is
         established before %esp is changed, so that pusha correctly
         pushes the old %esp value.  The new %esp value is installed
         before any of the memory stores are done. */
14598       /* t0 is the %ESP value we're going to push. */
14599       t0 = newTemp(Ity_I32);
14600       assign( t0, getIReg(4, R_ESP) );
14601 
14602       /* t5 will be the new %ESP value. */
14603       t5 = newTemp(Ity_I32);
14604       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
14605 
14606       /* Update guest state before prodding memory. */
14607       putIReg(4, R_ESP, mkexpr(t5));
14608 
14609       /* Dump all the registers. */
14610       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
14611       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
14612       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
14613       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
14614       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
14615       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
14616       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
14617       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
14618 
14619       DIP("pusha%c\n", nameISize(sz));
14620       break;
14621 
14622    case 0x0E: /* PUSH %CS */
14623       dis_push_segreg( R_CS, sz ); break;
14624    case 0x1E: /* PUSH %DS */
14625       dis_push_segreg( R_DS, sz ); break;
14626    case 0x06: /* PUSH %ES */
14627       dis_push_segreg( R_ES, sz ); break;
14628    case 0x16: /* PUSH %SS */
14629       dis_push_segreg( R_SS, sz ); break;
14630 
14631    /* ------------------------ SCAS et al ----------------- */
14632 
14633    case 0xA4: /* MOVS, no REP prefix */
14634    case 0xA5:
14635       if (sorb != 0)
14636          goto decode_failure; /* else dis_string_op asserts */
14637       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
14638       break;
14639 
   case 0xA6: /* CMPSb, no REP prefix */
   case 0xA7:
14642       if (sorb != 0)
14643          goto decode_failure; /* else dis_string_op asserts */
14644       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
14645       break;
14646 
14647    case 0xAA: /* STOS, no REP prefix */
14648    case 0xAB:
14649       if (sorb != 0)
14650          goto decode_failure; /* else dis_string_op asserts */
14651       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
14652       break;
14653 
14654    case 0xAC: /* LODS, no REP prefix */
14655    case 0xAD:
14656       if (sorb != 0)
14657          goto decode_failure; /* else dis_string_op asserts */
14658       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
14659       break;
14660 
14661    case 0xAE: /* SCAS, no REP prefix */
14662    case 0xAF:
14663       if (sorb != 0)
14664          goto decode_failure; /* else dis_string_op asserts */
14665       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
14666       break;
14667 
14668 
14669    case 0xFC: /* CLD */
14670       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
14671       DIP("cld\n");
14672       break;
14673 
14674    case 0xFD: /* STD */
14675       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
14676       DIP("std\n");
14677       break;
14678 
14679    case 0xF8: /* CLC */
14680    case 0xF9: /* STC */
14681    case 0xF5: /* CMC */
14682       t0 = newTemp(Ity_I32);
14683       t1 = newTemp(Ity_I32);
14684       assign( t0, mk_x86g_calculate_eflags_all() );
14685       switch (opc) {
14686          case 0xF8:
14687             assign( t1, binop(Iop_And32, mkexpr(t0),
14688                                          mkU32(~X86G_CC_MASK_C)));
14689             DIP("clc\n");
14690             break;
14691          case 0xF9:
14692             assign( t1, binop(Iop_Or32, mkexpr(t0),
14693                                         mkU32(X86G_CC_MASK_C)));
14694             DIP("stc\n");
14695             break;
14696          case 0xF5:
14697             assign( t1, binop(Iop_Xor32, mkexpr(t0),
14698                                          mkU32(X86G_CC_MASK_C)));
14699             DIP("cmc\n");
14700             break;
14701          default:
14702             vpanic("disInstr(x86)(clc/stc/cmc)");
14703       }
14704       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14705       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14706       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
14707       /* Set NDEP even though it isn't used.  This makes redundant-PUT
14708          elimination of previous stores to this field work better. */
14709       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14710       break;
14711 
14712    case 0xD6: /* SALC */
14713       t0 = newTemp(Ity_I32);
14714       t1 = newTemp(Ity_I32);
14715       assign( t0,  binop(Iop_And32,
14716                          mk_x86g_calculate_eflags_c(),
14717                          mkU32(1)) );
14718       assign( t1, binop(Iop_Sar32,
14719                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
14720                         mkU8(31)) );
14721       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
14722       DIP("salc\n");
14723       break;
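   /* SALC (above) sets AL to 0xFF if CF is set and to 0x00 otherwise;
      the shift-left/arithmetic-shift-right pair just replicates the
      carry bit across all 32 bits before the low byte is taken. */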
14724 
14725    /* REPNE prefix insn */
14726    case 0xF2: {
14727       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
14728       if (sorb != 0) goto decode_failure;
14729       abyte = getIByte(delta); delta++;
14730 
14731       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
14732 
14733       switch (abyte) {
14734       /* According to the Intel manual, "repne movs" should never occur, but
14735        * in practice it has happened, so allow for it here... */
14736       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
14737       case 0xA5:
14738          dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
14739                              guest_EIP_bbstart+delta, "repne movs" );
14740          break;
14741 
14742       case 0xA6: sz = 1;   /* REPNE CMP<sz> */
14743       case 0xA7:
14744          dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
14745                              guest_EIP_bbstart+delta, "repne cmps" );
14746          break;
14747 
14748       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
14749       case 0xAB:
14750          dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
14751                              guest_EIP_bbstart+delta, "repne stos" );
14752          break;
14753 
14754       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
14755       case 0xAF:
14756          dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
14757                              guest_EIP_bbstart+delta, "repne scas" );
14758          break;
14759 
14760       case 0xC3:           /* REPNE RET, used to help out AMD cpus */
14761                            /* identical to normal RET */
14762          dis_ret(&dres, 0);
14763          DIP("repne ret\n");
14764          break;
14765 
14766       case 0x70: case 0x71: case 0x72: case 0x73:
14767       case 0x74: case 0x75: case 0x76: case 0x77:
14768       case 0x78: case 0x79: case 0x7A: case 0x7B:
14769       case 0x7C: case 0x7D: case 0x7E: case 0x7F:
14770          /* Jump instructions, same reason as RET */
14771          { Int    jmpDelta;
14772            jmpDelta = (Int)getSDisp8(delta);
14773            vassert(-128 <= jmpDelta && jmpDelta < 128);
14774            d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
14775            delta++;
14776            jcc_01( &dres, (X86Condcode)(abyte - 0x70),
14777                    (Addr32)(guest_EIP_bbstart+delta), d32);
14778            vassert(dres.whatNext == Dis_StopHere);
14779          }
14780          DIP("repne j%s-8 0x%x\n", name_X86Condcode(abyte - 0x70), d32);
14781          break;
14782 
14783       case 0xE9: /* Jv (jump, 16/32 offset) */
14784          vassert(sz == 4 || sz == 2);
14785          d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
14786          delta += sz;
14787          jmp_lit(&dres, Ijk_Boring, d32);
14788          vassert(dres.whatNext == Dis_StopHere);
14789          DIP("repne jmp 0x%x\n", d32);
14790          break;
14791 
14792       default:
14793          goto decode_failure;
14794       }
14795       break;
14796    }
14797 
14798    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
14799       for the rest, it means REP) */
14800    case 0xF3: {
14801       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
14802       abyte = getIByte(delta); delta++;
14803 
14804       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
14805 
14806       if (sorb != 0 && abyte != 0x0F) goto decode_failure;
14807 
14808       switch (abyte) {
14809       case 0x0F:
14810          switch (getIByte(delta)) {
14811          /* On older CPUs, TZCNT behaves the same as BSF.  */
14812          case 0xBC: /* REP BSF Gv,Ev */
14813             delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
14814             break;
14815          /* On older CPUs, LZCNT behaves the same as BSR.  */
14816          case 0xBD: /* REP BSR Gv,Ev */
14817             delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
14818             break;
         case 0x1E: /* ENDBR */
            delta++;
            switch (getIByte(delta++)) {
               case 0xFA:
                  DIP("endbr64\n");
                  break;
               case 0xFB:
                  DIP("endbr32\n");
                  break;
14828                default:
14829                   goto decode_failure;
14830             }
14831             break;
14832          default:
14833             goto decode_failure;
14834          }
14835          break;
14836 
14837       case 0xA4: sz = 1;   /* REP MOVS<sz> */
14838       case 0xA5:
14839          dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
14840                              guest_EIP_bbstart+delta, "rep movs" );
14841          break;
14842 
14843       case 0xA6: sz = 1;   /* REPE CMP<sz> */
14844       case 0xA7:
14845          dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
14846                              guest_EIP_bbstart+delta, "repe cmps" );
14847          break;
14848 
14849       case 0xAA: sz = 1;   /* REP STOS<sz> */
14850       case 0xAB:
14851          dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
14852                              guest_EIP_bbstart+delta, "rep stos" );
14853          break;
14854 
14855       case 0xAC: sz = 1;   /* REP LODS<sz> */
14856       case 0xAD:
14857          dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
14858                              guest_EIP_bbstart+delta, "rep lods" );
14859          break;
14860 
14861       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
14862       case 0xAF:
14863          dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
14864                              guest_EIP_bbstart+delta, "repe scas" );
14865          break;
14866 
14867       case 0x90:           /* REP NOP (PAUSE) */
14868          /* a hint to the P4 re spin-wait loop */
14869          DIP("rep nop (P4 pause)\n");
14870          /* "observe" the hint.  The Vex client needs to be careful not
14871             to cause very long delays as a result, though. */
14872          jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
14873          vassert(dres.whatNext == Dis_StopHere);
14874          break;
14875 
14876       case 0xC3:           /* REP RET, used to help out AMD cpus */
14877          dis_ret(&dres, 0);
14878          DIP("rep ret\n");
14879          break;
14880 
14881       default:
14882          goto decode_failure;
14883       }
14884       break;
14885    }
14886 
14887    /* ------------------------ XCHG ----------------------- */
14888 
14889    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
14890       prefix; hence it must be translated with an IRCAS (at least, the
14891       memory variant). */
14892    case 0x86: /* XCHG Gb,Eb */
14893       sz = 1;
14894       /* Fall through ... */
14895    case 0x87: /* XCHG Gv,Ev */
14896       modrm = getIByte(delta);
14897       ty = szToITy(sz);
14898       t1 = newTemp(ty); t2 = newTemp(ty);
14899       if (epartIsReg(modrm)) {
14900          assign(t1, getIReg(sz, eregOfRM(modrm)));
14901          assign(t2, getIReg(sz, gregOfRM(modrm)));
14902          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
14903          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
14904          delta++;
14905          DIP("xchg%c %s, %s\n",
14906              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
14907                             nameIReg(sz,eregOfRM(modrm)));
14908       } else {
14909          *expect_CAS = True;
14910          addr = disAMode ( &alen, sorb, delta, dis_buf );
14911          assign( t1, loadLE(ty,mkexpr(addr)) );
14912          assign( t2, getIReg(sz,gregOfRM(modrm)) );
14913          casLE( mkexpr(addr),
14914                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
14915          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
14916          delta += alen;
14917          DIP("xchg%c %s, %s\n", nameISize(sz),
14918                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
14919       }
14920       break;
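   /* The memory form of XCHG above is expressed with casLE, using the
      freshly loaded old value as the expected value; casLE presumably
      arranges (hence guest_EIP_curr_instr) for the instruction to be
      restarted if the compare fails, so the exchange stays atomic even
      without an explicit LOCK prefix. */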
14921 
14922    case 0x90: /* XCHG eAX,eAX */
14923       DIP("nop\n");
14924       break;
14925    case 0x91: /* XCHG eAX,eCX */
14926    case 0x92: /* XCHG eAX,eDX */
14927    case 0x93: /* XCHG eAX,eBX */
14928    case 0x94: /* XCHG eAX,eSP */
14929    case 0x95: /* XCHG eAX,eBP */
14930    case 0x96: /* XCHG eAX,eSI */
14931    case 0x97: /* XCHG eAX,eDI */
14932       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
14933       break;
14934 
14935    /* ------------------------ XLAT ----------------------- */
14936 
14937    case 0xD7: /* XLAT */
      if (sz != 4) goto decode_failure; /* sz == 2 (0x66 prefix) is valid
                                           but not handled here */
14939       putIReg(
14940          1,
14941          R_EAX/*AL*/,
14942          loadLE(Ity_I8,
14943                 handleSegOverride(
14944                    sorb,
14945                    binop(Iop_Add32,
14946                          getIReg(4, R_EBX),
14947                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
14948 
14949       DIP("xlat%c [ebx]\n", nameISize(sz));
14950       break;
14951 
14952    /* ------------------------ IN / OUT ----------------------- */
14953 
14954    case 0xE4: /* IN imm8, AL */
14955       sz = 1;
14956       t1 = newTemp(Ity_I32);
14957       abyte = getIByte(delta); delta++;
14958       assign(t1, mkU32( abyte & 0xFF ));
14959       DIP("in%c $%d,%s\n", nameISize(sz), abyte, nameIReg(sz,R_EAX));
14960       goto do_IN;
14961    case 0xE5: /* IN imm8, eAX */
14962       vassert(sz == 2 || sz == 4);
14963       t1 = newTemp(Ity_I32);
14964       abyte = getIByte(delta); delta++;
14965       assign(t1, mkU32( abyte & 0xFF ));
14966       DIP("in%c $%d,%s\n", nameISize(sz), abyte, nameIReg(sz,R_EAX));
14967       goto do_IN;
14968    case 0xEC: /* IN %DX, AL */
14969       sz = 1;
14970       t1 = newTemp(Ity_I32);
14971       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14972       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14973                                          nameIReg(sz,R_EAX));
14974       goto do_IN;
14975    case 0xED: /* IN %DX, eAX */
14976       vassert(sz == 2 || sz == 4);
14977       t1 = newTemp(Ity_I32);
14978       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14979       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14980                                          nameIReg(sz,R_EAX));
14981       goto do_IN;
14982    do_IN: {
14983       /* At this point, sz indicates the width, and t1 is a 32-bit
14984          value giving port number. */
14985       IRDirty* d;
14986       vassert(sz == 1 || sz == 2 || sz == 4);
14987       ty = szToITy(sz);
14988       t2 = newTemp(Ity_I32);
14989       d = unsafeIRDirty_1_N(
14990              t2,
14991              0/*regparms*/,
14992              "x86g_dirtyhelper_IN",
14993              &x86g_dirtyhelper_IN,
14994              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
14995           );
14996       /* do the call, dumping the result in t2. */
14997       stmt( IRStmt_Dirty(d) );
14998       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
14999       break;
15000    }
15001 
15002    case 0xE6: /* OUT AL, imm8 */
15003       sz = 1;
15004       t1 = newTemp(Ity_I32);
15005       abyte = getIByte(delta); delta++;
15006       assign( t1, mkU32( abyte & 0xFF ) );
15007       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), abyte);
15008       goto do_OUT;
15009    case 0xE7: /* OUT eAX, imm8 */
15010       vassert(sz == 2 || sz == 4);
15011       t1 = newTemp(Ity_I32);
15012       abyte = getIByte(delta); delta++;
15013       assign( t1, mkU32( abyte & 0xFF ) );
15014       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), abyte);
15015       goto do_OUT;
15016    case 0xEE: /* OUT AL, %DX */
15017       sz = 1;
15018       t1 = newTemp(Ity_I32);
15019       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
15020       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
15021                                           nameIReg(2,R_EDX));
15022       goto do_OUT;
15023    case 0xEF: /* OUT eAX, %DX */
15024       vassert(sz == 2 || sz == 4);
15025       t1 = newTemp(Ity_I32);
15026       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
15027       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
15028                                           nameIReg(2,R_EDX));
15029       goto do_OUT;
15030    do_OUT: {
15031       /* At this point, sz indicates the width, and t1 is a 32-bit
15032          value giving port number. */
15033       IRDirty* d;
15034       vassert(sz == 1 || sz == 2 || sz == 4);
15035       ty = szToITy(sz);
15036       d = unsafeIRDirty_0_N(
15037              0/*regparms*/,
15038              "x86g_dirtyhelper_OUT",
15039              &x86g_dirtyhelper_OUT,
15040              mkIRExprVec_3( mkexpr(t1),
15041                             widenUto32( getIReg(sz, R_EAX) ),
15042                             mkU32(sz) )
15043           );
15044       stmt( IRStmt_Dirty(d) );
15045       break;
15046    }
15047 
15048    /* ------------------------ (Grp1 extensions) ---------- */
15049 
15050    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
15051                  case 0x80, but only in 32-bit mode. */
15052       /* fallthru */
15053    case 0x80: /* Grp1 Ib,Eb */
15054       modrm = getIByte(delta);
15055       am_sz = lengthAMode(delta);
15056       sz    = 1;
15057       d_sz  = 1;
15058       d32   = getUChar(delta + am_sz);
15059       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
15060       break;
15061 
15062    case 0x81: /* Grp1 Iv,Ev */
15063       modrm = getIByte(delta);
15064       am_sz = lengthAMode(delta);
15065       d_sz  = sz;
15066       d32   = getUDisp(d_sz, delta + am_sz);
15067       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
15068       break;
15069 
15070    case 0x83: /* Grp1 Ib,Ev */
15071       modrm = getIByte(delta);
15072       am_sz = lengthAMode(delta);
15073       d_sz  = 1;
15074       d32   = getSDisp8(delta + am_sz);
15075       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
15076       break;
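   /* Grp1 encoding examples: "81 C3 44 33 22 11" is
      addl $0x11223344,%ebx (the reg field /0 selects ADD), while
      "83 EB 04" is subl $0x4,%ebx, with the byte immediate
      sign-extended to the operand size. */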
15077 
15078    /* ------------------------ (Grp2 extensions) ---------- */
15079 
15080    case 0xC0: { /* Grp2 Ib,Eb */
15081       Bool decode_OK = True;
15082       modrm = getIByte(delta);
15083       am_sz = lengthAMode(delta);
15084       d_sz  = 1;
15085       d32   = getUChar(delta + am_sz);
15086       sz    = 1;
15087       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
15088                          mkU8(d32 & 0xFF), NULL, &decode_OK );
15089       if (!decode_OK)
15090          goto decode_failure;
15091       break;
15092    }
15093    case 0xC1: { /* Grp2 Ib,Ev */
15094       Bool decode_OK = True;
15095       modrm = getIByte(delta);
15096       am_sz = lengthAMode(delta);
15097       d_sz  = 1;
15098       d32   = getUChar(delta + am_sz);
15099       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
15100                          mkU8(d32 & 0xFF), NULL, &decode_OK );
15101       if (!decode_OK)
15102          goto decode_failure;
15103       break;
15104    }
15105    case 0xD0: { /* Grp2 1,Eb */
15106       Bool decode_OK = True;
15107       modrm = getIByte(delta);
15108       am_sz = lengthAMode(delta);
15109       d_sz  = 0;
15110       d32   = 1;
15111       sz    = 1;
15112       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
15113                          mkU8(d32), NULL, &decode_OK );
15114       if (!decode_OK)
15115          goto decode_failure;
15116       break;
15117    }
15118    case 0xD1: { /* Grp2 1,Ev */
15119       Bool decode_OK = True;
15120       modrm = getUChar(delta);
15121       am_sz = lengthAMode(delta);
15122       d_sz  = 0;
15123       d32   = 1;
15124       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
15125                          mkU8(d32), NULL, &decode_OK );
15126       if (!decode_OK)
15127          goto decode_failure;
15128       break;
15129    }
15130    case 0xD2: { /* Grp2 CL,Eb */
15131       Bool decode_OK = True;
15132       modrm = getUChar(delta);
15133       am_sz = lengthAMode(delta);
15134       d_sz  = 0;
15135       sz    = 1;
15136       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
15137                          getIReg(1,R_ECX), "%cl", &decode_OK );
15138       if (!decode_OK)
15139          goto decode_failure;
15140       break;
15141    }
15142    case 0xD3: { /* Grp2 CL,Ev */
15143       Bool decode_OK = True;
15144       modrm = getIByte(delta);
15145       am_sz = lengthAMode(delta);
15146       d_sz  = 0;
15147       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
15148                          getIReg(1,R_ECX), "%cl", &decode_OK );
15149       if (!decode_OK)
15150          goto decode_failure;
15151       break;
15152    }
15153 
15154    /* ------------------------ (Grp3 extensions) ---------- */
15155 
15156    case 0xF6: { /* Grp3 Eb */
15157       Bool decode_OK = True;
15158       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
15159       if (!decode_OK)
15160          goto decode_failure;
15161       break;
15162    }
15163    case 0xF7: { /* Grp3 Ev */
15164       Bool decode_OK = True;
15165       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
15166       if (!decode_OK)
15167          goto decode_failure;
15168       break;
15169    }
15170 
15171    /* ------------------------ (Grp4 extensions) ---------- */
15172 
15173    case 0xFE: { /* Grp4 Eb */
15174       Bool decode_OK = True;
15175       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
15176       if (!decode_OK)
15177          goto decode_failure;
15178       break;
15179    }
15180 
15181    /* ------------------------ (Grp5 extensions) ---------- */
15182 
15183    case 0xFF: { /* Grp5 Ev */
15184       Bool decode_OK = True;
15185       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
15186       if (!decode_OK)
15187          goto decode_failure;
15188       break;
15189    }
15190 
15191    /* -------------------------- CLI/STI ------------------- */
   /* Both are privileged in user mode; rather than treating them as
      NOPs, end the block with an Ijk_Privileged jump and let the
      outer framework decide how to handle them. */
   case 0xFA: /* CLI */
   case 0xFB: /* STI */
      DIP(opc == 0xFA ? "cli\n" : "sti\n");
      jmp_lit(&dres, Ijk_Privileged, ((Addr32)guest_EIP_bbstart) + delta);
      vassert(dres.whatNext == Dis_StopHere);
      break;
15203 
15204    /* -------------------------- halt ---------------------- */
15205    case 0xF4: { /* hlt */
15206       jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
15207       vassert(dres.whatNext == Dis_StopHere);
15208       DIP("hlt\n");
15209       break;
15210    }
15211 
15212    /* ------------------------ Escapes to 2-byte opcodes -- */
15213 
15214    case 0x0F: {
15215       opc = getIByte(delta); delta++;
15216       switch (opc) {
15217 
      case 0x20: { /* mov from crX to r32 (X in {0,2,3,4}) */
        UChar rm = getIByte(delta++);
        /* We only support cr0 for the moment */
        if (gregOfRM(rm) != 0)
          goto decode_failure;
        /* The destination GP register is encoded in the r/m field. */
        putIReg(4, eregOfRM(rm), mkU32(archinfo->x86_cr0));
        break;
15225       }
      case 0x22: { /* mov from r32 to crX (X in {0,2,3,4}) */
15227         UChar rm = getIByte(delta++);
15228         /* We only support cr0 for the moment */
15229         if (gregOfRM(rm) != 0)
15230           goto decode_failure;
15231         IRTemp value = newTemp(Ity_I32);
15232         assign(value, getIReg(4, eregOfRM(rm)));
15233         IRDirty* d = unsafeIRDirty_0_N (
15234                            0/*regparms*/,
15235                            "x86g_dirtyhelper_write_cr0",
15236                            &x86g_dirtyhelper_write_cr0,
15237                            mkIRExprVec_1( mkexpr(value) )
15238                         );
15239         stmt( IRStmt_Dirty(d) );
15240         dres.whatNext    = Dis_StopHere;
15241         dres.jk_StopHere = Ijk_Yield;
15242         stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
15243         break;
15244       }
15245 
15246       case 0x09: /* WBINVD */
15247         /* We treat it as NOP */
15248         break;
15249 
15250       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
15251 
15252       case 0xBA: { /* Grp8 Ib,Ev */
15253          Bool decode_OK = False;
15254          modrm = getUChar(delta);
15255          am_sz = lengthAMode(delta);
15256          d32   = getSDisp8(delta + am_sz);
15257          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
15258                                 am_sz, sz, d32, &decode_OK );
15259          if (!decode_OK)
15260             goto decode_failure;
15261          break;
15262       }
15263 
15264       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
15265 
15266       case 0xBC: /* BSF Gv,Ev */
15267          delta = dis_bs_E_G ( sorb, sz, delta, True );
15268          break;
15269       case 0xBD: /* BSR Gv,Ev */
15270          delta = dis_bs_E_G ( sorb, sz, delta, False );
15271          break;
15272 
15273       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
15274 
15275       case 0xC8: /* BSWAP %eax */
15276       case 0xC9:
15277       case 0xCA:
15278       case 0xCB:
15279       case 0xCC:
15280       case 0xCD:
15281       case 0xCE:
15282       case 0xCF: /* BSWAP %edi */
15283          /* AFAICS from the Intel docs, this only exists at size 4. */
15284 
15285          /* however, we are in the business of emulating stuff, and an
15286           * emulator has no business crashing when it sees an "undefined"
15287           * instruction. My CPU just clears the lowest two bytes of the
15288           * register so let's implement that. */
15289          if (sz == 2) {
15290             putIReg(2, opc-0xC8, mkU16(0));
15291             DIP("bswapw %s (UNDEFINED)\n", nameIReg(2, opc-0xC8));
15292             break;
15293          }
15294 
15295          if (sz != 4) goto decode_failure;
15296 
15297          t1 = newTemp(Ity_I32);
15298          assign( t1, getIReg(4, opc-0xC8) );
15299          t2 = math_BSWAP(t1, Ity_I32);
15300 
15301          putIReg(4, opc-0xC8, mkexpr(t2));
15302          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
15303          break;
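      /* e.g. with %eax = 0x12345678, the bswapl above leaves
         0x78563412 -- a plain byte reversal of the 32-bit value. */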
15304 
15305       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
15306 
15307       case 0xA3: /* BT Gv,Ev */
15308          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
15309          break;
15310       case 0xB3: /* BTR Gv,Ev */
15311          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
15312          break;
15313       case 0xAB: /* BTS Gv,Ev */
15314          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
15315          break;
15316       case 0xBB: /* BTC Gv,Ev */
15317          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
15318          break;
15319 
15320       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
15321 
15322       case 0x40:
15323       case 0x41:
15324       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
15325       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
15326       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
15327       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
15328       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
15329       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
15330       case 0x48: /* CMOVSb (cmov negative) */
      case 0x49: /* CMOVNSb (cmov not negative) */
15332       case 0x4A: /* CMOVP (cmov parity even) */
15333       case 0x4B: /* CMOVNP (cmov parity odd) */
15334       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
15335       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
15336       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
15337       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
15338          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
15339          break;
15340 
15341       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
15342 
15343       case 0xB0: /* CMPXCHG Gb,Eb */
15344          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
15345          break;
15346       case 0xB1: /* CMPXCHG Gv,Ev */
15347          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
15348          break;
15349 
15350       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
15351          IRTemp expdHi    = newTemp(Ity_I32);
15352          IRTemp expdLo    = newTemp(Ity_I32);
15353          IRTemp dataHi    = newTemp(Ity_I32);
15354          IRTemp dataLo    = newTemp(Ity_I32);
15355          IRTemp oldHi     = newTemp(Ity_I32);
15356          IRTemp oldLo     = newTemp(Ity_I32);
15357          IRTemp flags_old = newTemp(Ity_I32);
15358          IRTemp flags_new = newTemp(Ity_I32);
15359          IRTemp success   = newTemp(Ity_I1);
15360 
15361          /* Translate this using a DCAS, even if there is no LOCK
15362             prefix.  Life is too short to bother with generating two
15363             different translations for the with/without-LOCK-prefix
15364             cases. */
15365          *expect_CAS = True;
15366 
         /* Decode, and generate address. */
15368          if (sz != 4) goto decode_failure;
15369          modrm = getIByte(delta);
15370          if (epartIsReg(modrm)) goto decode_failure;
15371          if (gregOfRM(modrm) != 1) goto decode_failure;
15372          addr = disAMode ( &alen, sorb, delta, dis_buf );
15373          delta += alen;
15374 
15375          /* Get the expected and new values. */
15376          assign( expdHi, getIReg(4,R_EDX) );
15377          assign( expdLo, getIReg(4,R_EAX) );
15378          assign( dataHi, getIReg(4,R_ECX) );
15379          assign( dataLo, getIReg(4,R_EBX) );
15380 
15381          /* Do the DCAS */
15382          stmt( IRStmt_CAS(
15383                   mkIRCAS( oldHi, oldLo,
15384                            Iend_LE, mkexpr(addr),
15385                            mkexpr(expdHi), mkexpr(expdLo),
15386                            mkexpr(dataHi), mkexpr(dataLo)
15387                )));
15388 
15389          /* success when oldHi:oldLo == expdHi:expdLo */
15390          assign( success,
15391                  binop(Iop_CasCmpEQ32,
15392                        binop(Iop_Or32,
15393                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
15394                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
15395                        ),
15396                        mkU32(0)
15397                  ));
15398 
15399          /* If the DCAS is successful, that is to say oldHi:oldLo ==
15400             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
15401             which is where they came from originally.  Both the actual
15402             contents of these two regs, and any shadow values, are
15403             unchanged.  If the DCAS fails then we're putting into
15404             EDX:EAX the value seen in memory. */
15405          putIReg(4, R_EDX,
15406                     IRExpr_ITE( mkexpr(success),
15407                                 mkexpr(expdHi), mkexpr(oldHi)
15408                 ));
15409          putIReg(4, R_EAX,
15410                     IRExpr_ITE( mkexpr(success),
15411                                 mkexpr(expdLo), mkexpr(oldLo)
15412                 ));
15413 
15414          /* Copy the success bit into the Z flag and leave the others
15415             unchanged */
15416          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
15417          assign(
15418             flags_new,
15419             binop(Iop_Or32,
15420                   binop(Iop_And32, mkexpr(flags_old),
15421                                    mkU32(~X86G_CC_MASK_Z)),
15422                   binop(Iop_Shl32,
15423                         binop(Iop_And32,
15424                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
15425                         mkU8(X86G_CC_SHIFT_Z)) ));
15426 
15427          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
15428          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
15429          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
15430          /* Set NDEP even though it isn't used.  This makes
15431             redundant-PUT elimination of previous stores to this field
15432             work better. */
15433          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
15434 
         /* Sheesh.  Aren't you glad it was me and not you that had to
            write and validate all this grunge? */

         DIP("cmpxchg8b %s\n", dis_buf);
         break;
15440       }
15441 
15442       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
15443 
15444       case 0xA2: { /* CPUID */
15445          /* Uses dirty helper:
15446                void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
15447             declared to mod eax, wr ebx, ecx, edx
15448          */
15449          IRDirty* d     = NULL;
15450          void*    fAddr = NULL;
15451          const HChar* fName = NULL;
15452          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3) {
15453             fName = "x86g_dirtyhelper_CPUID_sse3";
15454             fAddr = &x86g_dirtyhelper_CPUID_sse3;
15455          }
15456          else
15457          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
15458             fName = "x86g_dirtyhelper_CPUID_sse2";
15459             fAddr = &x86g_dirtyhelper_CPUID_sse2;
15460          }
15461          else
15462          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
15463             fName = "x86g_dirtyhelper_CPUID_sse1";
15464             fAddr = &x86g_dirtyhelper_CPUID_sse1;
15465          }
15466          else
15467          if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
15468             fName = "x86g_dirtyhelper_CPUID_mmxext";
15469             fAddr = &x86g_dirtyhelper_CPUID_mmxext;
15470          }
15471          else
15472          if (archinfo->hwcaps == 0/*no SSE*/) {
15473             fName = "x86g_dirtyhelper_CPUID_sse0";
15474             fAddr = &x86g_dirtyhelper_CPUID_sse0;
15475          } else
15476             vpanic("disInstr(x86)(cpuid)");
15477 
15478          vassert(fName); vassert(fAddr);
15479          d = unsafeIRDirty_0_N ( 0/*regparms*/,
15480                                  fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
15481          /* declare guest state effects */
15482          d->nFxState = 4;
15483          vex_bzero(&d->fxState, sizeof(d->fxState));
15484          d->fxState[0].fx     = Ifx_Modify;
15485          d->fxState[0].offset = OFFB_EAX;
15486          d->fxState[0].size   = 4;
15487          d->fxState[1].fx     = Ifx_Write;
15488          d->fxState[1].offset = OFFB_EBX;
15489          d->fxState[1].size   = 4;
15490          d->fxState[2].fx     = Ifx_Modify;
15491          d->fxState[2].offset = OFFB_ECX;
15492          d->fxState[2].size   = 4;
15493          d->fxState[3].fx     = Ifx_Write;
15494          d->fxState[3].offset = OFFB_EDX;
15495          d->fxState[3].size   = 4;
15496          /* execute the dirty call, side-effecting guest state */
15497          stmt( IRStmt_Dirty(d) );
15498          /* CPUID is a serialising insn.  So, just in case someone is
15499             using it as a memory fence ... */
15500          stmt( IRStmt_MBE(Imbe_Fence) );
15501          DIP("cpuid\n");
15502          break;
15503       }
15504 
15505 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
15506 //--             goto decode_failure;
15507 //--
15508 //--          t1 = newTemp(cb);
15509 //--          t2 = newTemp(cb);
15510 //--          t3 = newTemp(cb);
15511 //--          t4 = newTemp(cb);
15512 //--          uInstr0(cb, CALLM_S, 0);
15513 //--
15514 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
15515 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
15516 //--
15517 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
15518 //--          uLiteral(cb, 0);
15519 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
15520 //--
15521 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
15522 //--          uLiteral(cb, 0);
15523 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
15524 //--
15525 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
15526 //--          uLiteral(cb, 0);
15527 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
15528 //--
15529 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
15530 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
15531 //--
15532 //--          uInstr1(cb, POP,   4, TempReg, t4);
15533 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
15534 //--
15535 //--          uInstr1(cb, POP,   4, TempReg, t3);
15536 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
15537 //--
15538 //--          uInstr1(cb, POP,   4, TempReg, t2);
15539 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
15540 //--
15541 //--          uInstr1(cb, POP,   4, TempReg, t1);
15542 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
15543 //--
15544 //--          uInstr0(cb, CALLM_E, 0);
15545 //--          DIP("cpuid\n");
15546 //--          break;
15547 //--
15548       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
15549 
15550       case 0xB6: /* MOVZXb Eb,Gv */
15551          if (sz != 2 && sz != 4)
15552             goto decode_failure;
15553          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
15554          break;
15555 
15556       case 0xB7: /* MOVZXw Ew,Gv */
15557          if (sz != 4)
15558             goto decode_failure;
15559          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
15560          break;
15561 
15562       case 0xBE: /* MOVSXb Eb,Gv */
15563          if (sz != 2 && sz != 4)
15564             goto decode_failure;
15565          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
15566          break;
15567 
15568       case 0xBF: /* MOVSXw Ew,Gv */
15569          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
15570             goto decode_failure;
15571          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
15572          break;
15573 
15574 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
15575 //--
15576 //--       case 0xC3: /* MOVNTI Gv,Ev */
15577 //--          vg_assert(sz == 4);
15578 //--          modrm = getUChar(eip);
15579 //--          vg_assert(!epartIsReg(modrm));
15580 //--          t1 = newTemp(cb);
15581 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
15582 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
15583 //--          t2 = LOW24(pair);
15584 //--          eip += HI8(pair);
15585 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
15586 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
15587 //--          break;
15588 
15589       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
15590 
15591       case 0xAF: /* IMUL Ev, Gv */
15592          delta = dis_mul_E_G ( sorb, sz, delta );
15593          break;
15594 
15595       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
15596 
15597       case 0x1F:
15598          modrm = getUChar(delta);
15599          if (epartIsReg(modrm)) goto decode_failure;
15600          addr = disAMode ( &alen, sorb, delta, dis_buf );
15601          delta += alen;
15602          DIP("nop%c %s\n", nameISize(sz), dis_buf);
15603          break;
15604 
15605       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
15606       case 0x80:
15607       case 0x81:
15608       case 0x82: /* JBb/JNAEb (jump below) */
15609       case 0x83: /* JNBb/JAEb (jump not below) */
15610       case 0x84: /* JZb/JEb (jump zero) */
15611       case 0x85: /* JNZb/JNEb (jump not zero) */
15612       case 0x86: /* JBEb/JNAb (jump below or equal) */
15613       case 0x87: /* JNBEb/JAb (jump not below or equal) */
15614       case 0x88: /* JSb (jump negative) */
      case 0x89: /* JNSb (jump not negative) */
15616       case 0x8A: /* JP (jump parity even) */
15617       case 0x8B: /* JNP/JPO (jump parity odd) */
15618       case 0x8C: /* JLb/JNGEb (jump less) */
15619       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
15620       case 0x8E: /* JLEb/JNGb (jump less or equal) */
15621       case 0x8F: /* JGb/JNLEb (jump greater) */
15622        { Int    jmpDelta;
15623          const HChar* comment  = "";
15624          jmpDelta = (Int)getUDisp(current_sz_data, delta);
15625          d32 = (((Addr32)guest_EIP_bbstart)+delta+current_sz_data) + jmpDelta;
15626          delta += current_sz_data;
15627          if (resteerCisOk
15628              && vex_control.guest_chase_cond
15629              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
15630              && jmpDelta < 0
15631              && resteerOkFn( callback_opaque, (Addr32)d32) ) {
15632             /* Speculation: assume this backward branch is taken.  So
15633                we need to emit a side-exit to the insn following this
15634                one, on the negation of the condition, and continue at
15635                the branch target address (d32).  If we wind up back at
15636                the first instruction of the trace, just stop; it's
15637                better to let the IR loop unroller handle that case.*/
15638             stmt( IRStmt_Exit(
15639                      mk_x86g_calculate_condition((X86Condcode)
15640                                                  (1 ^ (opc - 0x80))),
15641                      Ijk_Boring,
15642                      IRConst_U32(guest_EIP_bbstart+delta),
15643                      OFFB_EIP ) );
15644             dres.whatNext   = Dis_ResteerC;
15645             dres.continueAt = (Addr32)d32;
15646             comment = "(assumed taken)";
15647          }
15648          else
15649          if (resteerCisOk
15650              && vex_control.guest_chase_cond
15651              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
15652              && jmpDelta >= 0
15653              && resteerOkFn( callback_opaque,
15654                              (Addr32)(guest_EIP_bbstart+delta)) ) {
15655             /* Speculation: assume this forward branch is not taken.
15656                So we need to emit a side-exit to d32 (the dest) and
15657                continue disassembling at the insn immediately
15658                following this one. */
15659             stmt( IRStmt_Exit(
15660                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
15661                      Ijk_Boring,
15662                      IRConst_U32(d32),
15663                      OFFB_EIP ) );
15664             dres.whatNext   = Dis_ResteerC;
15665             dres.continueAt = guest_EIP_bbstart + delta;
15666             comment = "(assumed not taken)";
15667          }
15668          else {
15669             /* Conservative default translation - end the block at
15670                this point. */
15671             jcc_01( &dres, (X86Condcode)(opc - 0x80),
15672                     (Addr32)(guest_EIP_bbstart+delta), d32);
15673             vassert(dres.whatNext == Dis_StopHere);
15674          }
15675          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
15676          break;
15677        }
15678 
15679       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
15680       case 0x31: { /* RDTSC */
15681          IRTemp   val  = newTemp(Ity_I64);
15682          IRExpr** args = mkIRExprVec_0();
15683          IRDirty* d    = unsafeIRDirty_1_N (
15684                             val,
15685                             0/*regparms*/,
15686                             "x86g_dirtyhelper_RDTSC",
15687                             &x86g_dirtyhelper_RDTSC,
15688                             args
15689                          );
15690          /* execute the dirty call, dumping the result in val. */
15691          stmt( IRStmt_Dirty(d) );
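         /* As on real hardware, RDTSC delivers the 64-bit time-stamp
            counter in EDX:EAX, so split the helper's I64 result. */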
15692          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
15693          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
15694          DIP("rdtsc\n");
15695          break;
15696       }
15697 
15698       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
15699 
15700       case 0xA1: /* POP %FS */
15701          dis_pop_segreg( R_FS, sz ); break;
15702       case 0xA9: /* POP %GS */
15703          dis_pop_segreg( R_GS, sz ); break;
15704 
15705       case 0xA0: /* PUSH %FS */
15706          dis_push_segreg( R_FS, sz ); break;
15707       case 0xA8: /* PUSH %GS */
15708          dis_push_segreg( R_GS, sz ); break;
15709 
15710       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
15711       case 0x90: /* set-Ob (overflow) */
15712       case 0x91: /* set-NOb (no overflow) */
15713       case 0x92: /* set-Bb/set-NAEb (below) */
15714       case 0x93: /* set-NBb/set-AEb (not below) */
15715       case 0x94: /* set-Zb/set-Eb (zero) */
15716       case 0x95: /* set-NZb/set-NEb (not zero) */
15717       case 0x96: /* set-BEb/set-NAb (below or equal) */
15718       case 0x97: /* set-NBEb/set-Ab (not below or equal) */
15719       case 0x98: /* set-Sb (negative) */
15720       case 0x99: /* set-NSb (not negative) */
15721       case 0x9A: /* set-P (parity even) */
15722       case 0x9B: /* set-NP (parity odd) */
15723       case 0x9C: /* set-Lb/set-NGEb (less) */
15724       case 0x9D: /* set-GEb/set-NLb (greater or equal) */
15725       case 0x9E: /* set-LEb/set-NGb (less or equal) */
15726       case 0x9F: /* set-Gb/set-NLEb (greater) */
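         /* The calculated condition is an Ity_I1; widen it to the 0/1
            byte value that SETcc writes to its destination. */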
15727          t1 = newTemp(Ity_I8);
15728          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
15729          modrm = getIByte(delta);
15730          if (epartIsReg(modrm)) {
15731             delta++;
15732             putIReg(1, eregOfRM(modrm), mkexpr(t1));
15733             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
15734                               nameIReg(1,eregOfRM(modrm)));
15735          } else {
15736            addr = disAMode ( &alen, sorb, delta, dis_buf );
15737            delta += alen;
15738            storeLE( mkexpr(addr), mkexpr(t1) );
15739            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
15740          }
15741          break;
15742 
15743       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
15744 
15745       case 0xA4: /* SHLDv imm8,Gv,Ev */
15746          modrm = getIByte(delta);
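         /* d32 is reused here as the code offset of the imm8 shift
            count, which sits just past the ModRM/SIB/displacement
            bytes measured by lengthAMode. */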
15747          d32   = delta + lengthAMode(delta);
15748          vex_sprintf(dis_buf, "$%d", getIByte(d32));
15749          delta = dis_SHLRD_Gv_Ev (
15750                   sorb, delta, modrm, sz,
15751                   mkU8(getIByte(d32)), True, /* literal */
15752                   dis_buf, True );
15753          break;
15754       case 0xA5: /* SHLDv %cl,Gv,Ev */
15755          modrm = getIByte(delta);
15756          delta = dis_SHLRD_Gv_Ev (
15757                     sorb, delta, modrm, sz,
15758                     getIReg(1,R_ECX), False, /* not literal */
15759                     "%cl", True );
15760          break;
15761 
15762       case 0xAC: /* SHRDv imm8,Gv,Ev */
15763          modrm = getIByte(delta);
15764          d32   = delta + lengthAMode(delta);
15765          vex_sprintf(dis_buf, "$%d", getIByte(d32));
15766          delta = dis_SHLRD_Gv_Ev (
15767                     sorb, delta, modrm, sz,
15768                     mkU8(getIByte(d32)), True, /* literal */
15769                     dis_buf, False );
15770          break;
15771       case 0xAD: /* SHRDv %cl,Gv,Ev */
15772          modrm = getIByte(delta);
15773          delta = dis_SHLRD_Gv_Ev (
15774                     sorb, delta, modrm, sz,
15775                     getIReg(1,R_ECX), False, /* not literal */
15776                     "%cl", False );
15777          break;
15778 
15779       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
15780 
15781       case 0x34:
15782          /* Simple implementation needing a long explanation.
15783 
15784             sysenter is a kind of syscall entry.  The key thing here
15785             is that the return address is not known -- that is
15786             something that is beyond Vex's knowledge.  So this IR
15787             forces a return to the scheduler, which can do what it
15788             likes to simulate the sysenter, but it MUST set this
15789             thread's guest_EIP field with the continuation address
15790             before resuming execution.  If that doesn't happen, the
15791             thread will jump to address zero, which is probably
15792             fatal.
15793          */
15794 
15795          /* Note where we are, so we can back up the guest to this
15796             point if the syscall needs to be restarted. */
15797          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
15798                            mkU32(guest_EIP_curr_instr) ) );
15799          jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
15800          vassert(dres.whatNext == Dis_StopHere);
15801          DIP("sysenter\n");
15802          break;
15803 
15804       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
15805 
15806       case 0xC0: { /* XADD Gb,Eb */
15807          Bool decodeOK;
15808          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
15809          if (!decodeOK) goto decode_failure;
15810          break;
15811       }
15812       case 0xC1: { /* XADD Gv,Ev */
15813          Bool decodeOK;
15814          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
15815          if (!decodeOK) goto decode_failure;
15816          break;
15817       }
15818 
15819       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
15820 
15821       case 0x71:
15822       case 0x72:
15823       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
15824 
15825       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
15826       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
15827       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
15828       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
15829 
15830       case 0xFC:
15831       case 0xFD:
15832       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
15833 
15834       case 0xEC:
15835       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
15836 
15837       case 0xDC:
15838       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15839 
15840       case 0xF8:
15841       case 0xF9:
15842       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
15843 
15844       case 0xE8:
15845       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
15846 
15847       case 0xD8:
15848       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15849 
15850       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
15851       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
15852 
15853       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
15854 
15855       case 0x74:
15856       case 0x75:
15857       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
15858 
15859       case 0x64:
15860       case 0x65:
15861       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
15862 
15863       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
15864       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
15865       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
15866 
15867       case 0x68:
15868       case 0x69:
15869       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
15870 
15871       case 0x60:
15872       case 0x61:
15873       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
15874 
15875       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
15876       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
15877       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
15878       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
15879 
15880       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
15881       case 0xF2:
15882       case 0xF3:
15883 
15884       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
15885       case 0xD2:
15886       case 0xD3:
15887 
15888       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
15889       case 0xE2:
15890       {
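         /* All of the MMX cases above are handled by dis_MMX, which
            expects to see the opcode byte itself -- hence it is passed
            delta-1.  delta0 records that position so delta can be
            restored if dis_MMX rejects the encoding. */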
15891          Int  delta0    = delta-1;
15892          Bool decode_OK = False;
15893 
15894          /* If sz==2 this is SSE, and we assume sse idec has
15895             already spotted those cases by now. */
15896          if (sz != 4)
15897             goto decode_failure;
15898 
15899          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
15900          if (!decode_OK) {
15901             delta = delta0;
15902             goto decode_failure;
15903          }
15904          break;
15905       }
15906 
15907       case 0x0E: /* FEMMS */
15908       case 0x77: /* EMMS */
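         /* (F)EMMS marks the x87/MMX register file as empty again so
            that x87 code can safely follow MMX code; do_EMMS_preamble
            updates the guest FP tag state to reflect that. */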
15909          if (sz != 4)
15910             goto decode_failure;
15911          do_EMMS_preamble();
15912          DIP("{f}emms\n");
15913          break;
15914 
15915       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
15916       case 0x01: /* 0F 01 /0 -- SGDT */
15917                  /* 0F 01 /1 -- SIDT */
15918                  /* 0F 01 /2 -- LGDT */
15919                  /* 0F 01 /3 -- LIDT */
15920       {
15921           /* This is really revolting, but ... since each processor
15922              (core) only has one IDT and one GDT, just let the guest
15923              see it (pass-through semantics).  I can't see any way to
15924              construct a faked-up value, so don't bother to try. */
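         /* The memory operand is the 6-byte pseudo-descriptor used by
            SGDT/SIDT/LGDT/LIDT on x86: a 16-bit limit followed by a
            32-bit linear base address -- hence mSize = 6 below. */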
15925          Int g;
15926          modrm = getUChar(delta);
15927          if (epartIsReg(modrm))
15928            goto decode_failure;
15929 
15930          g = gregOfRM(modrm);
15931          if (g < 0 || g > 3)
15932             goto decode_failure;
15933 
15934          addr = disAMode ( &alen, sorb, delta, dis_buf );
15935          delta += alen;
15936 
15937          IRDirty* d = NULL;
15938          switch (g) {
15939             case 0: case 1:
15940                DIP("%s %s\n", g == 0 ? "sgdt" : "sidt", dis_buf);
15941                d = unsafeIRDirty_0_N (
15942                           0/*regparms*/,
15943                           "x86g_dirtyhelper_SxDT",
15944                           &x86g_dirtyhelper_SxDT,
15945                           mkIRExprVec_2( mkexpr(addr),
15946                                          mkU32(gregOfRM(modrm)) )
15947                       );
15948                /* declare we're writing memory */
15949                d->mFx   = Ifx_Write;
15950                d->mAddr = mkexpr(addr);
15951                d->mSize = 6;
15952                break;
15953             case 2: case 3:
15954                DIP("%s %s\n", g == 2 ? "lgdt" : "lidt", dis_buf);
15955                d = unsafeIRDirty_0_N (
15956                           0/*regparms*/,
15957                           "x86g_dirtyhelper_LGDT_LIDT",
15958                           &x86g_dirtyhelper_LGDT_LIDT,
15959                           mkIRExprVec_2( mkexpr(addr),
15960                                          mkU32(gregOfRM(modrm)) )
15961                       );
15962                /* declare we're reading memory */
15963                d->mFx   = Ifx_Read;
15964                d->mAddr = mkexpr(addr);
15965                d->mSize = 6;
15966                break;
15967             default: vassert(0); /*NOTREACHED*/
15968          }
15969 
15970          vassert(d);
15971 
15972          stmt( IRStmt_Dirty(d) );
15973          break;
15974       }
15975 
15976       case 0x05: /* AMD's syscall */
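         /* Unlike sysenter above, the continuation address here is
            simply the next instruction, so it can be handed to jmp_lit
            directly rather than forcing a bogus value. */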
15977          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
15978                            mkU32(guest_EIP_curr_instr) ) );
15979          jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta);
15980          vassert(dres.whatNext == Dis_StopHere);
15981          DIP("syscall\n");
15982          break;
15983 
15984       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
15985 
15986       default:
15987          goto decode_failure;
15988    } /* switch (opc) for the 2-byte opcodes */
15989    goto decode_success;
15990    } /* case 0x0F: of primary opcode */
15991 
15992    /* ------------------------ ??? ------------------------ */
15993 
15994   default:
15995   decode_failure:
15996    /* All decode failures end up here. */
15997    if (sigill_diag) {
15998       vex_printf("vex x86->IR: unhandled instruction bytes: "
15999                  "0x%x 0x%x 0x%x 0x%x\n",
16000                  getIByte(delta_start+0),
16001                  getIByte(delta_start+1),
16002                  getIByte(delta_start+2),
16003                  getIByte(delta_start+3));
16004    }
16005 
16006    /* Tell the dispatcher that this insn cannot be decoded, and so has
16007       not been executed, and (is currently) the next to be executed.
16008       EIP should be up-to-date since it was made so at the start of each
16009       insn, but nevertheless be paranoid and update it again right
16010       now. */
16011    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
16012    jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
16013    vassert(dres.whatNext == Dis_StopHere);
16014    dres.len = 0;
16015    /* We also need to say that a CAS is not expected now, regardless
16016       of what it might have been set to at the start of the function,
16017       since the IR that we've emitted just above (to synthesise a
16018       SIGILL) does not involve any CAS, and presumably no other IR has
16019       been emitted for this (non-decoded) insn. */
16020    *expect_CAS = False;
16021    return dres;
16022 
16023    } /* switch (opc) for the main (primary) opcode switch. */
16024 
16025   decode_success:
16026    /* All decode successes end up here. */
16027    switch (dres.whatNext) {
16028       case Dis_Continue:
16029          stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
16030          break;
16031       case Dis_ResteerU:
16032       case Dis_ResteerC:
16033          stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
16034          break;
16035       case Dis_StopHere:
16036          break;
16037       default:
16038          vassert(0);
16039    }
16040 
16041    DIP("\n");
16042    dres.len = delta - delta_start;
16043    return dres;
16044 }
16045 
16046 #undef DIP
16047 #undef DIS
16048 
16049 
16050 /*------------------------------------------------------------*/
16051 /*--- Top-level fn                                         ---*/
16052 /*------------------------------------------------------------*/
16053 
16054 /* Disassemble a single instruction into IR.  The instruction
16055    is located in host memory at &guest_code[delta]. */
16056 
16057 DisResult disInstr_X86 ( IRSB*        irsb_IN,
16058                          Bool         (*resteerOkFn) ( void*, Addr ),
16059                          Bool         resteerCisOk,
16060                          void*        callback_opaque,
16061                          const UChar* guest_code_IN,
16062                          Long         delta,
16063                          Addr         guest_IP,
16064                          VexArch      guest_arch,
16065                          const VexArchInfo* archinfo,
16066                          const VexAbiInfo*  abiinfo,
16067                          VexEndness   host_endness_IN,
16068                          Bool         sigill_diag_IN )
16069 {
16070    Int       i, x1, x2;
16071    Bool      expect_CAS, has_CAS;
16072    DisResult dres;
16073 
16074    /* Set globals (see top of this file) */
16075    vassert(guest_arch == VexArchX86);
16076    guest_code           = guest_code_IN;
16077    irsb                 = irsb_IN;
16078    host_endness         = host_endness_IN;
16079    guest_EIP_curr_instr = (Addr32)guest_IP;
16080    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
16081 
16082    x1 = irsb_IN->stmts_used;
16083    expect_CAS = False;
16084    dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
16085                              resteerCisOk,
16086                              callback_opaque,
16087                              delta, archinfo, abiinfo, sigill_diag_IN );
16088    x2 = irsb_IN->stmts_used;
16089    vassert(x2 >= x1);
16090 
16091    /* See comment at the top of disInstr_X86_WRK for meaning of
16092       expect_CAS.  Here, we (sanity-)check for the presence/absence of
16093       IRCAS as directed by the returned expect_CAS value. */
16094    has_CAS = False;
16095    for (i = x1; i < x2; i++) {
16096       if (irsb_IN->stmts[i]->tag == Ist_CAS)
16097          has_CAS = True;
16098    }
16099 
16100    if (expect_CAS != has_CAS) {
16101       /* inconsistency detected.  re-disassemble the instruction so as
16102          to generate a useful error message; then assert. */
16103       vex_traceflags |= VEX_TRACE_FE;
16104       dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
16105                                 resteerCisOk,
16106                                 callback_opaque,
16107                                 delta, archinfo, abiinfo, sigill_diag_IN );
16108       for (i = x1; i < x2; i++) {
16109          vex_printf("\t\t");
16110          ppIRStmt(irsb_IN->stmts[i]);
16111          vex_printf("\n");
16112       }
16113       /* Failure of this assertion is serious and denotes a bug in
16114          disInstr. */
16115       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
16116    }
16117 
16118    return dres;
16119 }
16120 
16121 
16122 /*--------------------------------------------------------------------*/
16123 /*--- end                                         guest_x86_toIR.c ---*/
16124 /*--------------------------------------------------------------------*/
16125