1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                                 host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2017 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 #include "libvex_basictypes.h"
37 #include "libvex_ir.h"
38 #include "libvex.h"
39 
40 #include "ir_match.h"
41 #include "main_util.h"
42 #include "main_globals.h"
43 #include "host_generic_regs.h"
44 #include "host_generic_simd64.h"
45 #include "host_generic_simd128.h"
46 #include "host_generic_simd256.h"
47 #include "host_generic_maddf.h"
48 #include "host_amd64_defs.h"
49 
50 
51 /*---------------------------------------------------------*/
52 /*--- x87/SSE control word stuff                        ---*/
53 /*---------------------------------------------------------*/
54 
55 /* Vex-generated code expects to run with the FPU set as follows: all
56    exceptions masked, round-to-nearest, precision = 53 bits.  This
57    corresponds to a FPU control word value of 0x027F.
58 
59    Similarly the SSE control word (%mxcsr) should be 0x1F80.
60 
61    %fpucw and %mxcsr should have these values on entry to
62    Vex-generated code, and those values should be
63    unchanged at exit.
64 */
65 
66 #define DEFAULT_FPUCW 0x027F
67 
68 #define DEFAULT_MXCSR 0x1F80
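
/* Illustrative only -- a hedged sketch of how the two constants above
   decompose into fields, following the usual x87 FPUCW and SSE MXCSR
   layouts.  Not used anywhere; the #defines above are authoritative. */
#if 0
#define FPUCW_ALL_EXNS_MASKED  0x003F      /* IM DM ZM OM UM PM, bits 0..5 */
#define FPUCW_RESERVED_BIT6    0x0040      /* bit 6 is reserved, reads as 1 */
#define FPUCW_PC_53BIT         (2 << 8)    /* precision control = 53 bits */
#define FPUCW_RC_NEAREST       (0 << 10)   /* round to nearest */
/* 0x003F | 0x0040 | 0x0200 | 0x0000 == 0x027F == DEFAULT_FPUCW */

#define MXCSR_ALL_EXNS_MASKED  (0x3F << 7) /* exception mask bits 7..12 */
#define MXCSR_RC_NEAREST       (0 << 13)   /* round to nearest */
/* (0x3F << 7) | 0x0000 == 0x1F80 == DEFAULT_MXCSR */
#endif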
69 
70 /* debugging only, do not use */
71 /* define DEFAULT_FPUCW 0x037F */
72 
73 
74 /*---------------------------------------------------------*/
75 /*--- misc helpers                                      ---*/
76 /*---------------------------------------------------------*/
77 
78 /* These are duplicated in guest-amd64/toIR.c */
79 static IRExpr* unop ( IROp op, IRExpr* a )
80 {
81    return IRExpr_Unop(op, a);
82 }
83 
84 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
85 {
86    return IRExpr_Binop(op, a1, a2);
87 }
88 
89 static IRExpr* bind ( Int binder )
90 {
91    return IRExpr_Binder(binder);
92 }
93 
94 static Bool isZeroU8 ( const IRExpr* e )
95 {
96    return e->tag == Iex_Const
97           && e->Iex.Const.con->tag == Ico_U8
98           && e->Iex.Const.con->Ico.U8 == 0;
99 }
100 
101 
102 /*---------------------------------------------------------*/
103 /*--- ISelEnv                                           ---*/
104 /*---------------------------------------------------------*/
105 
106 /* This carries around:
107 
108    - A mapping from IRTemp to IRType, giving the type of any IRTemp we
109      might encounter.  This is computed before insn selection starts,
110      and does not change.
111 
112    - A mapping from IRTemp to HReg.  This tells the insn selector
113      which virtual register is associated with each IRTemp
114      temporary.  This is computed before insn selection starts, and
115      does not change.  We expect this mapping to map precisely the
116      same set of IRTemps as the type mapping does.
117 
118         - vregmap   holds the primary register for the IRTemp.
119         - vregmapHI is only used for 128-bit integer-typed
120              IRTemps.  It holds the identity of a second
121              64-bit virtual HReg, which holds the high half
122              of the value.
123 
124    - The host subarchitecture we are selecting insns for.
125      This is set at the start and does not change.
126 
127    - The code array, that is, the insns selected so far.
128 
129    - A counter, for generating new virtual registers.
130 
131    - A Bool for indicating whether we may generate chain-me
132      instructions for control flow transfers, or whether we must use
133      XAssisted.
134 
135    - The maximum guest address of any guest insn in this block.
136      Actually, the address of the highest-addressed byte from any insn
137      in this block.  It is set at the start and does not change.  This is
138      used for detecting jumps which are definitely forward-edges from
139      this block, and therefore can be made (chained) to the fast entry
140      point of the destination, thereby avoiding the destination's
141      event check.
142 
143    Note, this is all host-independent.  (JRS 20050201: well, kinda
144    ... not completely.  Compare with ISelEnv for X86.)
145 */
146 
147 typedef
148    struct {
149       /* Constant fields -- set at the start and do not change. */
150       IRTypeEnv*   type_env;
151 
152       HReg*        vregmap;
153       HReg*        vregmapHI;
154       Int          n_vregmap;
155 
156       UInt         hwcaps;
157 
158       Bool         chainingAllowed;
159       Addr64       max_ga;
160 
161       /* These are modified as we go along. */
162       HInstrArray* code;
163       Int          vreg_ctr;
164    }
165    ISelEnv;
166 
167 
168 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
169 {
170    vassert(tmp >= 0);
171    vassert(tmp < env->n_vregmap);
172    return env->vregmap[tmp];
173 }
174 
175 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
176                                ISelEnv* env, IRTemp tmp )
177 {
178    vassert(tmp >= 0);
179    vassert(tmp < env->n_vregmap);
180    vassert(! hregIsInvalid(env->vregmapHI[tmp]));
181    *vrLO = env->vregmap[tmp];
182    *vrHI = env->vregmapHI[tmp];
183 }
184 
185 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
186 {
187    addHInstr(env->code, instr);
188    if (vex_traceflags & VEX_TRACE_VCODE) {
189       ppAMD64Instr(instr, True);
190       vex_printf("\n");
191    }
192 }
193 
194 static HReg newVRegI ( ISelEnv* env )
195 {
196    HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
197    env->vreg_ctr++;
198    return reg;
199 }
200 
201 static HReg newVRegV ( ISelEnv* env )
202 {
203    HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
204    env->vreg_ctr++;
205    return reg;
206 }
207 
208 
209 /*---------------------------------------------------------*/
210 /*--- ISEL: Forward declarations                        ---*/
211 /*---------------------------------------------------------*/
212 
213 /* These are organised as iselXXX and iselXXX_wrk pairs.  The
214    iselXXX_wrk do the real work, but are not to be called directly.
215    For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
216    checks that all returned registers are virtual.  You should not
217    call the _wrk version directly.
218 */
219 static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
220 static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );
221 
222 static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
223 static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );
224 
225 static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
226 static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );
227 
228 static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
229 static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );
230 
231 static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
232 static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );
233 
234 static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
235                                           ISelEnv* env, const IRExpr* e );
236 static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
237                                           ISelEnv* env, const IRExpr* e );
238 
239 static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, const IRExpr* e );
240 static AMD64CondCode iselCondCode        ( ISelEnv* env, const IRExpr* e );
241 
242 static HReg          iselDblExpr_wrk     ( ISelEnv* env, const IRExpr* e );
243 static HReg          iselDblExpr         ( ISelEnv* env, const IRExpr* e );
244 
245 static HReg          iselFltExpr_wrk     ( ISelEnv* env, const IRExpr* e );
246 static HReg          iselFltExpr         ( ISelEnv* env, const IRExpr* e );
247 
248 static HReg          iselVecExpr_wrk     ( ISelEnv* env, const IRExpr* e );
249 static HReg          iselVecExpr         ( ISelEnv* env, const IRExpr* e );
250 
251 static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
252                                         ISelEnv* env, const IRExpr* e );
253 static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
254                                         ISelEnv* env, const IRExpr* e );
255 
256 
257 /*---------------------------------------------------------*/
258 /*--- ISEL: Misc helpers                                ---*/
259 /*---------------------------------------------------------*/
260 
261 static Bool sane_AMode ( AMD64AMode* am )
262 {
263    switch (am->tag) {
264       case Aam_IR:
265          return
266             toBool( hregClass(am->Aam.IR.reg) == HRcInt64
267                     && (hregIsVirtual(am->Aam.IR.reg)
268                         || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
269       case Aam_IRRS:
270          return
271             toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
272                     && hregIsVirtual(am->Aam.IRRS.base)
273                     && hregClass(am->Aam.IRRS.index) == HRcInt64
274                     && hregIsVirtual(am->Aam.IRRS.index) );
275       default:
276         vpanic("sane_AMode: unknown amd64 amode tag");
277    }
278 }
279 
280 
281 /* Can the lower 32 bits be signedly widened to produce the whole
282    64-bit value?  In other words, are the top 33 bits either all 0 or
283    all 1 ? */
284 static Bool fitsIn32Bits ( ULong x )
285 {
286    Long y1;
287    y1 = x << 32;
288    y1 >>=/*s*/ 32;
289    return toBool(x == y1);
290 }
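
/* Worked examples for the above, as a hedged sketch (not compiled in):
   the predicate holds exactly when sign-extending the low 32 bits
   reproduces the full 64-bit value. */
#if 0
static void example_fitsIn32Bits ( void )
{
   vassert(  fitsIn32Bits(0x000000007FFFFFFFULL) );  /* top 33 bits all 0 */
   vassert(  fitsIn32Bits(0xFFFFFFFF80000000ULL) );  /* top 33 bits all 1 */
   vassert( !fitsIn32Bits(0x0000000080000000ULL) );  /* widens to 0xFFFFFFFF80000000 */
}
#endif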
291 
292 /* Is this a 64-bit zero expression? */
293 
294 static Bool isZeroU64 ( const IRExpr* e )
295 {
296    return e->tag == Iex_Const
297           && e->Iex.Const.con->tag == Ico_U64
298           && e->Iex.Const.con->Ico.U64 == 0ULL;
299 }
300 
301 static Bool isZeroU32 ( const IRExpr* e )
302 {
303    return e->tag == Iex_Const
304           && e->Iex.Const.con->tag == Ico_U32
305           && e->Iex.Const.con->Ico.U32 == 0;
306 }
307 
308 /* Are both args atoms and the same?  This is a copy of eqIRAtom
309    that omits the assertions that the args are indeed atoms. */
310 
311 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
312 {
313    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
314       return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
315    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
316       return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
317    return False;
318 }
319 
320 /* Make an int reg-reg move. */
321 
322 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
323 {
324    vassert(hregClass(src) == HRcInt64);
325    vassert(hregClass(dst) == HRcInt64);
326    return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
327 }
328 
329 /* Make a vector (128 bit) reg-reg move. */
330 
331 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
332 {
333    vassert(hregClass(src) == HRcVec128);
334    vassert(hregClass(dst) == HRcVec128);
335    return AMD64Instr_SseReRg(Asse_MOV, src, dst);
336 }
337 
338 /* Advance/retreat %rsp by n. */
339 
340 static void add_to_rsp ( ISelEnv* env, Int n )
341 {
342    vassert(n > 0 && n < 256 && (n%8) == 0);
343    addInstr(env,
344             AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
345                                         hregAMD64_RSP()));
346 }
347 
348 static void sub_from_rsp ( ISelEnv* env, Int n )
349 {
350    vassert(n > 0 && n < 256 && (n%8) == 0);
351    addInstr(env,
352             AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
353                                         hregAMD64_RSP()));
354 }
355 
356 /* Push 64-bit constants on the stack. */
357 static void push_uimm64( ISelEnv* env, ULong uimm64 )
358 {
359    /* If uimm64 can be expressed as the sign extension of its
360       lower 32 bits, we can do it the easy way. */
361    Long simm64 = (Long)uimm64;
362    if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
363       addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
364    } else {
365       HReg tmp = newVRegI(env);
366       addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
367       addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
368    }
369 }
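
/* Hedged usage sketch (not compiled in).  A value whose low 32 bits
   sign-extend back to the full value needs only a single push of a
   32-bit immediate (pushq sign-extends); anything else goes via a
   64-bit immediate load into a fresh vreg followed by a register push. */
#if 0
static void example_push_uimm64 ( ISelEnv* env )
{
   push_uimm64(env, 0xFFFFFFFF80000000ULL);  /* one insn: push of imm32 0x80000000 */
   push_uimm64(env, 0x000000123456789AULL);  /* Imm64 into a vreg, then push that vreg */
}
#endif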
370 
371 
372 /* Used only in doHelperCall.  If possible, produce a single
373    instruction which computes 'e' into 'dst'.  If not possible, return
374    NULL. */
375 
376 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
377                                                     HReg     dst,
378                                                     IRExpr*  e )
379 {
380    /* Per comments in doHelperCall below, appearance of
381       Iex_VECRET implies ill-formed IR. */
382    vassert(e->tag != Iex_VECRET);
383 
384    /* In this case we give out a copy of the BaseBlock pointer. */
385    if (UNLIKELY(e->tag == Iex_GSPTR)) {
386       return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
387    }
388 
389    vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
390 
391    if (e->tag == Iex_Const) {
392       vassert(e->Iex.Const.con->tag == Ico_U64);
393       if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
394          return AMD64Instr_Alu64R(
395                    Aalu_MOV,
396                    AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
397                    dst
398                 );
399       } else {
400          return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
401       }
402    }
403 
404    if (e->tag == Iex_RdTmp) {
405       HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
406       return mk_iMOVsd_RR(src, dst);
407    }
408 
409    if (e->tag == Iex_Get) {
410       vassert(e->Iex.Get.ty == Ity_I64);
411       return AMD64Instr_Alu64R(
412                 Aalu_MOV,
413                 AMD64RMI_Mem(
414                    AMD64AMode_IR(e->Iex.Get.offset,
415                                  hregAMD64_RBP())),
416                 dst);
417    }
418 
419    if (e->tag == Iex_Unop
420        && e->Iex.Unop.op == Iop_32Uto64
421        && e->Iex.Unop.arg->tag == Iex_RdTmp) {
422       HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
423       return AMD64Instr_MovxLQ(False, src, dst);
424    }
425 
426    if (0) { ppIRExpr(e); vex_printf("\n"); }
427 
428    return NULL;
429 }
430 
431 
432 /* Do a complete function call.  |guard| is a Ity_Bit expression
433    indicating whether or not the call happens.  If guard==NULL, the
434    call is unconditional.  |retloc| is set to indicate where the
435    return value is after the call.  The caller (of this fn) must
436    generate code to add |stackAdjustAfterCall| to the stack pointer
437    after the call is done. */
438 
439 static
440 void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
441                     /*OUT*/RetLoc* retloc,
442                     ISelEnv* env,
443                     IRExpr* guard,
444                     IRCallee* cee, IRType retTy, IRExpr** args )
445 {
446    AMD64CondCode cc;
447    HReg          argregs[6];
448    HReg          tmpregs[6];
449    AMD64Instr*   fastinstrs[6];
450    UInt          n_args, i;
451 
452    /* Set default returns.  We'll update them later if needed. */
453    *stackAdjustAfterCall = 0;
454    *retloc               = mk_RetLoc_INVALID();
455 
456    /* These are used for cross-checking that IR-level constraints on
457       the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
458    UInt nVECRETs = 0;
459    UInt nGSPTRs  = 0;
460 
461    /* Marshal args for a call and do the call.
462 
463       This function only deals with a tiny set of possibilities, which
464       cover all helpers in practice.  The restrictions are that only
465       arguments in registers are supported, hence only 6x64 integer
466       bits in total can be passed.  In fact the only supported arg
467       type is I64.
468 
469       The return type can be I{64,32,16,8} or V{128,256}.  In the
470       latter two cases, it is expected that |args| will contain the
471       special node IRExpr_VECRET(), in which case this routine
472       generates code to allocate space on the stack for the vector
473       return value.  Since we are not passing any scalars on the
474       stack, it is enough to preallocate the return space before
475       marshalling any arguments, in this case.
476 
477       |args| may also contain IRExpr_GSPTR(), in which case the
478       value in %rbp is passed as the corresponding argument.
479 
480       Generating code which is both efficient and correct when
481       parameters are to be passed in registers is difficult, for the
482       reasons elaborated in detail in comments attached to
483       doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
484       of the method described in those comments.
485 
486       The problem is split into two cases: the fast scheme and the
487       slow scheme.  In the fast scheme, arguments are computed
488       directly into the target (real) registers.  This is only safe
489       when we can be sure that computation of each argument will not
490       trash any real registers set by computation of any other
491       argument.
492 
493       In the slow scheme, all args are first computed into vregs, and
494       once they are all done, they are moved to the relevant real
495       regs.  This always gives correct code, but it also gives a bunch
496       of vreg-to-rreg moves which are usually redundant but are hard
497       for the register allocator to get rid of.
498 
499       To decide which scheme to use, all argument expressions are
500       first examined.  If they are all so simple that it is clear they
501       will be evaluated without use of any fixed registers, use the
502       fast scheme, else use the slow scheme.  Note also that only
503       unconditional calls may use the fast scheme, since having to
504       compute a condition expression could itself trash real
505       registers.  Note that for simplicity, in the case where
506       IRExpr_VECRET() is present, we use the slow scheme.  This is
507       motivated by the desire to avoid any possible complexity
508       w.r.t. nested calls.
509 
510       Note this requires being able to examine an expression and
511       determine whether or not evaluation of it might use a fixed
512       register.  That requires knowledge of how the rest of this insn
513       selector works.  Currently just the following 3 are regarded as
514       safe -- hopefully they cover the majority of arguments in
515       practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
516    */
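
   /* Worked example (informal): for a call like h(t1, 42, GSPTR), every
      arg is computable by one instruction using no fixed registers
      (mov-from-vreg, mov-imm32, copy of %rbp), so the fast scheme
      applies.  For h(Add64(t1,t2)) the argument needs a scratch
      computation, iselIntExpr_single_instruction returns NULL below,
      and we fall back to the slow scheme. */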
517 
518    /* Note that the cee->regparms field is meaningless on AMD64 host
519       (since there is only one calling convention) and so we always
520       ignore it. */
521    n_args = 0;
522    for (i = 0; args[i]; i++)
523       n_args++;
524 
525    if (n_args > 6)
526       vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
527 
528    argregs[0] = hregAMD64_RDI();
529    argregs[1] = hregAMD64_RSI();
530    argregs[2] = hregAMD64_RDX();
531    argregs[3] = hregAMD64_RCX();
532    argregs[4] = hregAMD64_R8();
533    argregs[5] = hregAMD64_R9();
534 
535    tmpregs[0] = tmpregs[1] = tmpregs[2] =
536    tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
537 
538    fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
539    fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
540 
541    /* First decide which scheme (slow or fast) is to be used.  Assume
542       the fast scheme initially, and select slow if any contraindications
543       (wow) appear. */
544 
545    /* We'll need space on the stack for the return value.  Avoid
546       possible complications with nested calls by using the slow
547       scheme. */
548    if (retTy == Ity_V128 || retTy == Ity_V256)
549       goto slowscheme;
550 
551    if (guard) {
552       if (guard->tag == Iex_Const
553           && guard->Iex.Const.con->tag == Ico_U1
554           && guard->Iex.Const.con->Ico.U1 == True) {
555          /* unconditional */
556       } else {
557          /* Not manifestly unconditional -- be conservative. */
558          goto slowscheme;
559       }
560    }
561 
562    /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
563       use the slow scheme.  Because this is tentative, we can't call
564       addInstr (that is, commit to) any instructions until we've
565       handled all the arguments.  So park the resulting instructions
566       in a buffer and emit that if we're successful. */
567 
568    /* FAST SCHEME */
569    /* In this loop, we process args that can be computed into the
570       destination (real) register with a single instruction, without
571       using any fixed regs.  That also includes IRExpr_GSPTR(), but
572       not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
573       never see IRExpr_VECRET() at this point, since the return-type
574       check above should ensure all those cases use the slow scheme
575       instead. */
576    vassert(n_args >= 0 && n_args <= 6);
577    for (i = 0; i < n_args; i++) {
578       IRExpr* arg = args[i];
579       if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
580          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
581       }
582       fastinstrs[i]
583          = iselIntExpr_single_instruction( env, argregs[i], args[i] );
584       if (fastinstrs[i] == NULL)
585          goto slowscheme;
586    }
587 
588    /* Looks like we're in luck.  Emit the accumulated instructions and
589       move on to doing the call itself. */
590    for (i = 0; i < n_args; i++)
591       addInstr(env, fastinstrs[i]);
592 
593    /* Fast scheme only applies for unconditional calls.  Hence: */
594    cc = Acc_ALWAYS;
595 
596    goto handle_call;
597 
598 
599    /* SLOW SCHEME; move via temporaries */
600   slowscheme:
601    {}
602 #  if 0 /* debug only */
603    if (n_args > 0) {for (i = 0; args[i]; i++) {
604    ppIRExpr(args[i]); vex_printf(" "); }
605    vex_printf("\n");}
606 #  endif
607 
608    /* If we have a vector return type, allocate a place for it on the
609       stack and record its address. */
610    HReg r_vecRetAddr = INVALID_HREG;
611    if (retTy == Ity_V128) {
612       r_vecRetAddr = newVRegI(env);
613       sub_from_rsp(env, 16);
614       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
615    }
616    else if (retTy == Ity_V256) {
617       r_vecRetAddr = newVRegI(env);
618       sub_from_rsp(env, 32);
619       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
620    }
621 
622    vassert(n_args >= 0 && n_args <= 6);
623    for (i = 0; i < n_args; i++) {
624       IRExpr* arg = args[i];
625       if (UNLIKELY(arg->tag == Iex_GSPTR)) {
626          tmpregs[i] = newVRegI(env);
627          addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
628          nGSPTRs++;
629       }
630       else if (UNLIKELY(arg->tag == Iex_VECRET)) {
631          /* We stashed the address of the return slot earlier, so just
632             retrieve it now. */
633          vassert(!hregIsInvalid(r_vecRetAddr));
634          tmpregs[i] = r_vecRetAddr;
635          nVECRETs++;
636       }
637       else {
638          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
639          tmpregs[i] = iselIntExpr_R(env, args[i]);
640       }
641    }
642 
643    /* Now we can compute the condition.  We can't do it earlier
644       because the argument computations could trash the condition
645       codes.  Be a bit clever to handle the common case where the
646       guard is 1:Bit. */
647    cc = Acc_ALWAYS;
648    if (guard) {
649       if (guard->tag == Iex_Const
650           && guard->Iex.Const.con->tag == Ico_U1
651           && guard->Iex.Const.con->Ico.U1 == True) {
652          /* unconditional -- do nothing */
653       } else {
654          cc = iselCondCode( env, guard );
655       }
656    }
657 
658    /* Move the args to their final destinations. */
659    for (i = 0; i < n_args; i++) {
660       /* None of these insns, including any spill code that might
661          be generated, may alter the condition codes. */
662       addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
663    }
664 
665 
666    /* Do final checks, set the return values, and generate the call
667       instruction proper. */
668   handle_call:
669 
670    if (retTy == Ity_V128 || retTy == Ity_V256) {
671       vassert(nVECRETs == 1);
672    } else {
673       vassert(nVECRETs == 0);
674    }
675 
676    vassert(nGSPTRs == 0 || nGSPTRs == 1);
677 
678    vassert(*stackAdjustAfterCall == 0);
679    vassert(is_RetLoc_INVALID(*retloc));
680    switch (retTy) {
681          case Ity_INVALID:
682             /* Function doesn't return a value. */
683             *retloc = mk_RetLoc_simple(RLPri_None);
684             break;
685          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
686             *retloc = mk_RetLoc_simple(RLPri_Int);
687             break;
688          case Ity_V128:
689             *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
690             *stackAdjustAfterCall = 16;
691             break;
692          case Ity_V256:
693             *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
694             *stackAdjustAfterCall = 32;
695             break;
696          default:
697             /* IR can denote other possible return types, but we don't
698                handle those here. */
699            vassert(0);
700    }
701 
702    /* Finally, generate the call itself.  This needs the *retloc value
703       set in the switch above, which is why it's at the end. */
704    addInstr(env,
705             AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
706 }
707 
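/* Hedged usage sketch (not compiled in), loosely modelled on a
   dirty-helper call site.  |d| stands for a hypothetical IRDirty* and
   |retTy| for the type of its return temporary (or Ity_INVALID). */
#if 0
static void example_call_site ( ISelEnv* env, IRDirty* d, IRType retTy )
{
   UInt   addToSp = 0;
   RetLoc rloc    = mk_RetLoc_INVALID();
   doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retTy, d->args );
   vassert(!is_RetLoc_INVALID(rloc));
   /* ... then collect the return value as directed by |rloc| ... */
   if (addToSp > 0)
      add_to_rsp(env, addToSp);
}
#endif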
708 
709 /* Given a guest-state array descriptor, an index expression and a
710    bias, generate an AMD64AMode holding the relevant guest state
711    offset. */
712 
713 static
714 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
715                                   IRExpr* off, Int bias )
716 {
717    HReg tmp, roff;
718    Int  elemSz = sizeofIRType(descr->elemTy);
719    Int  nElems = descr->nElems;
720 
721    /* Throw out any cases not generated by an amd64 front end.  In
722       theory there might be a day where we need to handle them -- if
723       we ever run non-amd64-guest on amd64 host. */
724 
725    if (nElems != 8 || (elemSz != 1 && elemSz != 8))
726       vpanic("genGuestArrayOffset(amd64 host)");
727 
728    /* Compute off into a reg, %off.  Then return:
729 
730          movq %off, %tmp
731          addq $bias, %tmp  (if bias != 0)
732          andq $7, %tmp
733          ... base(%rbp, %tmp, shift) ...
734    */
735    tmp  = newVRegI(env);
736    roff = iselIntExpr_R(env, off);
737    addInstr(env, mk_iMOVsd_RR(roff, tmp));
738    if (bias != 0) {
739       /* Make sure the bias is sane, in the sense that there are
740          no significant bits above bit 30 in it. */
741       vassert(-10000 < bias && bias < 10000);
742       addInstr(env,
743                AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
744    }
745    addInstr(env,
746             AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
747    vassert(elemSz == 1 || elemSz == 8);
748    return
749       AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
750                                     elemSz==8 ? 3 : 0);
751 }
752 
753 
754 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
755 static
756 void set_SSE_rounding_default ( ISelEnv* env )
757 {
758    /* pushq $DEFAULT_MXCSR
759       ldmxcsr 0(%rsp)
760       addq $8, %rsp
761    */
762    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
763    addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
764    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
765    add_to_rsp(env, 8);
766 }
767 
768 /* Mess with the FPU's rounding mode: set to the default rounding mode
769    (DEFAULT_FPUCW). */
770 static
771 void set_FPU_rounding_default ( ISelEnv* env )
772 {
773    /* movq $DEFAULT_FPUCW, -8(%rsp)
774       fldcw -8(%rsp)
775    */
776    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
777    addInstr(env, AMD64Instr_Alu64M(
778                     Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
779    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
780 }
781 
782 
783 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
784    expression denoting a value in the range 0 .. 3, indicating a round
785    mode encoded as per type IRRoundingMode.  Set the SSE machinery to
786    have the same rounding.
787 */
788 static
789 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
790 {
791    /* Note: this sequence only makes sense because DEFAULT_MXCSR has
792       both rounding bits == 0.  If that wasn't the case, we couldn't
793       create a new rounding field simply by ORing the new value into
794       place. */
795 
796    /* movq $3, %reg
797       andq [[mode]], %reg  -- shouldn't be needed; paranoia
798       shlq $13, %reg
799       orq $DEFAULT_MXCSR, %reg
800       pushq %reg
801       ldmxcsr 0(%rsp)
802       addq $8, %rsp
803    */
804    HReg        reg      = newVRegI(env);
805    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
806    addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
807    addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
808                                    iselIntExpr_RMI(env, mode), reg));
809    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
810    addInstr(env, AMD64Instr_Alu64R(
811                     Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
812    addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
813    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
814    add_to_rsp(env, 8);
815 }
816 
817 
818 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
819    expression denoting a value in the range 0 .. 3, indicating a round
820    mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
821    the same rounding.
822 */
823 static
824 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
825 {
826    HReg rrm  = iselIntExpr_R(env, mode);
827    HReg rrm2 = newVRegI(env);
828    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
829 
830    /* movq  %rrm, %rrm2
831       andq  $3, %rrm2   -- shouldn't be needed; paranoia
832       shlq  $10, %rrm2
833       orq   $DEFAULT_FPUCW, %rrm2
834       movq  %rrm2, -8(%rsp)
835       fldcw -8(%rsp)
836    */
837    addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
838    addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
839    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
840    addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
841                                    AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
842    addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
843                                    AMD64RI_Reg(rrm2), m8_rsp));
844    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
845 }
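
/* Why a plain shift-and-OR suffices in the two functions above: the
   bottom two bits of IRRoundingMode line up with the hardware RC
   encodings (a hedged recap, not an authoritative statement).

      IR value           x87 RC (bits 11:10)   SSE RC (bits 14:13)
      0 Irrm_NEAREST     00                    00
      1 Irrm_NegINF      01                    01
      2 Irrm_PosINF      10                    10
      3 Irrm_ZERO        11                    11

   Hence (mode << 10) | DEFAULT_FPUCW and (mode << 13) | DEFAULT_MXCSR
   give the desired control words, since both defaults have their RC
   fields equal to zero. */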
846 
847 
848 /* Generate all-zeroes into a new vector register.
849 */
850 static HReg generate_zeroes_V128 ( ISelEnv* env )
851 {
852    HReg dst = newVRegV(env);
853    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
854    return dst;
855 }
856 
857 /* Generate all-ones into a new vector register.
858 */
859 static HReg generate_ones_V128 ( ISelEnv* env )
860 {
861    HReg dst = newVRegV(env);
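   /* cmpeq32 %dst,%dst: every lane compares equal to itself, hence all 1s. */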
862    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
863    return dst;
864 }
865 
866 
867 /* Generate !src into a new vector register.  Amazing that there isn't
868    a less crappy way to do this.
869 */
870 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
871 {
872    HReg dst = generate_ones_V128(env);
873    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
874    return dst;
875 }
876 
877 
878 /* Expand the given byte into a 64-bit word, by cloning each bit
879    8 times. */
880 static ULong bitmask8_to_bytemask64 ( UShort w8 )
881 {
882    vassert(w8 == (w8 & 0xFF));
883    ULong w64 = 0;
884    Int i;
885    for (i = 0; i < 8; i++) {
886       if (w8 & (1<<i))
887          w64 |= (0xFFULL << (8 * i));
888    }
889    return w64;
890 }
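
/* Worked examples (hedged sketch, not compiled in): each set bit of
   the input selects the corresponding byte of the result. */
#if 0
static void example_bitmask8_to_bytemask64 ( void )
{
   vassert( bitmask8_to_bytemask64(0x00) == 0x0000000000000000ULL );
   vassert( bitmask8_to_bytemask64(0x81) == 0xFF000000000000FFULL );
   vassert( bitmask8_to_bytemask64(0xFF) == 0xFFFFFFFFFFFFFFFFULL );
}
#endif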
891 
892 
893 /*---------------------------------------------------------*/
894 /*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
895 /*---------------------------------------------------------*/
896 
897 /* Select insns for an integer-typed expression, and add them to the
898    code list.  Return a reg holding the result.  This reg will be a
899    virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
900    want to modify it, ask for a new vreg, copy it in there, and modify
901    the copy.  The register allocator will do its best to map both
902    vregs to the same real register, so the copies will often disappear
903    later in the game.
904 
905    This should handle expressions of 64, 32, 16 and 8-bit type.  All
906    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
907    expressions, the upper 32/48/56 bits are arbitrary, so you should
908    mask or sign extend partial values if necessary.
909 */
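
/* Hedged caller-side sketch (not compiled in) of the contract above: a
   sub-64-bit result has arbitrary upper bits, so zero-widen it before
   treating it as a full 64-bit value.  |e8| stands for some
   hypothetical Ity_I8 expression. */
#if 0
static HReg example_use_of_I8_result ( ISelEnv* env, IRExpr* e8 )
{
   HReg r8  = iselIntExpr_R(env, e8);    /* bits 8..63 are arbitrary */
   HReg r64 = newVRegI(env);
   addInstr(env, mk_iMOVsd_RR(r8, r64));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0xFF), r64));
   return r64;                           /* now cleanly zero-extended */
}
#endif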
910 
911 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
912 {
913    HReg r = iselIntExpr_R_wrk(env, e);
914    /* sanity checks ... */
915 #  if 0
916    vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
917 #  endif
918    vassert(hregClass(r) == HRcInt64);
919    vassert(hregIsVirtual(r));
920    return r;
921 }
922 
923 /* DO NOT CALL THIS DIRECTLY ! */
924 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
925 {
926    MatchInfo mi;
927    DECLARE_PATTERN(p_1Uto8_64to1);
928    DECLARE_PATTERN(p_LDle8_then_8Uto64);
929    DECLARE_PATTERN(p_LDle16_then_16Uto64);
930 
931    IRType ty = typeOfIRExpr(env->type_env,e);
932    switch (ty) {
933       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
934       default: vassert(0);
935    }
936 
937    switch (e->tag) {
938 
939    /* --------- TEMP --------- */
940    case Iex_RdTmp: {
941       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
942    }
943 
944    /* --------- LOAD --------- */
945    case Iex_Load: {
946       HReg dst = newVRegI(env);
947       AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
948 
949       /* We can't handle big-endian loads, nor load-linked. */
950       if (e->Iex.Load.end != Iend_LE)
951          goto irreducible;
952 
953       if (ty == Ity_I64) {
954          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
955                                          AMD64RMI_Mem(amode), dst) );
956          return dst;
957       }
958       if (ty == Ity_I32) {
959          addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
960          return dst;
961       }
962       if (ty == Ity_I16) {
963          addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
964          return dst;
965       }
966       if (ty == Ity_I8) {
967          addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
968          return dst;
969       }
970       break;
971    }
972 
973    /* --------- BINARY OP --------- */
974    case Iex_Binop: {
975       AMD64AluOp   aluOp;
976       AMD64ShiftOp shOp;
977 
978       /* Pattern: Sub64(0,x) */
979       /*     and: Sub32(0,x) */
980       if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
981           || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
982          HReg dst = newVRegI(env);
983          HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
984          addInstr(env, mk_iMOVsd_RR(reg,dst));
985          addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
986          return dst;
987       }
988 
989       /* Is it an addition or logical style op? */
990       switch (e->Iex.Binop.op) {
991          case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
992             aluOp = Aalu_ADD; break;
993          case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
994             aluOp = Aalu_SUB; break;
995          case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
996             aluOp = Aalu_AND; break;
997          case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
998             aluOp = Aalu_OR; break;
999          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
1000             aluOp = Aalu_XOR; break;
1001          case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1002             aluOp = Aalu_MUL; break;
1003          default:
1004             aluOp = Aalu_INVALID; break;
1005       }
1006       /* For commutative ops we assume any literal
1007          values are on the second operand. */
1008       if (aluOp != Aalu_INVALID) {
1009          HReg dst      = newVRegI(env);
1010          HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
1011          AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1012          addInstr(env, mk_iMOVsd_RR(reg,dst));
1013          addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1014          return dst;
1015       }
1016 
1017       /* Perhaps a shift op? */
1018       switch (e->Iex.Binop.op) {
1019          case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1020             shOp = Ash_SHL; break;
1021          case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1022             shOp = Ash_SHR; break;
1023          case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1024             shOp = Ash_SAR; break;
1025          default:
1026             shOp = Ash_INVALID; break;
1027       }
1028       if (shOp != Ash_INVALID) {
1029          HReg dst = newVRegI(env);
1030 
1031          /* regL = the value to be shifted */
1032          HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1033          addInstr(env, mk_iMOVsd_RR(regL,dst));
1034 
1035          /* Do any necessary widening for 32/16/8 bit operands */
1036          switch (e->Iex.Binop.op) {
1037             case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1038                break;
1039             case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1040                break;
1041             case Iop_Shr8:
1042                addInstr(env, AMD64Instr_Alu64R(
1043                                 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1044                break;
1045             case Iop_Shr16:
1046                addInstr(env, AMD64Instr_Alu64R(
1047                                 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1048                break;
1049             case Iop_Shr32:
1050                addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1051                break;
1052             case Iop_Sar8:
1053                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1054                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1055                break;
1056             case Iop_Sar16:
1057                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1058                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1059                break;
1060             case Iop_Sar32:
1061                addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1062                break;
1063             default:
1064                ppIROp(e->Iex.Binop.op);
1065                vassert(0);
1066          }
1067 
1068          /* Now consider the shift amount.  If it's a literal, we
1069             can do a much better job than the general case. */
1070          if (e->Iex.Binop.arg2->tag == Iex_Const) {
1071             /* assert that the IR is well-typed */
1072             Int nshift;
1073             vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1074             nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1075             vassert(nshift >= 0);
1076             if (nshift > 0)
1077                /* Can't allow nshift==0 since that means %cl */
1078                addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1079          } else {
1080             /* General case; we have to force the amount into %cl. */
1081             HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1082             addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1083             addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1084          }
1085          return dst;
1086       }
1087 
1088       /* Handle misc other scalar ops. */
1089       if (e->Iex.Binop.op == Iop_Max32U) {
1090          HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1091          HReg dst  = newVRegI(env);
1092          HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1093          addInstr(env, mk_iMOVsd_RR(src1, dst));
1094          addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1095          addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1096          return dst;
1097       }
1098 
1099       if (e->Iex.Binop.op == Iop_DivModS64to32
1100           || e->Iex.Binop.op == Iop_DivModU64to32) {
1101          /* 64 x 32 -> (32(rem),32(div)) division */
1102          /* Get the 64-bit operand into edx:eax, and the other into
1103             any old R/M. */
1104          HReg      rax     = hregAMD64_RAX();
1105          HReg      rdx     = hregAMD64_RDX();
1106          HReg      dst     = newVRegI(env);
1107          Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1108          AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1109          /* Compute the left operand into a reg, and then
1110             put the top half in edx and the bottom in eax. */
1111          HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1112          addInstr(env, mk_iMOVsd_RR(left64, rdx));
1113          addInstr(env, mk_iMOVsd_RR(left64, rax));
1114          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1115          addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1116          addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1117          addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1118          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1119          addInstr(env, mk_iMOVsd_RR(rax, dst));
1120          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1121          return dst;
1122       }
1123 
1124       if (e->Iex.Binop.op == Iop_32HLto64) {
1125          HReg hi32  = newVRegI(env);
1126          HReg lo32  = newVRegI(env);
1127          HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1128          HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1129          addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1130          addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1131          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1132          addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1133          addInstr(env, AMD64Instr_Alu64R(
1134                           Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1135          return hi32;
1136       }
1137 
1138       if (e->Iex.Binop.op == Iop_16HLto32) {
1139          HReg hi16  = newVRegI(env);
1140          HReg lo16  = newVRegI(env);
1141          HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1142          HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1143          addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1144          addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1145          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1146          addInstr(env, AMD64Instr_Alu64R(
1147                           Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1148          addInstr(env, AMD64Instr_Alu64R(
1149                           Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1150          return hi16;
1151       }
1152 
1153       if (e->Iex.Binop.op == Iop_8HLto16) {
1154          HReg hi8  = newVRegI(env);
1155          HReg lo8  = newVRegI(env);
1156          HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1157          HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1158          addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1159          addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1160          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1161          addInstr(env, AMD64Instr_Alu64R(
1162                           Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1163          addInstr(env, AMD64Instr_Alu64R(
1164                           Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1165          return hi8;
1166       }
1167 
1168       if (e->Iex.Binop.op == Iop_MullS32
1169           || e->Iex.Binop.op == Iop_MullS16
1170           || e->Iex.Binop.op == Iop_MullS8
1171           || e->Iex.Binop.op == Iop_MullU32
1172           || e->Iex.Binop.op == Iop_MullU16
1173           || e->Iex.Binop.op == Iop_MullU8) {
1174          HReg a32   = newVRegI(env);
1175          HReg b32   = newVRegI(env);
1176          HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1177          HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1178          Int          shift  = 0;
1179          AMD64ShiftOp shr_op = Ash_SHR;
1180          switch (e->Iex.Binop.op) {
1181             case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1182             case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1183             case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
1184             case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1185             case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1186             case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
1187             default: vassert(0);
1188          }
1189 
1190          addInstr(env, mk_iMOVsd_RR(a32s, a32));
1191          addInstr(env, mk_iMOVsd_RR(b32s, b32));
1192          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1193          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1194          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
1195          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
1196          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1197          return b32;
1198       }
1199 
1200       if (e->Iex.Binop.op == Iop_CmpF64) {
1201          HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1202          HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1203          HReg dst = newVRegI(env);
1204          addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1205          /* Mask out irrelevant parts of the result so as to conform
1206             to the CmpF64 definition. */
1207          addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1208          return dst;
1209       }
1210 
1211       if (e->Iex.Binop.op == Iop_F64toI32S
1212           || e->Iex.Binop.op == Iop_F64toI64S) {
1213          Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1214          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1215          HReg dst = newVRegI(env);
1216          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1217          addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1218          set_SSE_rounding_default(env);
1219          return dst;
1220       }
1221 
1222       /* Deal with 64-bit SIMD binary ops.  For the most part these are doable
1223          by using the equivalent 128-bit operation and ignoring the upper half
1224          of the result. */
1225       AMD64SseOp op = Asse_INVALID;
1226       Bool arg1isEReg = False;
1227       Bool preShift32R = False;
1228       switch (e->Iex.Binop.op) {
1229          // The following 3 could be done with 128 bit insns too, but
1230          // first require the inputs to be reformatted.
1231          //case Iop_QNarrowBin32Sto16Sx4:
1232          //op = Asse_PACKSSD; arg1isEReg = True; break;
1233          //case Iop_QNarrowBin16Sto8Sx8:
1234          //op = Asse_PACKSSW; arg1isEReg = True; break;
1235          //case Iop_QNarrowBin16Sto8Ux8:
1236          //op = Asse_PACKUSW; arg1isEReg = True; break;
1237 
1238          case Iop_InterleaveHI8x8:
1239             op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1240             break;
1241          case Iop_InterleaveHI16x4:
1242             op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1243             break;
1244          case Iop_InterleaveHI32x2:
1245             op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1246             break;
1247          case Iop_InterleaveLO8x8:
1248             op = Asse_UNPCKLB; arg1isEReg = True;
1249             break;
1250          case Iop_InterleaveLO16x4:
1251             op = Asse_UNPCKLW; arg1isEReg = True;
1252             break;
1253          case Iop_InterleaveLO32x2:
1254             op = Asse_UNPCKLD; arg1isEReg = True;
1255             break;
1256 
1257          case Iop_Add8x8:     op = Asse_ADD8;     break;
1258          case Iop_Add16x4:    op = Asse_ADD16;    break;
1259          case Iop_Add32x2:    op = Asse_ADD32;    break;
1260          case Iop_QAdd8Sx8:   op = Asse_QADD8S;   break;
1261          case Iop_QAdd16Sx4:  op = Asse_QADD16S;  break;
1262          case Iop_QAdd8Ux8:   op = Asse_QADD8U;   break;
1263          case Iop_QAdd16Ux4:  op = Asse_QADD16U;  break;
1264          case Iop_Avg8Ux8:    op = Asse_AVG8U;    break;
1265          case Iop_Avg16Ux4:   op = Asse_AVG16U;   break;
1266          case Iop_CmpEQ8x8:   op = Asse_CMPEQ8;   break;
1267          case Iop_CmpEQ16x4:  op = Asse_CMPEQ16;  break;
1268          case Iop_CmpEQ32x2:  op = Asse_CMPEQ32;  break;
1269          case Iop_CmpGT8Sx8:  op = Asse_CMPGT8S;  break;
1270          case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1271          case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1272          case Iop_Max16Sx4:   op = Asse_MAX16S;   break;
1273          case Iop_Max8Ux8:    op = Asse_MAX8U;    break;
1274          case Iop_Min16Sx4:   op = Asse_MIN16S;   break;
1275          case Iop_Min8Ux8:    op = Asse_MIN8U;    break;
1276          case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1277          case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1278          case Iop_Mul16x4:    op = Asse_MUL16;    break;
1279          case Iop_Sub8x8:     op = Asse_SUB8;     break;
1280          case Iop_Sub16x4:    op = Asse_SUB16;    break;
1281          case Iop_Sub32x2:    op = Asse_SUB32;    break;
1282          case Iop_QSub8Sx8:   op = Asse_QSUB8S;   break;
1283          case Iop_QSub16Sx4:  op = Asse_QSUB16S;  break;
1284          case Iop_QSub8Ux8:   op = Asse_QSUB8U;   break;
1285          case Iop_QSub16Ux4:  op = Asse_QSUB16U;  break;
1286          default: break;
1287       }
1288       if (op != Asse_INVALID) {
1289          /* This isn't pretty, but .. move each arg to the low half of an XMM
1290             register, do the operation on the whole register, and move the
1291             result back to an integer register. */
1292          const IRExpr* arg1 = e->Iex.Binop.arg1;
1293          const IRExpr* arg2 = e->Iex.Binop.arg2;
1294          vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1295          vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1296          HReg iarg1 = iselIntExpr_R(env, arg1);
1297          HReg iarg2 = iselIntExpr_R(env, arg2);
1298          HReg varg1 = newVRegV(env);
1299          HReg varg2 = newVRegV(env);
1300          HReg idst  = newVRegI(env);
1301          addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1302          addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1303          if (arg1isEReg) {
1304             if (preShift32R) {
1305                addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1306                addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1307             }
1308             addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1309             addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1310          } else {
1311             vassert(!preShift32R);
1312             addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1313             addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1314          }
1315          return idst;
1316       }
1317 
1318       UInt laneBits = 0;
1319       op = Asse_INVALID;
1320       switch (e->Iex.Binop.op) {
1321          case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1322          case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1323          case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1324          case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1325          case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1326          case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1327          default: break;
1328       }
1329       if (op != Asse_INVALID) {
1330          const IRExpr* arg1 = e->Iex.Binop.arg1;
1331          const IRExpr* arg2 = e->Iex.Binop.arg2;
1332          vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1333          vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1334          HReg igreg = iselIntExpr_R(env, arg1);
1335          HReg vgreg = newVRegV(env);
1336          HReg idst  = newVRegI(env);
1337          addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1338          /* If it's a shift by an in-range immediate, generate a single
1339             instruction. */
1340          if (arg2->tag == Iex_Const) {
1341             IRConst* c = arg2->Iex.Const.con;
1342             vassert(c->tag == Ico_U8);
1343             UInt shift = c->Ico.U8;
1344             if (shift < laneBits) {
1345                addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1346                addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1347                return idst;
1348             }
1349          }
1350          /* Otherwise we have to do it the longwinded way. */
1351          HReg ishift = iselIntExpr_R(env, arg2);
1352          HReg vshift = newVRegV(env);
1353          addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1354          addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1355          addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1356          return idst;
1357       }
1358 
1359       if (e->Iex.Binop.op == Iop_Mul32x2) {
1360          const IRExpr* arg1 = e->Iex.Binop.arg1;
1361          const IRExpr* arg2 = e->Iex.Binop.arg2;
1362          vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1363          vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1364          HReg s1 = iselIntExpr_R(env, arg1);
1365          HReg s2 = iselIntExpr_R(env, arg2);
1366          HReg resLo = newVRegI(env);
1367          // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1368          addInstr(env, mk_iMOVsd_RR(s1, resLo));
1369          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1370          addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1371 
1372          // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1373          HReg resHi = newVRegI(env);
1374          addInstr(env, mk_iMOVsd_RR(s1, resHi));
1375          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1376          HReg tmp = newVRegI(env);
1377          addInstr(env, mk_iMOVsd_RR(s2, tmp));
1378          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1379          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1380          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1381 
1382          // final result = resHi | resLo
1383          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1384          return resLo;
1385       }
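      /* For reference, a C-level sketch of the lowering above (illustrative
         only; mul32x2_ref is a hypothetical name, not part of this file).
         It assumes Iop_Mul32x2 is a lane-wise 32x32->32 multiply on a
         64-bit value:

            static ULong mul32x2_ref ( ULong a, ULong b )
            {
               ULong lo = (a * b) & 0xFFFFFFFFULL;          // low lane
               ULong hi = ((a >> 32) * (b >> 32)) << 32;    // high lane
               return hi | lo;
            }
      */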
1386 
1387       // A few remaining SIMD64 ops require helper functions, at least for
1388       // now.
1389       Bool second_is_UInt = False;
1390       HWord fn = 0;
1391       switch (e->Iex.Binop.op) {
1392          case Iop_CatOddLanes16x4:
1393             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1394          case Iop_CatEvenLanes16x4:
1395             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1396          case Iop_PermOrZero8x8:
1397             fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1398 
1399          case Iop_QNarrowBin32Sto16Sx4:
1400             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1401          case Iop_QNarrowBin16Sto8Sx8:
1402             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1403          case Iop_QNarrowBin16Sto8Ux8:
1404             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1405 
1406          case Iop_NarrowBin16to8x8:
1407             fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1408          case Iop_NarrowBin32to16x4:
1409             fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1410 
1411          case Iop_SarN8x8:
1412             fn = (HWord)h_generic_calc_SarN8x8;
1413             second_is_UInt = True;
1414             break;
1415 
1416          default:
1417             fn = (HWord)0; break;
1418       }
1419       if (fn != (HWord)0) {
1420          /* Note: the following assumes all helpers are of signature
1421                ULong fn ( ULong, ULong ), and they are
1422             not marked as regparm functions.
1423          */
1424          HReg dst  = newVRegI(env);
1425          HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1426          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1427          if (second_is_UInt)
1428             addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1429          addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1430          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1431          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1432                                         mk_RetLoc_simple(RLPri_Int) ));
1433          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1434          return dst;
1435       }
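      /* The moves above simply follow the SysV AMD64 calling convention:
         first argument in %rdi, second in %rsi, integer result in %rax.
         In other words the helpers are assumed (as per the note above) to
         look like:

            extern ULong helper ( ULong argL, ULong argR );  // illustrative name
      */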
1436 
1437       break;
1438    }
1439 
1440    /* --------- UNARY OP --------- */
1441    case Iex_Unop: {
1442 
1443       /* 1Uto8(64to1(expr64)) */
1444       {
1445          DEFINE_PATTERN( p_1Uto8_64to1,
1446                          unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1447          if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1448             const IRExpr* expr64 = mi.bindee[0];
1449             HReg    dst    = newVRegI(env);
1450             HReg    src    = iselIntExpr_R(env, expr64);
1451             addInstr(env, mk_iMOVsd_RR(src,dst) );
1452             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1453                                             AMD64RMI_Imm(1), dst));
1454             return dst;
1455          }
1456       }
1457 
1458       /* 8Uto64(LDle(expr64)) */
1459       {
1460          DEFINE_PATTERN(p_LDle8_then_8Uto64,
1461                         unop(Iop_8Uto64,
1462                              IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1463          if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1464             HReg dst = newVRegI(env);
1465             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1466             addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1467             return dst;
1468          }
1469       }
1470 
1471       /* 16Uto64(LDle(expr64)) */
1472       {
1473          DEFINE_PATTERN(p_LDle16_then_16Uto64,
1474                         unop(Iop_16Uto64,
1475                              IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1476          if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1477             HReg dst = newVRegI(env);
1478             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1479             addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1480             return dst;
1481          }
1482       }
1483 
1484       /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1485          Use 32 bit arithmetic and let the default zero-extend rule
1486          do the 32Uto64 for free. */
1487       if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1488          IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1489          IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1490          IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1491          AMD64AluOp aluOp = Aalu_INVALID;
1492          switch (opi) {
1493             case Iop_Add32: aluOp = Aalu_ADD; break;
1494             case Iop_Sub32: aluOp = Aalu_SUB; break;
1495             case Iop_And32: aluOp = Aalu_AND; break;
1496             case Iop_Or32:  aluOp = Aalu_OR;  break;
1497             case Iop_Xor32: aluOp = Aalu_XOR; break;
1498             default: break;
1499          }
1500          if (aluOp != Aalu_INVALID) {
1501             /* For commutative ops we assume any literal values are on
1502                the second operand. */
1503             HReg dst      = newVRegI(env);
1504             HReg reg      = iselIntExpr_R(env, argL);
1505             AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1506             addInstr(env, mk_iMOVsd_RR(reg,dst));
1507             addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1508             return dst;
1509          }
1510          /* just fall through to normal handling for Iop_32Uto64 */
1511       }
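      /* Example of the case handled above (illustrative): 32Uto64(Add32(x,y))
         is lowered to a single 32-bit "addl", relying on the hardware rule
         that a 32-bit result written to a register clears its upper 32 bits
         -- in C terms, roughly:

            ULong r = (ULong)( (UInt)x + (UInt)y );
      */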
1512 
1513       /* Fallback cases */
1514       switch (e->Iex.Unop.op) {
1515          case Iop_32Uto64:
1516          case Iop_32Sto64: {
1517             HReg dst = newVRegI(env);
1518             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1519             addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1520                                             src, dst) );
1521             return dst;
1522          }
1523          case Iop_128HIto64: {
1524             HReg rHi, rLo;
1525             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1526             return rHi; /* and abandon rLo */
1527          }
1528          case Iop_128to64: {
1529             HReg rHi, rLo;
1530             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1531             return rLo; /* and abandon rHi */
1532          }
1533          case Iop_8Uto16:
1534          case Iop_8Uto32:
1535          case Iop_8Uto64:
1536          case Iop_16Uto64:
1537          case Iop_16Uto32: {
1538             HReg dst     = newVRegI(env);
1539             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1540             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1541                                    || e->Iex.Unop.op==Iop_16Uto64 );
1542             UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
1543             addInstr(env, mk_iMOVsd_RR(src,dst) );
1544             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1545                                             AMD64RMI_Imm(mask), dst));
1546             return dst;
1547          }
1548          case Iop_8Sto16:
1549          case Iop_8Sto64:
1550          case Iop_8Sto32:
1551          case Iop_16Sto32:
1552          case Iop_16Sto64: {
1553             HReg dst     = newVRegI(env);
1554             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1555             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1556                                    || e->Iex.Unop.op==Iop_16Sto64 );
1557             UInt amt     = srcIs16 ? 48 : 56;
1558             addInstr(env, mk_iMOVsd_RR(src,dst) );
1559             addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1560             addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1561             return dst;
1562          }
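         /* The SHL/SAR pair above is the usual shift-based sign extension;
            for the 8-bit sources it computes, roughly (illustrative only):

               Long r = ((Long)(x << 56)) >> 56;   // arithmetic shift right
         */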
1563          case Iop_Not8:
1564          case Iop_Not16:
1565          case Iop_Not32:
1566          case Iop_Not64: {
1567             HReg dst = newVRegI(env);
1568             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1569             addInstr(env, mk_iMOVsd_RR(src,dst) );
1570             addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1571             return dst;
1572          }
1573          case Iop_16HIto8:
1574          case Iop_32HIto16:
1575          case Iop_64HIto32: {
1576             HReg dst  = newVRegI(env);
1577             HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1578             Int shift = 0;
1579             switch (e->Iex.Unop.op) {
1580                case Iop_16HIto8:  shift = 8;  break;
1581                case Iop_32HIto16: shift = 16; break;
1582                case Iop_64HIto32: shift = 32; break;
1583                default: vassert(0);
1584             }
1585             addInstr(env, mk_iMOVsd_RR(src,dst) );
1586             addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1587             return dst;
1588          }
1589          case Iop_1Uto64:
1590          case Iop_1Uto32:
1591          case Iop_1Uto8: {
1592             HReg dst           = newVRegI(env);
1593             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1594             addInstr(env, AMD64Instr_Set64(cond,dst));
1595             return dst;
1596          }
1597          case Iop_1Sto8:
1598          case Iop_1Sto16:
1599          case Iop_1Sto32:
1600          case Iop_1Sto64: {
1601             /* could do better than this, but for now ... */
1602             HReg dst           = newVRegI(env);
1603             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1604             addInstr(env, AMD64Instr_Set64(cond,dst));
1605             addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1606             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1607             return dst;
1608          }
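         /* That is, the case above materialises the condition as 0 or 1 and
            then smears that bit across the register (illustrative only):

               Long r = ((Long)(bit << 63)) >> 63;   // 0 or all-ones
         */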
1609          case Iop_Ctz64: {
1610             /* Count trailing zeroes, implemented by amd64 'bsfq' */
1611             HReg dst = newVRegI(env);
1612             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1613             addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1614             return dst;
1615          }
1616          case Iop_Clz64: {
1617             /* Count leading zeroes.  Do 'bsrq' to establish the index
1618                of the highest set bit, and subtract that value from
1619                63. */
1620             HReg tmp = newVRegI(env);
1621             HReg dst = newVRegI(env);
1622             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1623             addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1624             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1625                                             AMD64RMI_Imm(63), dst));
1626             addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1627                                             AMD64RMI_Reg(tmp), dst));
1628             return dst;
1629          }
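         /* In other words, the sequence above computes
            clz64(x) = 63 - bsr(x) for nonzero x, where bsr gives the index
            of the highest set bit.  A reference loop (illustrative; like
            the bsrq route it assumes x != 0):

               static UInt clz64_ref ( ULong x )
               {
                  UInt n = 0;
                  while (!(x & (1ULL << 63))) { x <<= 1; n++; }
                  return n;
               }
         */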
1630 
1631          case Iop_CmpwNEZ64: {
1632             HReg dst = newVRegI(env);
1633             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1634             addInstr(env, mk_iMOVsd_RR(src,dst));
1635             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1636             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1637                                             AMD64RMI_Reg(src), dst));
1638             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1639             return dst;
1640          }
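         /* The NEG/OR/SAR sequence above computes, in effect (illustrative):

               Long r = ((Long)(x | -x)) >> 63;   // 0 if x == 0, else all-ones

            since x | -x has its top bit set exactly when x is nonzero. */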
1641 
1642          case Iop_CmpwNEZ32: {
1643             HReg src = newVRegI(env);
1644             HReg dst = newVRegI(env);
1645             HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1646             addInstr(env, mk_iMOVsd_RR(pre,src));
1647             addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1648             addInstr(env, mk_iMOVsd_RR(src,dst));
1649             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1650             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1651                                             AMD64RMI_Reg(src), dst));
1652             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1653             return dst;
1654          }
1655 
1656          case Iop_Left8:
1657          case Iop_Left16:
1658          case Iop_Left32:
1659          case Iop_Left64: {
1660             HReg dst = newVRegI(env);
1661             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1662             addInstr(env, mk_iMOVsd_RR(src, dst));
1663             addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1664             addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1665             return dst;
1666          }
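         /* Iop_LeftN computes x | -x, which is exactly the NEG-then-OR
            above; the single 64-bit sequence serves all four widths,
            presumably because only the low N bits of an N-bit value are
            significant afterwards.  Illustrative sketch:

               static ULong left_ref ( ULong x ) { return x | (0ULL - x); }
         */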
1667 
1668          case Iop_V128to32: {
1669             HReg        dst     = newVRegI(env);
1670             HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
1671             AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1672             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1673             addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1674             return dst;
1675          }
1676 
1677          /* V128{HI}to64 */
1678          case Iop_V128to64: {
1679             HReg dst = newVRegI(env);
1680             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1681             addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1682             return dst;
1683          }
1684          case Iop_V128HIto64: {
1685             HReg dst  = newVRegI(env);
1686             HReg vec  = iselVecExpr(env, e->Iex.Unop.arg);
1687             HReg vec2 = newVRegV(env);
1688             addInstr(env, mk_vMOVsd_RR(vec, vec2));
1689             addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1690             addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1691             return dst;
1692          }
1693 
1694          /* V256to64_{3,2,1,0} */
1695          case Iop_V256to64_0: case Iop_V256to64_1:
1696          case Iop_V256to64_2: case Iop_V256to64_3: {
1697             HReg vHi, vLo, vec;
1698             iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1699             /* Do the first part of the selection by deciding which of
1700                the 128 bit registers to look at, and second part using
1701                the same scheme as for V128{HI}to64 above. */
1702             Bool low64of128 = True;
1703             switch (e->Iex.Unop.op) {
1704                case Iop_V256to64_0: vec = vLo; low64of128 = True;  break;
1705                case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1706                case Iop_V256to64_2: vec = vHi; low64of128 = True;  break;
1707                case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1708                default: vassert(0);
1709             }
1710             HReg dst = newVRegI(env);
1711             if (low64of128) {
1712                addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1713             } else {
1714                HReg vec2 = newVRegV(env);
1715                addInstr(env, mk_vMOVsd_RR(vec, vec2));
1716                addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1717                addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1718             }
1719             return dst;
1720          }
1721 
1722          /* ReinterpF64asI64(e) */
1723          /* Given an IEEE754 double, produce an I64 with the same bit
1724             pattern. */
1725          case Iop_ReinterpF64asI64: {
1726             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1727             HReg        dst    = newVRegI(env);
1728             HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
1729             /* paranoia */
1730             set_SSE_rounding_default(env);
1731             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1732             addInstr(env, AMD64Instr_Alu64R(
1733                              Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1734             return dst;
1735          }
1736 
1737          /* ReinterpF32asI32(e) */
1738          /* Given an IEEE754 single, produce an I64 with the same bit
1739             pattern in the lower half. */
1740          case Iop_ReinterpF32asI32: {
1741             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1742             HReg        dst    = newVRegI(env);
1743             HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
1744             /* paranoia */
1745             set_SSE_rounding_default(env);
1746             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1747             addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1748             return dst;
1749          }
1750 
1751          case Iop_16to8:
1752          case Iop_32to8:
1753          case Iop_64to8:
1754          case Iop_32to16:
1755          case Iop_64to16:
1756          case Iop_64to32:
1757             /* These are no-ops. */
1758             return iselIntExpr_R(env, e->Iex.Unop.arg);
1759 
1760          case Iop_GetMSBs8x8: {
1761             /* Note: the following assumes the helper is of
1762                signature
1763                   UInt fn ( ULong ), and is not a regparm fn.
1764             */
1765             HReg dst = newVRegI(env);
1766             HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1767             HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1768             addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1769             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1770                                            1, mk_RetLoc_simple(RLPri_Int) ));
1771             /* MovxLQ is not exactly the right thing here.  We just
1772                need to get the bottom 8 bits of RAX into dst, and zero
1773                out everything else.  Assuming that the helper returns
1774                a UInt with the top 24 bits zeroed out, it'll do,
1775                though. */
1776             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1777             return dst;
1778          }
1779 
1780          case Iop_GetMSBs8x16: {
1781             /* Note: the following assumes the helper is of signature
1782                   UInt fn ( ULong w64hi, ULong w64Lo ),
1783                and is not a regparm fn. */
1784             HReg dst = newVRegI(env);
1785             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1786             HReg rsp = hregAMD64_RSP();
1787             HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1788             AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
1789             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1790             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1791                                              16, vec, m16_rsp));
1792             /* hi 64 bits into RDI -- the first arg */
1793             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1794                                              AMD64RMI_Mem(m8_rsp),
1795                                              hregAMD64_RDI() )); /* 1st arg */
1796             /* lo 64 bits into RSI -- the 2nd arg */
1797             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1798                                              AMD64RMI_Mem(m16_rsp),
1799                                              hregAMD64_RSI() )); /* 2nd arg */
1800             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1801                                            2, mk_RetLoc_simple(RLPri_Int) ));
1802             /* MovxLQ is not exactly the right thing here.  We just
1803                need to get the bottom 16 bits of RAX into dst, and zero
1804                out everything else.  Assuming that the helper returns
1805                a UInt with the top 16 bits zeroed out, it'll do,
1806                though. */
1807             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1808             return dst;
1809          }
1810 
1811          default:
1812             break;
1813       }
1814 
1815       /* Deal with unary 64-bit SIMD ops. */
1816       HWord fn = 0;
1817       switch (e->Iex.Unop.op) {
1818          case Iop_CmpNEZ32x2:
1819             fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1820          case Iop_CmpNEZ16x4:
1821             fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1822          case Iop_CmpNEZ8x8:
1823             fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1824          default:
1825             fn = (HWord)0; break;
1826       }
1827       if (fn != (HWord)0) {
1828          /* Note: the following assumes all helpers are of
1829             signature
1830                ULong fn ( ULong ), and they are
1831             not marked as regparm functions.
1832          */
1833          HReg dst = newVRegI(env);
1834          HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1835          addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1836          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1837                                         mk_RetLoc_simple(RLPri_Int) ));
1838          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1839          return dst;
1840       }
1841 
1842       break;
1843    }
1844 
1845    /* --------- GET --------- */
1846    case Iex_Get: {
1847       if (ty == Ity_I64) {
1848          HReg dst = newVRegI(env);
1849          addInstr(env, AMD64Instr_Alu64R(
1850                           Aalu_MOV,
1851                           AMD64RMI_Mem(
1852                              AMD64AMode_IR(e->Iex.Get.offset,
1853                                            hregAMD64_RBP())),
1854                           dst));
1855          return dst;
1856       }
1857       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1858          HReg dst = newVRegI(env);
1859          addInstr(env, AMD64Instr_LoadEX(
1860                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1861                           False,
1862                           AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1863                           dst));
1864          return dst;
1865       }
1866       break;
1867    }
1868 
1869    case Iex_GetI: {
1870       AMD64AMode* am
1871          = genGuestArrayOffset(
1872               env, e->Iex.GetI.descr,
1873                    e->Iex.GetI.ix, e->Iex.GetI.bias );
1874       HReg dst = newVRegI(env);
1875       if (ty == Ity_I8) {
1876          addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1877          return dst;
1878       }
1879       if (ty == Ity_I64) {
1880          addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1881          return dst;
1882       }
1883       break;
1884    }
1885 
1886    /* --------- CCALL --------- */
1887    case Iex_CCall: {
1888       HReg    dst = newVRegI(env);
1889       vassert(ty == e->Iex.CCall.retty);
1890 
1891       /* be very restrictive for now.  Only 64-bit ints allowed for
1892          args, and 64 or 32 bits for return type. */
1893       if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1894          goto irreducible;
1895 
1896       /* Marshal args, do the call. */
1897       UInt   addToSp = 0;
1898       RetLoc rloc    = mk_RetLoc_INVALID();
1899       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1900                     e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1901       vassert(is_sane_RetLoc(rloc));
1902       vassert(rloc.pri == RLPri_Int);
1903       vassert(addToSp == 0);
1904 
1905       /* Move to dst, and zero out the top 32 bits if the result type is
1906          Ity_I32.  Probably overkill, but still .. */
1907       if (e->Iex.CCall.retty == Ity_I64)
1908          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1909       else
1910          addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1911 
1912       return dst;
1913    }
1914 
1915    /* --------- LITERAL --------- */
1916    /* 64/32/16/8-bit literals */
1917    case Iex_Const:
1918       if (ty == Ity_I64) {
1919          HReg r = newVRegI(env);
1920          addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1921          return r;
1922       } else {
1923          AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1924          HReg      r   = newVRegI(env);
1925          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1926          return r;
1927       }
1928 
1929    /* --------- MULTIPLEX --------- */
1930    case Iex_ITE: { // VFD
1931       if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1932           && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1933          HReg     r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1934          HReg     r0  = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1935          HReg     dst = newVRegI(env);
1936          addInstr(env, mk_iMOVsd_RR(r1,dst));
1937          AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1938          addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1939          return dst;
1940       }
1941       break;
1942    }
1943 
1944    /* --------- TERNARY OP --------- */
1945    case Iex_Triop: {
1946       IRTriop *triop = e->Iex.Triop.details;
1947       /* C3210 flags following FPU partial remainder (fprem), both
1948          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1949       if (triop->op == Iop_PRemC3210F64
1950           || triop->op == Iop_PRem1C3210F64) {
1951          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1952          HReg        arg1   = iselDblExpr(env, triop->arg2);
1953          HReg        arg2   = iselDblExpr(env, triop->arg3);
1954          HReg        dst    = newVRegI(env);
1955          addInstr(env, AMD64Instr_A87Free(2));
1956 
1957          /* one arg -> top of x87 stack */
1958          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1959          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1960 
1961          /* other arg -> top of x87 stack */
1962          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1963          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1964 
1965          switch (triop->op) {
1966             case Iop_PRemC3210F64:
1967                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1968                break;
1969             case Iop_PRem1C3210F64:
1970                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1971                break;
1972             default:
1973                vassert(0);
1974          }
1975          /* Ignore the result, and instead make off with the FPU's
1976             C3210 flags (in the status word). */
1977          addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1978          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1979          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1980          return dst;
1981       }
1982       break;
1983    }
1984 
1985    default:
1986       break;
1987    } /* switch (e->tag) */
1988 
1989    /* We get here if no pattern matched. */
1990   irreducible:
1991    ppIRExpr(e);
1992    vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1993 }
1994 
1995 
1996 /*---------------------------------------------------------*/
1997 /*--- ISEL: Integer expression auxiliaries              ---*/
1998 /*---------------------------------------------------------*/
1999 
2000 /* --------------------- AMODEs --------------------- */
2001 
2002 /* Return an AMode which computes the value of the specified
2003    expression, possibly also adding insns to the code list as a
2004    result.  The expression may only be a 64-bit one.
2005 */
2006 
2007 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2008 {
2009    AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2010    vassert(sane_AMode(am));
2011    return am;
2012 }
2013 
2014 /* DO NOT CALL THIS DIRECTLY ! */
2015 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2016 {
2017    MatchInfo mi;
2018    DECLARE_PATTERN(p_complex);
2019    IRType ty = typeOfIRExpr(env->type_env,e);
2020    vassert(ty == Ity_I64);
2021 
2022    /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2023    /*              bind0        bind1  bind2   bind3   */
2024    DEFINE_PATTERN(p_complex,
2025       binop( Iop_Add64,
2026              binop( Iop_Add64,
2027                     bind(0),
2028                     binop(Iop_Shl64, bind(1), bind(2))
2029                   ),
2030              bind(3)
2031            )
2032    );
2033    if (matchIRExpr(&mi, p_complex, e)) {
2034       const IRExpr* expr1  = mi.bindee[0];
2035       const IRExpr* expr2  = mi.bindee[1];
2036       const IRExpr* imm8   = mi.bindee[2];
2037       const IRExpr* simm32 = mi.bindee[3];
2038       if (imm8->tag == Iex_Const
2039           && imm8->Iex.Const.con->tag == Ico_U8
2040           && imm8->Iex.Const.con->Ico.U8 < 4
2041           /* imm8 is OK, now check simm32 */
2042           && simm32->tag == Iex_Const
2043           && simm32->Iex.Const.con->tag == Ico_U64
2044           && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2045          UInt shift = imm8->Iex.Const.con->Ico.U8;
2046          UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2047          HReg r1 = iselIntExpr_R(env, expr1);
2048          HReg r2 = iselIntExpr_R(env, expr2);
2049          vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2050          return AMD64AMode_IRRS(offset, r1, r2, shift);
2051       }
2052    }
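   /* When the pattern above matches, the whole tree collapses to a single
      amd64 addressing mode, e.g. (illustrative)

         Add64(Add64(r1, Shl64(r2, 3)), 40)   -->   40(r1,r2,8)

      which is what AMD64AMode_IRRS(offset, r1, r2, shift) denotes. */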
2053 
2054    /* Add64(expr1, Shl64(expr2, imm)) */
2055    if (e->tag == Iex_Binop
2056        && e->Iex.Binop.op == Iop_Add64
2057        && e->Iex.Binop.arg2->tag == Iex_Binop
2058        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2059        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2060        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2061       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2062       if (shift == 1 || shift == 2 || shift == 3) {
2063          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2064          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2065          return AMD64AMode_IRRS(0, r1, r2, shift);
2066       }
2067    }
2068 
2069    /* Add64(expr,i) */
2070    if (e->tag == Iex_Binop
2071        && e->Iex.Binop.op == Iop_Add64
2072        && e->Iex.Binop.arg2->tag == Iex_Const
2073        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2074        && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2075       HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2076       return AMD64AMode_IR(
2077                 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2078                 r1
2079              );
2080    }
2081 
2082    /* Doesn't match anything in particular.  Generate it into
2083       a register and use that. */
2084    {
2085       HReg r1 = iselIntExpr_R(env, e);
2086       return AMD64AMode_IR(0, r1);
2087    }
2088 }
2089 
2090 
2091 /* --------------------- RMIs --------------------- */
2092 
2093 /* Similarly, calculate an expression into an AMD64RMI operand.  As with
2094    iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
2095 
2096 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2097 {
2098    AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2099    /* sanity checks ... */
2100    switch (rmi->tag) {
2101       case Armi_Imm:
2102          return rmi;
2103       case Armi_Reg:
2104          vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2105          vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2106          return rmi;
2107       case Armi_Mem:
2108          vassert(sane_AMode(rmi->Armi.Mem.am));
2109          return rmi;
2110       default:
2111          vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2112    }
2113 }
2114 
2115 /* DO NOT CALL THIS DIRECTLY ! */
2116 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2117 {
2118    IRType ty = typeOfIRExpr(env->type_env,e);
2119    vassert(ty == Ity_I64 || ty == Ity_I32
2120            || ty == Ity_I16 || ty == Ity_I8);
2121 
2122    /* special case: immediate 64/32/16/8 */
2123    if (e->tag == Iex_Const) {
2124       switch (e->Iex.Const.con->tag) {
2125         case Ico_U64:
2126            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2127               return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2128            }
2129            break;
2130          case Ico_U32:
2131             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2132          case Ico_U16:
2133             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2134          case Ico_U8:
2135             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2136          default:
2137             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2138       }
2139    }
2140 
2141    /* special case: 64-bit GET */
2142    if (e->tag == Iex_Get && ty == Ity_I64) {
2143       return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2144                                         hregAMD64_RBP()));
2145    }
2146 
2147    /* special case: 64-bit load from memory */
2148    if (e->tag == Iex_Load && ty == Ity_I64
2149        && e->Iex.Load.end == Iend_LE) {
2150       AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2151       return AMD64RMI_Mem(am);
2152    }
2153 
2154    /* default case: calculate into a register and return that */
2155    {
2156       HReg r = iselIntExpr_R ( env, e );
2157       return AMD64RMI_Reg(r);
2158    }
2159 }
2160 
2161 
2162 /* --------------------- RIs --------------------- */
2163 
2164 /* Calculate an expression into an AMD64RI operand.  As with
2165    iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2166    bits. */
2167 
2168 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2169 {
2170    AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2171    /* sanity checks ... */
2172    switch (ri->tag) {
2173       case Ari_Imm:
2174          return ri;
2175       case Ari_Reg:
2176          vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2177          vassert(hregIsVirtual(ri->Ari.Reg.reg));
2178          return ri;
2179       default:
2180          vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2181    }
2182 }
2183 
2184 /* DO NOT CALL THIS DIRECTLY ! */
2185 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2186 {
2187    IRType ty = typeOfIRExpr(env->type_env,e);
2188    vassert(ty == Ity_I64 || ty == Ity_I32
2189            || ty == Ity_I16 || ty == Ity_I8);
2190 
2191    /* special case: immediate */
2192    if (e->tag == Iex_Const) {
2193       switch (e->Iex.Const.con->tag) {
2194         case Ico_U64:
2195            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2196               return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2197            }
2198            break;
2199          case Ico_U32:
2200             return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2201          case Ico_U16:
2202             return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2203          case Ico_U8:
2204             return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2205          default:
2206             vpanic("iselIntExpr_RI.Iex_Const(amd64)");
2207       }
2208    }
2209 
2210    /* default case: calculate into a register and return that */
2211    {
2212       HReg r = iselIntExpr_R ( env, e );
2213       return AMD64RI_Reg(r);
2214    }
2215 }
2216 
2217 
2218 /* --------------------- RMs --------------------- */
2219 
2220 /* Similarly, calculate an expression into an AMD64RM operand.  As
2221    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2222    bits.  */
2223 
2224 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2225 {
2226    AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2227    /* sanity checks ... */
2228    switch (rm->tag) {
2229       case Arm_Reg:
2230          vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2231          vassert(hregIsVirtual(rm->Arm.Reg.reg));
2232          return rm;
2233       case Arm_Mem:
2234          vassert(sane_AMode(rm->Arm.Mem.am));
2235          return rm;
2236       default:
2237          vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2238    }
2239 }
2240 
2241 /* DO NOT CALL THIS DIRECTLY ! */
2242 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2243 {
2244    IRType ty = typeOfIRExpr(env->type_env,e);
2245    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2246 
2247    /* special case: 64-bit GET */
2248    if (e->tag == Iex_Get && ty == Ity_I64) {
2249       return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2250                                        hregAMD64_RBP()));
2251    }
2252 
2253    /* special case: load from memory -- not handled here; the default
2253       case below covers it. */
2254 
2255    /* default case: calculate into a register and return that */
2256    {
2257       HReg r = iselIntExpr_R ( env, e );
2258       return AMD64RM_Reg(r);
2259    }
2260 }
2261 
2262 
2263 /* --------------------- CONDCODE --------------------- */
2264 
2265 /* Generate code to evaluate a bit-typed expression, returning the
2266    condition code which would be set if the expression had notionally
2267    returned 1. */
2268 
2269 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2270 {
2271    /* Uh, there's nothing we can sanity check here, unfortunately. */
2272    return iselCondCode_wrk(env,e);
2273 }
2274 
2275 /* DO NOT CALL THIS DIRECTLY ! */
2276 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2277 {
2278    vassert(e);
2279    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2280 
2281    /* var */
2282    if (e->tag == Iex_RdTmp) {
2283       HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2284       HReg dst = newVRegI(env);
2285       addInstr(env, mk_iMOVsd_RR(r64,dst));
2286       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2287       return Acc_NZ;
2288    }
2289 
2290    /* Constant 1:Bit */
2291    if (e->tag == Iex_Const) {
2292       HReg r;
2293       vassert(e->Iex.Const.con->tag == Ico_U1);
2294       vassert(e->Iex.Const.con->Ico.U1 == True
2295               || e->Iex.Const.con->Ico.U1 == False);
2296       r = newVRegI(env);
2297       addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2298       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2299       return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2300    }
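   /* Note the trick above: "xorq %r,%r" always sets ZF, so Acc_Z is an
      always-true condition and Acc_NZ an always-false one, which is all
      that is needed for a constant 1-bit value. */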
2301 
2302    /* Not1(...) */
2303    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2304       /* Generate code for the arg, and negate the test condition */
2305       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2306    }
2307 
2308    /* --- patterns rooted at: 64to1 --- */
2309 
2310    /* 64to1 */
2311    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2312       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2313       addInstr(env, AMD64Instr_Test64(1,reg));
2314       return Acc_NZ;
2315    }
2316 
2317    /* --- patterns rooted at: 32to1 --- */
2318 
2319    /* 32to1 */
2320    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2321       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2322       addInstr(env, AMD64Instr_Test64(1,reg));
2323       return Acc_NZ;
2324    }
2325 
2326    /* --- patterns rooted at: CmpNEZ8 --- */
2327 
2328    /* CmpNEZ8(x) */
2329    if (e->tag == Iex_Unop
2330        && e->Iex.Unop.op == Iop_CmpNEZ8) {
2331       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2332       addInstr(env, AMD64Instr_Test64(0xFF,r));
2333       return Acc_NZ;
2334    }
2335 
2336    /* --- patterns rooted at: CmpNEZ16 --- */
2337 
2338    /* CmpNEZ16(x) */
2339    if (e->tag == Iex_Unop
2340        && e->Iex.Unop.op == Iop_CmpNEZ16) {
2341       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2342       addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2343       return Acc_NZ;
2344    }
2345 
2346    /* --- patterns rooted at: CmpNEZ32 --- */
2347 
2348    if (e->tag == Iex_Unop
2349        && e->Iex.Unop.op == Iop_CmpNEZ32) {
2350       IRExpr* arg = e->Iex.Unop.arg;
2351       if (arg->tag == Iex_Binop
2352           && (arg->Iex.Binop.op == Iop_Or32
2353               || arg->Iex.Binop.op == Iop_And32)) {
2354          /* CmpNEZ32(Or32(x,y)) */
2355          /* CmpNEZ32(And32(x,y)) */
2356          HReg      r0   = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2357          AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2358          HReg      tmp  = newVRegI(env);
2359          addInstr(env, mk_iMOVsd_RR(r0, tmp));
2360          addInstr(env, AMD64Instr_Alu32R(
2361                           arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2362                           rmi1, tmp));
2363          return Acc_NZ;
2364       }
2365       /* CmpNEZ32(x) */
2366       HReg      r1   = iselIntExpr_R(env, arg);
2367       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2368       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2369       return Acc_NZ;
2370    }
2371 
2372    /* --- patterns rooted at: CmpNEZ64 --- */
2373 
2374    if (e->tag == Iex_Unop
2375        && e->Iex.Unop.op == Iop_CmpNEZ64) {
2376       IRExpr* arg = e->Iex.Unop.arg;
2377       if (arg->tag == Iex_Binop
2378           && (arg->Iex.Binop.op == Iop_Or64
2379               || arg->Iex.Binop.op == Iop_And64)) {
2380          /* CmpNEZ64(Or64(x,y)) */
2381          /* CmpNEZ64(And64(x,y)) */
2382          HReg      r0   = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2383          AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2384          HReg      tmp  = newVRegI(env);
2385          addInstr(env, mk_iMOVsd_RR(r0, tmp));
2386          addInstr(env, AMD64Instr_Alu64R(
2387                           arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2388                           rmi1, tmp));
2389          return Acc_NZ;
2390       }
2391       /* CmpNEZ64(x) */
2392       HReg      r1   = iselIntExpr_R(env, arg);
2393       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2394       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2395       return Acc_NZ;
2396    }
2397 
2398    /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2399 
2400    /* CmpEQ8 / CmpNE8 */
2401    if (e->tag == Iex_Binop
2402        && (e->Iex.Binop.op == Iop_CmpEQ8
2403            || e->Iex.Binop.op == Iop_CmpNE8
2404            || e->Iex.Binop.op == Iop_CasCmpEQ8
2405            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2406       if (isZeroU8(e->Iex.Binop.arg2)) {
2407          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2408          addInstr(env, AMD64Instr_Test64(0xFF,r1));
2409          switch (e->Iex.Binop.op) {
2410             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2411             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2412             default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2413          }
2414       } else {
2415          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2416          AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2417          HReg      r    = newVRegI(env);
2418          addInstr(env, mk_iMOVsd_RR(r1,r));
2419          addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2420          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2421          switch (e->Iex.Binop.op) {
2422             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2423             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2424             default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2425          }
2426       }
2427    }
2428 
2429    /* CmpEQ16 / CmpNE16 */
2430    if (e->tag == Iex_Binop
2431        && (e->Iex.Binop.op == Iop_CmpEQ16
2432            || e->Iex.Binop.op == Iop_CmpNE16
2433            || e->Iex.Binop.op == Iop_CasCmpEQ16
2434            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2435       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2436       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2437       HReg      r    = newVRegI(env);
2438       addInstr(env, mk_iMOVsd_RR(r1,r));
2439       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2440       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2441       switch (e->Iex.Binop.op) {
2442          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2443          case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2444          default: vpanic("iselCondCode(amd64): CmpXX16");
2445       }
2446    }
2447 
2448    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2449       Saves a "movq %rax, %tmp" compared to the default route. */
2450    if (e->tag == Iex_Binop
2451        && e->Iex.Binop.op == Iop_CmpNE64
2452        && e->Iex.Binop.arg1->tag == Iex_CCall
2453        && e->Iex.Binop.arg2->tag == Iex_Const) {
2454       IRExpr* cal = e->Iex.Binop.arg1;
2455       IRExpr* con = e->Iex.Binop.arg2;
2456       HReg    tmp = newVRegI(env);
2457       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2458       vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2459       vassert(con->Iex.Const.con->tag == Ico_U64);
2460       /* Marshal args, do the call. */
2461       UInt   addToSp = 0;
2462       RetLoc rloc    = mk_RetLoc_INVALID();
2463       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2464                     cal->Iex.CCall.cee,
2465                     cal->Iex.CCall.retty, cal->Iex.CCall.args );
2466       vassert(is_sane_RetLoc(rloc));
2467       vassert(rloc.pri == RLPri_Int);
2468       vassert(addToSp == 0);
2469       /* */
2470       addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2471       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2472                                       AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2473       return Acc_NZ;
2474    }
2475 
2476    /* Cmp*64*(x,y) */
2477    if (e->tag == Iex_Binop
2478        && (e->Iex.Binop.op == Iop_CmpEQ64
2479            || e->Iex.Binop.op == Iop_CmpNE64
2480            || e->Iex.Binop.op == Iop_CmpLT64S
2481            || e->Iex.Binop.op == Iop_CmpLT64U
2482            || e->Iex.Binop.op == Iop_CmpLE64S
2483            || e->Iex.Binop.op == Iop_CmpLE64U
2484            || e->Iex.Binop.op == Iop_CasCmpEQ64
2485            || e->Iex.Binop.op == Iop_CasCmpNE64
2486            || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2487       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2488       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2489       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2490       switch (e->Iex.Binop.op) {
2491          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2492          case Iop_CmpNE64:
2493          case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2494          case Iop_CmpLT64S: return Acc_L;
2495          case Iop_CmpLT64U: return Acc_B;
2496          case Iop_CmpLE64S: return Acc_LE;
2497          case Iop_CmpLE64U: return Acc_BE;
2498          default: vpanic("iselCondCode(amd64): CmpXX64");
2499       }
2500    }
2501 
2502    /* Cmp*32*(x,y) */
2503    if (e->tag == Iex_Binop
2504        && (e->Iex.Binop.op == Iop_CmpEQ32
2505            || e->Iex.Binop.op == Iop_CmpNE32
2506            || e->Iex.Binop.op == Iop_CmpLT32S
2507            || e->Iex.Binop.op == Iop_CmpLT32U
2508            || e->Iex.Binop.op == Iop_CmpLE32S
2509            || e->Iex.Binop.op == Iop_CmpLE32U
2510            || e->Iex.Binop.op == Iop_CasCmpEQ32
2511            || e->Iex.Binop.op == Iop_CasCmpNE32
2512            || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2513       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2514       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2515       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2516       switch (e->Iex.Binop.op) {
2517          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2518          case Iop_CmpNE32:
2519          case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2520          case Iop_CmpLT32S: return Acc_L;
2521          case Iop_CmpLT32U: return Acc_B;
2522          case Iop_CmpLE32S: return Acc_LE;
2523          case Iop_CmpLE32U: return Acc_BE;
2524          default: vpanic("iselCondCode(amd64): CmpXX32");
2525       }
2526    }
2527 
2528    ppIRExpr(e);
2529    vpanic("iselCondCode(amd64)");
2530 }
2531 
2532 
2533 /*---------------------------------------------------------*/
2534 /*--- ISEL: Integer expressions (128 bit)               ---*/
2535 /*---------------------------------------------------------*/
2536 
2537 /* Compute a 128-bit value into a register pair, which is returned as
2538    the first two parameters.  As with iselIntExpr_R, these may be
2539    either real or virtual regs; in any case they must not be changed
2540    by subsequent code emitted by the caller.  */
2541 
2542 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2543                              ISelEnv* env, const IRExpr* e )
2544 {
2545    iselInt128Expr_wrk(rHi, rLo, env, e);
2546 #  if 0
2547    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2548 #  endif
2549    vassert(hregClass(*rHi) == HRcInt64);
2550    vassert(hregIsVirtual(*rHi));
2551    vassert(hregClass(*rLo) == HRcInt64);
2552    vassert(hregIsVirtual(*rLo));
2553 }
2554 
2555 /* DO NOT CALL THIS DIRECTLY ! */
2556 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2557                                  ISelEnv* env, const IRExpr* e )
2558 {
2559    vassert(e);
2560    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2561 
2562    /* read 128-bit IRTemp */
2563    if (e->tag == Iex_RdTmp) {
2564       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2565       return;
2566    }
2567 
2568    /* --------- BINARY ops --------- */
2569    if (e->tag == Iex_Binop) {
2570       switch (e->Iex.Binop.op) {
2571          /* 64 x 64 -> 128 multiply */
2572          case Iop_MullU64:
2573          case Iop_MullS64: {
2574             /* Get one operand into %rax and the other into an R/M.
2575                Ideally we'd make an educated guess about which operand
2576                is better placed where. */
2577             HReg     tLo    = newVRegI(env);
2578             HReg     tHi    = newVRegI(env);
2579             Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
2580             AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2581             HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2582             addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2583             addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2584             /* Result is now in RDX:RAX.  Tell the caller. */
2585             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2586             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2587             *rHi = tHi;
2588             *rLo = tLo;
2589             return;
2590          }
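         /* The widening multiply just above leaves the full 128-bit product
            in RDX:RAX.  A C sketch using the GCC/Clang __int128 extension
            (illustrative only, unsigned case; the signed case is analogous):

               __uint128_t p = (__uint128_t)a * (__uint128_t)b;
               ULong lo = (ULong)p, hi = (ULong)(p >> 64);
         */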
2591 
2592          /* 128 x 64 -> (64(rem),64(div)) division */
2593          case Iop_DivModU128to64:
2594          case Iop_DivModS128to64: {
2595             /* Get the 128-bit operand into rdx:rax, and the other into
2596                any old R/M. */
2597             HReg sHi, sLo;
2598             HReg     tLo     = newVRegI(env);
2599             HReg     tHi     = newVRegI(env);
2600             Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2601             AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2602             iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2603             addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2604             addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2605             addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2606             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2607             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2608             *rHi = tHi;
2609             *rLo = tLo;
2610             return;
2611          }
2612 
2613          /* 64HLto128(e1,e2) */
2614          case Iop_64HLto128:
2615             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2616             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2617             return;
2618 
2619          default:
2620             break;
2621       }
2622    } /* if (e->tag == Iex_Binop) */
2623 
2624    ppIRExpr(e);
2625    vpanic("iselInt128Expr");
2626 }
2627 
2628 
2629 /*---------------------------------------------------------*/
2630 /*--- ISEL: Floating point expressions (32 bit)         ---*/
2631 /*---------------------------------------------------------*/
2632 
2633 /* Nothing interesting here; really just wrappers for
2634    64-bit stuff. */
2635 
2636 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2637 {
2638    HReg r = iselFltExpr_wrk( env, e );
2639 #  if 0
2640    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2641 #  endif
2642    vassert(hregClass(r) == HRcVec128);
2643    vassert(hregIsVirtual(r));
2644    return r;
2645 }
2646 
2647 /* DO NOT CALL THIS DIRECTLY */
2648 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2649 {
2650    IRType ty = typeOfIRExpr(env->type_env,e);
2651    vassert(ty == Ity_F32);
2652 
2653    if (e->tag == Iex_RdTmp) {
2654       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2655    }
2656 
2657    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2658       AMD64AMode* am;
2659       HReg res = newVRegV(env);
2660       vassert(e->Iex.Load.ty == Ity_F32);
2661       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2662       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2663       return res;
2664    }
2665 
2666    if (e->tag == Iex_Binop
2667        && e->Iex.Binop.op == Iop_F64toF32) {
2668       /* Although the result is still held in a standard SSE register,
2669          we need to round it to reflect the loss of accuracy/range
2670          entailed in casting it to a 32-bit float. */
2671       HReg dst = newVRegV(env);
2672       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2673       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2674       addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2675       set_SSE_rounding_default( env );
2676       return dst;
2677    }
2678 
2679    if (e->tag == Iex_Get) {
2680       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2681                                        hregAMD64_RBP() );
2682       HReg res = newVRegV(env);
2683       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2684       return res;
2685    }
2686 
2687    if (e->tag == Iex_Unop
2688        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2689        /* Given an I32, produce an IEEE754 float with the same bit
2690           pattern. */
2691        HReg        dst    = newVRegV(env);
2692        HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
2693        AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2694        addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2695        addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2696        return dst;
2697    }
2698 
2699    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2700       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2701       HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
2702       HReg        dst    = newVRegV(env);
2703 
2704       /* arg now holds the value to be rounded.  The first thing to do
2705          is set the FPU's rounding mode accordingly. */
2706 
2707       /* Set host x87 rounding mode */
2708       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2709 
2710       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2711       addInstr(env, AMD64Instr_A87Free(1));
2712       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2713       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2714       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2715       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2716 
2717       /* Restore default x87 rounding. */
2718       set_FPU_rounding_default( env );
2719 
2720       return dst;
2721    }
2722 
2723    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2724       /* Sigh ... very rough code.  Could do much better. */
2725       /* Get the 128-bit literal 00---0 10---0 into a register
2726          and xor it with the value to be negated. */
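      /* After the two pushes, the 16 bytes at (%rsp) hold
         0x0000000080000000 in the low 8 bytes and zero in the high 8
         bytes, i.e. only bit 31 -- the sign bit of the low F32 lane --
         is set.  XORing that into the value flips the sign of the low
         lane and leaves the other lanes unchanged. */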
2727       HReg r1  = newVRegI(env);
2728       HReg dst = newVRegV(env);
2729       HReg tmp = newVRegV(env);
2730       HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2731       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2732       addInstr(env, mk_vMOVsd_RR(src,tmp));
2733       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2734       addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2735       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2736       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2737       addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2738       add_to_rsp(env, 16);
2739       return dst;
2740    }
2741 
2742    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2743       IRQop *qop = e->Iex.Qop.details;
2744       HReg dst  = newVRegV(env);
2745       HReg argX = iselFltExpr(env, qop->arg2);
2746       HReg argY = iselFltExpr(env, qop->arg3);
2747       HReg argZ = iselFltExpr(env, qop->arg4);
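      /* The helper call below uses a 16-byte stack scratch area laid
         out as: a 4-byte result slot at 0(%rsp) and the three F32
         arguments at 4(%rsp), 8(%rsp) and 12(%rsp).  Pointers to these
         four slots are passed in %rdi, %rsi, %rdx and %rcx, and the
         result is read back from 0(%rsp) after the call. */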
2748       /* XXXROUNDINGFIXME */
2749       /* set roundingmode here */
2750       /* subq $16, %rsp         -- make a space*/
2751       sub_from_rsp(env, 16);
2752       /* Prepare 4 arg regs:
2753          leaq 0(%rsp), %rdi
2754          leaq 4(%rsp), %rsi
2755          leaq 8(%rsp), %rdx
2756          leaq 12(%rsp), %rcx
2757       */
2758       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2759                                      hregAMD64_RDI()));
2760       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2761                                      hregAMD64_RSI()));
2762       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2763                                      hregAMD64_RDX()));
2764       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2765                                      hregAMD64_RCX()));
2766       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2767          movss  %argX, 0(%rsi)
2768          movss  %argY, 0(%rdx)
2769          movss  %argZ, 0(%rcx)
2770          */
2771       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2772                                        AMD64AMode_IR(0, hregAMD64_RSI())));
2773       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2774                                        AMD64AMode_IR(0, hregAMD64_RDX())));
2775       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2776                                        AMD64AMode_IR(0, hregAMD64_RCX())));
2777       /* call the helper */
2778       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2779                                      (ULong)(HWord)h_generic_calc_MAddF32,
2780                                      4, mk_RetLoc_simple(RLPri_None) ));
2781       /* fetch the result back from memory, at 0(%rsp), where the
2782          helper has just written it. */
2783       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2784                                        AMD64AMode_IR(0, hregAMD64_RSP())));
2785       /* and finally, clear the space */
2786       add_to_rsp(env, 16);
2787       return dst;
2788    }
2789 
2790    ppIRExpr(e);
2791    vpanic("iselFltExpr_wrk");
2792 }
2793 
2794 
2795 /*---------------------------------------------------------*/
2796 /*--- ISEL: Floating point expressions (64 bit)         ---*/
2797 /*---------------------------------------------------------*/
2798 
2799 /* Compute a 64-bit floating point value into the lower half of an xmm
2800    register, the identity of which is returned.  As with
2801    iselIntExpr_R, the returned reg will be virtual, and it must not be
2802    changed by subsequent code emitted by the caller.
2803 */
2804 
2805 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2806 
2807     Type                  S (1 bit)   E (11 bits)   F (52 bits)
2808     ----                  ---------   -----------   -----------
2809     signalling NaN        u           2047 (max)    .0uuuuu---u
2810                                                     (with at least
2811                                                      one 1 bit)
2812     quiet NaN             u           2047 (max)    .1uuuuu---u
2813 
2814     negative infinity     1           2047 (max)    .000000---0
2815 
2816     positive infinity     0           2047 (max)    .000000---0
2817 
2818     negative zero         1           0             .000000---0
2819 
2820     positive zero         0           0             .000000---0
2821 */
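/* For example, the F64 constant 1.0 has S=0, E=1023 (the exponent
   bias) and F=0, giving the bit pattern 0x3FF0000000000000.  Constants
   of this kind are materialised below by loading the 64-bit pattern
   into an integer register and bouncing it through the stack into an
   xmm register. */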
2822 
2823 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2824 {
2825    HReg r = iselDblExpr_wrk( env, e );
2826 #  if 0
2827    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2828 #  endif
2829    vassert(hregClass(r) == HRcVec128);
2830    vassert(hregIsVirtual(r));
2831    return r;
2832 }
2833 
2834 /* DO NOT CALL THIS DIRECTLY */
2835 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2836 {
2837    IRType ty = typeOfIRExpr(env->type_env,e);
2838    vassert(e);
2839    vassert(ty == Ity_F64);
2840 
2841    if (e->tag == Iex_RdTmp) {
2842       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2843    }
2844 
2845    if (e->tag == Iex_Const) {
2846       union { ULong u64; Double f64; } u;
2847       HReg res = newVRegV(env);
2848       HReg tmp = newVRegI(env);
2849       vassert(sizeof(u) == 8);
2850       vassert(sizeof(u.u64) == 8);
2851       vassert(sizeof(u.f64) == 8);
2852 
2853       if (e->Iex.Const.con->tag == Ico_F64) {
2854          u.f64 = e->Iex.Const.con->Ico.F64;
2855       }
2856       else if (e->Iex.Const.con->tag == Ico_F64i) {
2857          u.u64 = e->Iex.Const.con->Ico.F64i;
2858       }
2859       else
2860          vpanic("iselDblExpr(amd64): const");
2861 
2862       addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2863       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2864       addInstr(env, AMD64Instr_SseLdSt(
2865                        True/*load*/, 8, res,
2866                        AMD64AMode_IR(0, hregAMD64_RSP())
2867               ));
2868       add_to_rsp(env, 8);
2869       return res;
2870    }
2871 
2872    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2873       AMD64AMode* am;
2874       HReg res = newVRegV(env);
2875       vassert(e->Iex.Load.ty == Ity_F64);
2876       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2877       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2878       return res;
2879    }
2880 
2881    if (e->tag == Iex_Get) {
2882       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2883                                       hregAMD64_RBP() );
2884       HReg res = newVRegV(env);
2885       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2886       return res;
2887    }
2888 
2889    if (e->tag == Iex_GetI) {
2890       AMD64AMode* am
2891          = genGuestArrayOffset(
2892               env, e->Iex.GetI.descr,
2893                    e->Iex.GetI.ix, e->Iex.GetI.bias );
2894       HReg res = newVRegV(env);
2895       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2896       return res;
2897    }
2898 
2899    if (e->tag == Iex_Triop) {
2900       IRTriop *triop = e->Iex.Triop.details;
2901       AMD64SseOp op = Asse_INVALID;
2902       switch (triop->op) {
2903          case Iop_AddF64: op = Asse_ADDF; break;
2904          case Iop_SubF64: op = Asse_SUBF; break;
2905          case Iop_MulF64: op = Asse_MULF; break;
2906          case Iop_DivF64: op = Asse_DIVF; break;
2907          default: break;
2908       }
2909       if (op != Asse_INVALID) {
2910          HReg dst  = newVRegV(env);
2911          HReg argL = iselDblExpr(env, triop->arg2);
2912          HReg argR = iselDblExpr(env, triop->arg3);
2913          addInstr(env, mk_vMOVsd_RR(argL, dst));
2914          /* XXXROUNDINGFIXME */
2915          /* set roundingmode here */
2916          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2917          return dst;
2918       }
2919    }
2920 
2921    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2922       IRQop *qop = e->Iex.Qop.details;
2923       HReg dst  = newVRegV(env);
2924       HReg argX = iselDblExpr(env, qop->arg2);
2925       HReg argY = iselDblExpr(env, qop->arg3);
2926       HReg argZ = iselDblExpr(env, qop->arg4);
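      /* Same scheme as for MAddF32 above, but with 8-byte slots: the
         result goes at 0(%rsp) and the three F64 arguments at 8(%rsp),
         16(%rsp) and 24(%rsp) within the 32 bytes reserved below. */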
2927       /* XXXROUNDINGFIXME */
2928       /* set roundingmode here */
2929       /* subq $32, %rsp         -- make a space*/
2930       sub_from_rsp(env, 32);
2931       /* Prepare 4 arg regs:
2932          leaq 0(%rsp), %rdi
2933          leaq 8(%rsp), %rsi
2934          leaq 16(%rsp), %rdx
2935          leaq 24(%rsp), %rcx
2936       */
2937       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2938                                      hregAMD64_RDI()));
2939       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2940                                      hregAMD64_RSI()));
2941       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2942                                      hregAMD64_RDX()));
2943       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2944                                      hregAMD64_RCX()));
2945       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2946          movsd  %argX, 0(%rsi)
2947          movsd  %argY, 0(%rdx)
2948          movsd  %argZ, 0(%rcx)
2949          */
2950       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2951                                        AMD64AMode_IR(0, hregAMD64_RSI())));
2952       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2953                                        AMD64AMode_IR(0, hregAMD64_RDX())));
2954       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2955                                        AMD64AMode_IR(0, hregAMD64_RCX())));
2956       /* call the helper */
2957       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2958                                      (ULong)(HWord)h_generic_calc_MAddF64,
2959                                      4, mk_RetLoc_simple(RLPri_None) ));
2960       /* fetch the result back from memory, at 0(%rsp), where the
2961          helper has just written it. */
2962       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2963                                        AMD64AMode_IR(0, hregAMD64_RSP())));
2964       /* and finally, clear the space */
2965       add_to_rsp(env, 32);
2966       return dst;
2967    }
2968 
2969    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2970       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2971       HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
2972       HReg        dst    = newVRegV(env);
2973 
2974       /* arg now holds the value to be rounded.  The first thing to do
2975          is set the FPU's rounding mode accordingly. */
2976 
2977       /* Set host x87 rounding mode */
2978       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2979 
2980       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2981       addInstr(env, AMD64Instr_A87Free(1));
2982       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2983       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2984       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2985       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2986 
2987       /* Restore default x87 rounding. */
2988       set_FPU_rounding_default( env );
2989 
2990       return dst;
2991    }
2992 
2993    IRTriop *triop = e->Iex.Triop.details;
2994    if (e->tag == Iex_Triop
2995        && (triop->op == Iop_ScaleF64
2996            || triop->op == Iop_AtanF64
2997            || triop->op == Iop_Yl2xF64
2998            || triop->op == Iop_Yl2xp1F64
2999            || triop->op == Iop_PRemF64
3000            || triop->op == Iop_PRem1F64)
3001       ) {
3002       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3003       HReg        arg1   = iselDblExpr(env, triop->arg2);
3004       HReg        arg2   = iselDblExpr(env, triop->arg3);
3005       HReg        dst    = newVRegV(env);
3006       Bool     arg2first = toBool(triop->op == Iop_ScaleF64
3007                                   || triop->op == Iop_PRemF64
3008                                   || triop->op == Iop_PRem1F64);
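      /* The second push below ends up on top of the x87 stack.  So
         when arg2first is True, arg1 (triop->arg2) lands in %st(0) and
         arg2 (triop->arg3) in %st(1), and vice versa otherwise -- the
         ordering these non-commutative x87 ops require. */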
3009       addInstr(env, AMD64Instr_A87Free(2));
3010 
3011       /* one arg -> top of x87 stack */
3012       addInstr(env, AMD64Instr_SseLdSt(
3013                        False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3014       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3015 
3016       /* other arg -> top of x87 stack */
3017       addInstr(env, AMD64Instr_SseLdSt(
3018                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3019       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3020 
3021       /* do it */
3022       /* XXXROUNDINGFIXME */
3023       /* set roundingmode here */
3024       switch (triop->op) {
3025          case Iop_ScaleF64:
3026             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3027             break;
3028          case Iop_AtanF64:
3029             addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3030             break;
3031          case Iop_Yl2xF64:
3032             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3033             break;
3034          case Iop_Yl2xp1F64:
3035             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3036             break;
3037          case Iop_PRemF64:
3038             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3039             break;
3040          case Iop_PRem1F64:
3041             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3042             break;
3043          default:
3044             vassert(0);
3045       }
3046 
3047       /* save result */
3048       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3049       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3050       return dst;
3051    }
3052 
3053    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3054       HReg dst = newVRegV(env);
3055       HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3056       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3057       addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3058       set_SSE_rounding_default( env );
3059       return dst;
3060    }
3061 
3062    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3063       HReg dst = newVRegV(env);
3064       HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3065       set_SSE_rounding_default( env );
3066       addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3067       return dst;
3068    }
3069 
3070    if (e->tag == Iex_Unop
3071        && (e->Iex.Unop.op == Iop_NegF64
3072            || e->Iex.Unop.op == Iop_AbsF64)) {
3073       /* Sigh ... very rough code.  Could do much better. */
3074       /* Get the 128-bit literal 00---0 10---0 into a register
3075          and xor/nand it with the value to be negated. */
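      /* The literal has only bit 63 -- the sign bit of the low F64
         lane -- set.  It is loaded into dst while tmp holds the value:
         Asse_XOR (dst = dst ^ tmp) flips the sign bit for NegF64, and
         Asse_ANDN (dst = ~dst & tmp) clears it for AbsF64.  The upper
         lane is unaffected since the mask's high 64 bits are zero. */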
3076       HReg r1  = newVRegI(env);
3077       HReg dst = newVRegV(env);
3078       HReg tmp = newVRegV(env);
3079       HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3080       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3081       addInstr(env, mk_vMOVsd_RR(src,tmp));
3082       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3083       addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3084       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3085       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3086 
3087       if (e->Iex.Unop.op == Iop_NegF64)
3088          addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3089       else
3090          addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3091 
3092       add_to_rsp(env, 16);
3093       return dst;
3094    }
3095 
3096    if (e->tag == Iex_Binop) {
3097       A87FpOp fpop = Afp_INVALID;
3098       switch (e->Iex.Binop.op) {
3099          case Iop_SqrtF64: fpop = Afp_SQRT; break;
3100          case Iop_SinF64:  fpop = Afp_SIN;  break;
3101          case Iop_CosF64:  fpop = Afp_COS;  break;
3102          case Iop_TanF64:  fpop = Afp_TAN;  break;
3103          case Iop_2xm1F64: fpop = Afp_2XM1; break;
3104          default: break;
3105       }
3106       if (fpop != Afp_INVALID) {
3107          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3108          HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
3109          HReg        dst    = newVRegV(env);
3110          Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
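         /* TanF64 needs two free x87 slots because fptan pushes an
            extra 1.0 onto the x87 stack on top of its result. */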
3111          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3112          addInstr(env, AMD64Instr_A87Free(nNeeded));
3113          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3114          /* XXXROUNDINGFIXME */
3115          /* set roundingmode here */
3116          /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3117             codes.  I don't think that matters, since this insn
3118             selector never generates such an instruction intervening
3119             between a flag-setting instruction and a flag-using
3120             instruction. */
3121          addInstr(env, AMD64Instr_A87FpOp(fpop));
3122          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3123          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3124          return dst;
3125       }
3126    }
3127 
3128    if (e->tag == Iex_Unop) {
3129       switch (e->Iex.Unop.op) {
3130 //..          case Iop_I32toF64: {
3131 //..             HReg dst = newVRegF(env);
3132 //..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3133 //..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3134 //..             set_FPU_rounding_default(env);
3135 //..             addInstr(env, X86Instr_FpLdStI(
3136 //..                              True/*load*/, 4, dst,
3137 //..                              X86AMode_IR(0, hregX86_ESP())));
3138 //..             add_to_esp(env, 4);
3139 //..             return dst;
3140 //..          }
3141          case Iop_ReinterpI64asF64: {
3142             /* Given an I64, produce an IEEE754 double with the same
3143                bit pattern. */
3144             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3145             HReg        dst    = newVRegV(env);
3146             AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
3147             /* paranoia */
3148             set_SSE_rounding_default(env);
3149             addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3150             addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3151             return dst;
3152          }
3153          case Iop_F32toF64: {
3154             HReg f32;
3155             HReg f64 = newVRegV(env);
3156             /* this shouldn't be necessary, but be paranoid ... */
3157             set_SSE_rounding_default(env);
3158             f32 = iselFltExpr(env, e->Iex.Unop.arg);
3159             addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3160             return f64;
3161          }
3162          default:
3163             break;
3164       }
3165    }
3166 
3167    /* --------- MULTIPLEX --------- */
3168    if (e->tag == Iex_ITE) { // VFD
3169       HReg r1, r0, dst;
3170       vassert(ty == Ity_F64);
3171       vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3172       r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
3173       r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
3174       dst = newVRegV(env);
3175       addInstr(env, mk_vMOVsd_RR(r1,dst));
3176       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
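      /* dst already holds the iftrue value, so the conditional move is
         done on the negated condition (cc ^ 1): dst is overwritten
         with the iffalse value only when the condition is false. */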
3177       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3178       return dst;
3179    }
3180 
3181    ppIRExpr(e);
3182    vpanic("iselDblExpr_wrk");
3183 }
3184 
3185 
3186 /*---------------------------------------------------------*/
3187 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3188 /*---------------------------------------------------------*/
3189 
3190 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3191 {
3192    HReg r = iselVecExpr_wrk( env, e );
3193 #  if 0
3194    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3195 #  endif
3196    vassert(hregClass(r) == HRcVec128);
3197    vassert(hregIsVirtual(r));
3198    return r;
3199 }
3200 
3201 
3202 /* DO NOT CALL THIS DIRECTLY */
3203 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3204 {
3205    HWord      fn = 0; /* address of helper fn, if required */
3206    Bool       arg1isEReg = False;
3207    AMD64SseOp op = Asse_INVALID;
3208    vassert(e);
3209    IRType ty = typeOfIRExpr(env->type_env, e);
3210    vassert(ty == Ity_V128);
3211    UInt laneBits = 0;
3212 
3213    if (e->tag == Iex_RdTmp) {
3214       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3215    }
3216 
3217    if (e->tag == Iex_Get) {
3218       HReg dst = newVRegV(env);
3219       addInstr(env, AMD64Instr_SseLdSt(
3220                        True/*load*/,
3221                        16,
3222                        dst,
3223                        AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3224                     )
3225               );
3226       return dst;
3227    }
3228 
3229    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3230       HReg        dst = newVRegV(env);
3231       AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3232       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3233       return dst;
3234    }
3235 
3236    if (e->tag == Iex_Const) {
3237       HReg dst = newVRegV(env);
3238       vassert(e->Iex.Const.con->tag == Ico_V128);
3239       switch (e->Iex.Const.con->Ico.V128) {
3240          case 0x0000:
3241             dst = generate_zeroes_V128(env);
3242             break;
3243          case 0xFFFF:
3244             dst = generate_ones_V128(env);
3245             break;
3246          default: {
3247             AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3248             /* do push_uimm64 twice, first time for the high-order half. */
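            /* An Ico_V128 constant is a 16-bit mask, one bit per byte
               of the vector; bitmask8_to_bytemask64 expands each 8-bit
               half into a 64-bit value of 0x00/0xFF bytes.  Pushing the
               high half first leaves it at 8(%rsp) and the low half at
               0(%rsp), ready for the 16-byte load. */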
3249             push_uimm64(env, bitmask8_to_bytemask64(
3250                                 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3251                        ));
3252             push_uimm64(env, bitmask8_to_bytemask64(
3253                                 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3254                        ));
3255             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3256             add_to_rsp(env, 16);
3257             break;
3258          }
3259       }
3260       return dst;
3261    }
3262 
3263    if (e->tag == Iex_Unop) {
3264    switch (e->Iex.Unop.op) {
3265 
3266       case Iop_NotV128: {
3267          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3268          return do_sse_NotV128(env, arg);
3269       }
3270 
3271       case Iop_CmpNEZ64x2: {
3272          /* We can use SSE2 instructions for this. */
3273          /* Ideally, we want to do a 64Ix2 comparison against zero of
3274             the operand.  Problem is no such insn exists.  Solution
3275             therefore is to do a 32Ix4 comparison instead, and bitwise-
3276             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3277             let the not'd result of this initial comparison be a:b:c:d.
3278             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3279             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3280             giving the required result.
3281 
3282             The required selection sequence is 2,3,0,1, which
3283             according to Intel's documentation means the pshufd
3284             literal value is 0xB1, that is,
3285             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3286          */
3287          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3288          HReg tmp  = generate_zeroes_V128(env);
3289          HReg dst  = newVRegV(env);
3290          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3291          tmp = do_sse_NotV128(env, tmp);
3292          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3293          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3294          return dst;
3295       }
3296 
3297       case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3298       case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3299       case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3300       do_CmpNEZ_vector:
3301       {
3302          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3303          HReg tmp  = newVRegV(env);
3304          HReg zero = generate_zeroes_V128(env);
3305          HReg dst;
3306          addInstr(env, mk_vMOVsd_RR(arg, tmp));
3307          addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3308          dst = do_sse_NotV128(env, tmp);
3309          return dst;
3310       }
3311 
3312       case Iop_RecipEst32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
3313       case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3314       do_32Fx4_unary:
3315       {
3316          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3317          HReg dst = newVRegV(env);
3318          addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3319          return dst;
3320       }
3321 
3322       case Iop_RecipEst32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
3323       case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3324       case Iop_Sqrt32F0x4:     op = Asse_SQRTF;  goto do_32F0x4_unary;
3325       do_32F0x4_unary:
3326       {
3327          /* A bit subtle.  We have to copy the arg to the result
3328             register first, because actually doing the SSE scalar insn
3329             leaves the upper 3/4 of the destination register
3330             unchanged.  Whereas the required semantics of these
3331             primops is that the upper 3/4 is simply copied in from the
3332             argument. */
3333          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3334          HReg dst = newVRegV(env);
3335          addInstr(env, mk_vMOVsd_RR(arg, dst));
3336          addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3337          return dst;
3338       }
3339 
3340       case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
3341       do_64F0x2_unary:
3342       {
3343          /* A bit subtle.  We have to copy the arg to the result
3344             register first, because actually doing the SSE scalar insn
3345             leaves the upper half of the destination register
3346             unchanged.  Whereas the required semantics of these
3347             primops is that the upper half is simply copied in from the
3348             argument. */
3349          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3350          HReg dst = newVRegV(env);
3351          addInstr(env, mk_vMOVsd_RR(arg, dst));
3352          addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3353          return dst;
3354       }
3355 
3356       case Iop_32UtoV128: {
3357          HReg        dst     = newVRegV(env);
3358          AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3359          AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
3360          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3361          addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3362          return dst;
3363       }
3364 
3365       case Iop_64UtoV128: {
3366          HReg        dst  = newVRegV(env);
3367          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3368          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3369          addInstr(env, AMD64Instr_Push(rmi));
3370          addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3371          add_to_rsp(env, 8);
3372          return dst;
3373       }
3374 
3375       case Iop_V256toV128_0:
3376       case Iop_V256toV128_1: {
3377          HReg vHi, vLo;
3378          iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3379          return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3380       }
3381 
3382       default:
3383          break;
3384    } /* switch (e->Iex.Unop.op) */
3385    } /* if (e->tag == Iex_Unop) */
3386 
3387    if (e->tag == Iex_Binop) {
3388    switch (e->Iex.Binop.op) {
3389 
3390       case Iop_Sqrt64Fx2:
3391       case Iop_Sqrt32Fx4: {
3392          /* :: (rmode, vec) -> vec */
3393          HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3394          HReg dst = newVRegV(env);
3395          /* XXXROUNDINGFIXME */
3396          /* set roundingmode here */
3397          addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3398                            ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3399                        (Asse_SQRTF, arg, dst));
3400          return dst;
3401       }
3402 
3403       /* FIXME: could we generate MOVQ here? */
3404       case Iop_SetV128lo64: {
3405          HReg dst  = newVRegV(env);
3406          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3407          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3408          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3409          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3410          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3411          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3412          return dst;
3413       }
3414 
3415       /* FIXME: could we generate MOVD here? */
3416       case Iop_SetV128lo32: {
3417          HReg dst  = newVRegV(env);
3418          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3419          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3420          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3421          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3422          addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3423          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3424          return dst;
3425       }
3426 
3427       case Iop_64HLtoV128: {
3428          const IRExpr* arg1 = e->Iex.Binop.arg1;
3429          const IRExpr* arg2 = e->Iex.Binop.arg2;
3430          HReg dst = newVRegV(env);
3431          HReg tmp = newVRegV(env);
3432          HReg qHi = iselIntExpr_R(env, arg1);
3433          // If the args are trivially the same (tmp or const), use the same
3434          // source register for both, and only one movq since those are
3435          // (relatively) expensive.
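         // In the general case the value is assembled as: movq qHi into
         // the low half of dst, shift dst left by 64 bits so qHi ends up
         // in the high half, movq qLo into the low half of tmp, and OR
         // tmp into dst, giving qHi:qLo.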
3436          if (areAtomsAndEqual(arg1, arg2)) {
3437             addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3438             addInstr(env, mk_vMOVsd_RR(dst, tmp));
3439             addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3440             addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3441          } else {
3442             HReg qLo = iselIntExpr_R(env, arg2);
3443             addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3444             addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3445             addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3446             addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3447          }
3448          return dst;
3449       }
3450 
3451       case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3452       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3453       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3454       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3455       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
3456       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
3457       do_32Fx4:
3458       {
3459          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3460          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3461          HReg dst = newVRegV(env);
3462          addInstr(env, mk_vMOVsd_RR(argL, dst));
3463          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3464          return dst;
3465       }
3466 
3467       case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3468       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3469       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3470       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3471       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
3472       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
3473       do_64Fx2:
3474       {
3475          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3476          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3477          HReg dst = newVRegV(env);
3478          addInstr(env, mk_vMOVsd_RR(argL, dst));
3479          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3480          return dst;
3481       }
3482 
3483       case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3484       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3485       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3486       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3487       case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
3488       case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
3489       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
3490       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
3491       case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
3492       case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
3493       do_32F0x4: {
3494          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3495          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3496          HReg dst = newVRegV(env);
3497          addInstr(env, mk_vMOVsd_RR(argL, dst));
3498          addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3499          return dst;
3500       }
3501 
3502       case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3503       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3504       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3505       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3506       case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
3507       case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
3508       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
3509       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
3510       case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
3511       case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
3512       do_64F0x2: {
3513          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3514          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3515          HReg dst = newVRegV(env);
3516          addInstr(env, mk_vMOVsd_RR(argL, dst));
3517          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3518          return dst;
3519       }
3520 
3521       case Iop_PermOrZero8x16:
3522          if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3523             op = Asse_PSHUFB;
3524             goto do_SseReRg;
3525          }
3526          // Otherwise we'll have to generate a call to
3527          // h_generic_calc_PermOrZero8x16 (ATK).  But that would only be for a
3528          // host which doesn't have SSSE3, in which case we don't expect this
3529          // IROp to enter the compilation pipeline in the first place.
3530          break;
3531 
3532       case Iop_QNarrowBin32Sto16Sx8:
3533          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3534       case Iop_QNarrowBin16Sto8Sx16:
3535          op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3536       case Iop_QNarrowBin16Sto8Ux16:
3537          op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3538 
3539       case Iop_InterleaveHI8x16:
3540          op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3541       case Iop_InterleaveHI16x8:
3542          op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3543       case Iop_InterleaveHI32x4:
3544          op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3545       case Iop_InterleaveHI64x2:
3546          op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3547 
3548       case Iop_InterleaveLO8x16:
3549          op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3550       case Iop_InterleaveLO16x8:
3551          op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3552       case Iop_InterleaveLO32x4:
3553          op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3554       case Iop_InterleaveLO64x2:
3555          op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3556 
3557       case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
3558       case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
3559       case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
3560       case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
3561       case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
3562       case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
3563       case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
3564       case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
3565       case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
3566       case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
3567       case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
3568       case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
3569       case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
3570       case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
3571       case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
3572       case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
3573       case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
3574       case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3575       case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3576       case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
3577       case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
3578       case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
3579       case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
3580       case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3581       case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3582       case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
3583       case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
3584       case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
3585       case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
3586       case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
3587       case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
3588       case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
3589       case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
3590       case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
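      /* When arg1isEReg is set (the pack/interleave cases above), arg2
         is copied into dst and arg1 is used as the E operand; otherwise
         arg1 goes into dst and arg2 is the E operand.  The distinction
         matters because those ops are not commutative. */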
3591       do_SseReRg: {
3592          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3593          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3594          HReg dst = newVRegV(env);
3595          if (arg1isEReg) {
3596             addInstr(env, mk_vMOVsd_RR(arg2, dst));
3597             addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3598          } else {
3599             addInstr(env, mk_vMOVsd_RR(arg1, dst));
3600             addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3601          }
3602          return dst;
3603       }
3604 
3605       case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3606       case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3607       case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3608       case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3609       case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3610       case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3611       case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3612       case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3613       do_SseShift: {
3614          HReg dst  = newVRegV(env);
3615          HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3616          /* If it's a shift by an in-range immediate, generate a single
3617             instruction. */
3618          if (e->Iex.Binop.arg2->tag == Iex_Const) {
3619             IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3620             vassert(c->tag == Ico_U8);
3621             UInt shift = c->Ico.U8;
3622             if (shift < laneBits) {
3623                addInstr(env, mk_vMOVsd_RR(greg, dst));
3624                addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3625                return dst;
3626             }
3627          }
3628          /* Otherwise we have to do it the longwinded way. */
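         /* The two pushes build a 16-byte value at (%rsp) whose low 64
            bits are the shift amount and whose high 64 bits are zero;
            the SSE shift-by-register forms take their count from the
            low 64 bits of the E register. */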
3629          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3630          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3631          HReg        ereg = newVRegV(env);
3632          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3633          addInstr(env, AMD64Instr_Push(rmi));
3634          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3635          addInstr(env, mk_vMOVsd_RR(greg, dst));
3636          addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3637          add_to_rsp(env, 16);
3638          return dst;
3639       }
3640 
3641       case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
3642                            goto do_SseAssistedBinary;
3643       case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
3644                            goto do_SseAssistedBinary;
3645       case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
3646                            goto do_SseAssistedBinary;
3647       case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
3648                            goto do_SseAssistedBinary;
3649       case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
3650                            goto do_SseAssistedBinary;
3651       case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
3652                            goto do_SseAssistedBinary;
3653       case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
3654                            goto do_SseAssistedBinary;
3655       case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
3656                            goto do_SseAssistedBinary;
3657       case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
3658                            goto do_SseAssistedBinary;
3659       case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
3660                            goto do_SseAssistedBinary;
3661       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3662                            goto do_SseAssistedBinary;
3663       case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
3664                            goto do_SseAssistedBinary;
3665       case Iop_QNarrowBin32Sto16Ux8:
3666                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3667                            goto do_SseAssistedBinary;
3668       case Iop_NarrowBin16to8x16:
3669                            fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3670                            goto do_SseAssistedBinary;
3671       case Iop_NarrowBin32to16x8:
3672                            fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3673                            goto do_SseAssistedBinary;
3674       do_SseAssistedBinary: {
3675          /* RRRufff!  RRRufff code is what we're generating here.  Oh
3676             well. */
3677          vassert(fn != 0);
3678          HReg dst = newVRegV(env);
3679          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3680          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3681          HReg argp = newVRegI(env);
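         /* Scratch-area layout used below: argp is a 16-aligned pointer
            into the 112 bytes reserved on the stack, with the result at
            0(argp), argL at 16(argp) and argR at 32(argp).  The helper
            receives those three pointers in %rdi, %rsi and %rdx. */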
3682          /* subq $112, %rsp         -- make a space*/
3683          sub_from_rsp(env, 112);
3684          /* leaq 48(%rsp), %r_argp  -- point into it */
3685          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3686                                         argp));
3687          /* andq $-16, %r_argp      -- 16-align the pointer */
3688          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3689                                          AMD64RMI_Imm( ~(UInt)15 ),
3690                                          argp));
3691          /* Prepare 3 arg regs:
3692             leaq 0(%r_argp), %rdi
3693             leaq 16(%r_argp), %rsi
3694             leaq 32(%r_argp), %rdx
3695          */
3696          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3697                                         hregAMD64_RDI()));
3698          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3699                                         hregAMD64_RSI()));
3700          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3701                                         hregAMD64_RDX()));
3702          /* Store the two args, at (%rsi) and (%rdx):
3703             movupd  %argL, 0(%rsi)
3704             movupd  %argR, 0(%rdx)
3705          */
3706          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3707                                           AMD64AMode_IR(0, hregAMD64_RSI())));
3708          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3709                                           AMD64AMode_IR(0, hregAMD64_RDX())));
3710          /* call the helper */
3711          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3712                                         3, mk_RetLoc_simple(RLPri_None) ));
3713          /* fetch the result from memory, using %r_argp, which the
3714             register allocator will keep alive across the call. */
3715          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3716                                           AMD64AMode_IR(0, argp)));
3717          /* and finally, clear the space */
3718          add_to_rsp(env, 112);
3719          return dst;
3720       }
3721 
3722       case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3723                          goto do_SseAssistedVectorAndScalar;
3724       case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3725                          goto do_SseAssistedVectorAndScalar;
3726       do_SseAssistedVectorAndScalar: {
3727          /* RRRufff!  RRRufff code is what we're generating here.  Oh
3728             well. */
3729          vassert(fn != 0);
3730          HReg dst = newVRegV(env);
3731          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3732          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3733          HReg argp = newVRegI(env);
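         /* Same scheme as do_SseAssistedBinary above, except that only
            the vector argument goes via memory (at 16(argp)); the
            scalar second argument is passed directly in %rdx.  The
            result is read back from 0(argp). */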
3734          /* subq $112, %rsp         -- make a space*/
3735          sub_from_rsp(env, 112);
3736          /* leaq 48(%rsp), %r_argp  -- point into it */
3737          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3738                                         argp));
3739          /* andq $-16, %r_argp      -- 16-align the pointer */
3740          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3741                                          AMD64RMI_Imm( ~(UInt)15 ),
3742                                          argp));
3743          /* Prepare 2 vector arg regs:
3744             leaq 0(%r_argp), %rdi
3745             leaq 16(%r_argp), %rsi
3746          */
3747          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3748                                         hregAMD64_RDI()));
3749          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3750                                         hregAMD64_RSI()));
3751          /* Store the vector arg, at (%rsi):
3752             movupd  %argL, 0(%rsi)
3753          */
3754          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3755                                           AMD64AMode_IR(0, hregAMD64_RSI())));
3756          /* And get the scalar value into rdx */
3757          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3758 
3759          /* call the helper */
3760          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3761                                         3, mk_RetLoc_simple(RLPri_None) ));
3762          /* fetch the result from memory, using %r_argp, which the
3763             register allocator will keep alive across the call. */
3764          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3765                                           AMD64AMode_IR(0, argp)));
3766          /* and finally, clear the space */
3767          add_to_rsp(env, 112);
3768          return dst;
3769       }
3770 
3771       case Iop_I32StoF32x4:
3772       case Iop_F32toI32Sx4: {
3773          HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3774          HReg dst = newVRegV(env);
3775          AMD64SseOp mop
3776             = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3777          set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3778          addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3779          set_SSE_rounding_default(env);
3780          return dst;
3781       }
3782 
3783       default:
3784          break;
3785    } /* switch (e->Iex.Binop.op) */
3786    } /* if (e->tag == Iex_Binop) */
3787 
3788    if (e->tag == Iex_Triop) {
3789    IRTriop *triop = e->Iex.Triop.details;
3790    switch (triop->op) {
3791 
3792       case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3793       case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3794       case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3795       case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3796       do_64Fx2_w_rm:
3797       {
3798          HReg argL = iselVecExpr(env, triop->arg2);
3799          HReg argR = iselVecExpr(env, triop->arg3);
3800          HReg dst = newVRegV(env);
3801          addInstr(env, mk_vMOVsd_RR(argL, dst));
3802          /* XXXROUNDINGFIXME */
3803          /* set roundingmode here */
3804          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3805          return dst;
3806       }
3807 
3808       case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3809       case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3810       case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3811       case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3812       do_32Fx4_w_rm:
3813       {
3814          HReg argL = iselVecExpr(env, triop->arg2);
3815          HReg argR = iselVecExpr(env, triop->arg3);
3816          HReg dst = newVRegV(env);
3817          addInstr(env, mk_vMOVsd_RR(argL, dst));
3818          /* XXXROUNDINGFIXME */
3819          /* set roundingmode here */
3820          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3821          return dst;
3822       }
3823 
3824       default:
3825          break;
3826    } /* switch (triop->op) */
3827    } /* if (e->tag == Iex_Triop) */
3828 
3829    if (e->tag == Iex_ITE) { // VFD
3830       HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
3831       HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
3832       HReg dst = newVRegV(env);
3833       addInstr(env, mk_vMOVsd_RR(r1,dst));
3834       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3835       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3836       return dst;
3837    }
3838 
3839    //vec_fail:
3840    vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3841               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3842    ppIRExpr(e);
3843    vpanic("iselVecExpr_wrk");
3844 }
3845 
3846 
3847 /*---------------------------------------------------------*/
3848 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
3849 /*---------------------------------------------------------*/
3850 
3851 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3852                            ISelEnv* env, const IRExpr* e )
3853 {
3854    iselDVecExpr_wrk( rHi, rLo, env, e );
3855 #  if 0
3856    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3857 #  endif
3858    vassert(hregClass(*rHi) == HRcVec128);
3859    vassert(hregClass(*rLo) == HRcVec128);
3860    vassert(hregIsVirtual(*rHi));
3861    vassert(hregIsVirtual(*rLo));
3862 }
3863 
3864 
3865 /* DO NOT CALL THIS DIRECTLY */
3866 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3867                                ISelEnv* env, const IRExpr* e )
3868 {
3869    HWord fn = 0; /* address of helper fn, if required */
3870    vassert(e);
3871    IRType ty = typeOfIRExpr(env->type_env, e);
3872    vassert(ty == Ity_V256);
3873    UInt laneBits = 0;
3874 
3875    AMD64SseOp op = Asse_INVALID;
3876 
3877    /* read 256-bit IRTemp */
3878    if (e->tag == Iex_RdTmp) {
3879       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3880       return;
3881    }
3882 
3883    if (e->tag == Iex_Get) {
3884       HReg        vHi  = newVRegV(env);
3885       HReg        vLo  = newVRegV(env);
3886       HReg        rbp  = hregAMD64_RBP();
3887       AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
3888       AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3889       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3890       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3891       *rHi = vHi;
3892       *rLo = vLo;
3893       return;
3894    }
3895 
3896    if (e->tag == Iex_Load) {
3897       HReg        vHi  = newVRegV(env);
3898       HReg        vLo  = newVRegV(env);
3899       HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
3900       AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
3901       AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3902       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3903       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3904       *rHi = vHi;
3905       *rLo = vLo;
3906       return;
3907    }
3908 
3909    if (e->tag == Iex_Const) {
3910       vassert(e->Iex.Const.con->tag == Ico_V256);
3911       switch (e->Iex.Const.con->Ico.V256) {
3912          case 0x00000000: {
3913             HReg vHi = generate_zeroes_V128(env);
3914             HReg vLo = newVRegV(env);
3915             addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3916             *rHi = vHi;
3917             *rLo = vLo;
3918             return;
3919          }
3920          default:
3921             break; /* give up; handle other constants if and when it becomes necessary. */
3922       }
3923    }
3924 
3925    if (e->tag == Iex_Unop) {
3926    switch (e->Iex.Unop.op) {
3927 
3928       case Iop_NotV256: {
3929          HReg argHi, argLo;
3930          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3931          *rHi = do_sse_NotV128(env, argHi);
3932          *rLo = do_sse_NotV128(env, argLo);
3933          return;
3934       }
3935 
3936       case Iop_RecipEst32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
3937       case Iop_Sqrt32Fx8:     op = Asse_SQRTF;  goto do_32Fx8_unary;
3938       case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3939       do_32Fx8_unary:
3940       {
3941          HReg argHi, argLo;
3942          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3943          HReg dstHi = newVRegV(env);
3944          HReg dstLo = newVRegV(env);
3945          addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
3946          addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
3947          *rHi = dstHi;
3948          *rLo = dstLo;
3949          return;
3950       }
3951 
3952       case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
3953       do_64Fx4_unary:
3954       {
3955          HReg argHi, argLo;
3956          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3957          HReg dstHi = newVRegV(env);
3958          HReg dstLo = newVRegV(env);
3959          addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
3960          addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
3961          *rHi = dstHi;
3962          *rLo = dstLo;
3963          return;
3964       }
3965 
3966       case Iop_CmpNEZ64x4: {
3967          /* We can use SSE2 instructions for this. */
3968          /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
3969             (obviously).  See comment on Iop_CmpNEZ64x2 for
3970             explanation of what's going on here. */
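              /* In short: compare each 32-bit lane against zero, invert,
                 and then OR each result with a copy of itself whose 32-bit
                 lanes are swapped within each 64-bit half (shuffle 0xB1).
                 Each 64-bit lane of the result is then all ones exactly
                 when the corresponding input lane was nonzero. */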
3971          HReg argHi, argLo;
3972          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3973          HReg tmpHi  = generate_zeroes_V128(env);
3974          HReg tmpLo  = newVRegV(env);
3975          addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
3976          HReg dstHi  = newVRegV(env);
3977          HReg dstLo  = newVRegV(env);
3978          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
3979          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
3980          tmpHi = do_sse_NotV128(env, tmpHi);
3981          tmpLo = do_sse_NotV128(env, tmpLo);
3982          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
3983          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
3984          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
3985          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
3986          *rHi = dstHi;
3987          *rLo = dstLo;
3988          return;
3989       }
3990 
3991       case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3992       case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3993       case Iop_CmpNEZ8x32: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3994       do_CmpNEZ_vector:
3995       {
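              /* tmp := (arg == 0) per lane, dst := ~tmp, so each lane of
                 dst is all ones iff the corresponding arg lane is nonzero. */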
3996          HReg argHi, argLo;
3997          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3998          HReg tmpHi = newVRegV(env);
3999          HReg tmpLo = newVRegV(env);
4000          HReg zero  = generate_zeroes_V128(env);
4001          HReg dstHi, dstLo;
4002          addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4003          addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4004          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4005          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4006          dstHi = do_sse_NotV128(env, tmpHi);
4007          dstLo = do_sse_NotV128(env, tmpLo);
4008          *rHi = dstHi;
4009          *rLo = dstLo;
4010          return;
4011       }
4012 
4013       default:
4014          break;
4015    } /* switch (e->Iex.Unop.op) */
4016    } /* if (e->tag == Iex_Unop) */
4017 
4018    if (e->tag == Iex_Binop) {
4019    switch (e->Iex.Binop.op) {
4020 
4021       case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
4022       case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
4023       do_64Fx4:
4024       {
4025          HReg argLhi, argLlo, argRhi, argRlo;
4026          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4027          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4028          HReg dstHi = newVRegV(env);
4029          HReg dstLo = newVRegV(env);
4030          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4031          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4032          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4033          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4034          *rHi = dstHi;
4035          *rLo = dstLo;
4036          return;
4037       }
4038 
4039       case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
4040       case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
4041       do_32Fx8:
4042       {
4043          HReg argLhi, argLlo, argRhi, argRlo;
4044          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4045          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4046          HReg dstHi = newVRegV(env);
4047          HReg dstLo = newVRegV(env);
4048          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4049          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4050          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4051          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4052          *rHi = dstHi;
4053          *rLo = dstLo;
4054          return;
4055       }
4056 
4057       case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
4058       case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
4059       case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
4060       case Iop_Add8x32:    op = Asse_ADD8;     goto do_SseReRg;
4061       case Iop_Add16x16:   op = Asse_ADD16;    goto do_SseReRg;
4062       case Iop_Add32x8:    op = Asse_ADD32;    goto do_SseReRg;
4063       case Iop_Add64x4:    op = Asse_ADD64;    goto do_SseReRg;
4064       case Iop_QAdd8Sx32:  op = Asse_QADD8S;   goto do_SseReRg;
4065       case Iop_QAdd16Sx16: op = Asse_QADD16S;  goto do_SseReRg;
4066       case Iop_QAdd8Ux32:  op = Asse_QADD8U;   goto do_SseReRg;
4067       case Iop_QAdd16Ux16: op = Asse_QADD16U;  goto do_SseReRg;
4068       case Iop_Avg8Ux32:   op = Asse_AVG8U;    goto do_SseReRg;
4069       case Iop_Avg16Ux16:  op = Asse_AVG16U;   goto do_SseReRg;
4070       case Iop_CmpEQ8x32:  op = Asse_CMPEQ8;   goto do_SseReRg;
4071       case Iop_CmpEQ16x16: op = Asse_CMPEQ16;  goto do_SseReRg;
4072       case Iop_CmpEQ32x8:  op = Asse_CMPEQ32;  goto do_SseReRg;
4073       case Iop_CmpGT8Sx32: op = Asse_CMPGT8S;  goto do_SseReRg;
4074       case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4075       case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4076       case Iop_Max16Sx16:  op = Asse_MAX16S;   goto do_SseReRg;
4077       case Iop_Max8Ux32:   op = Asse_MAX8U;    goto do_SseReRg;
4078       case Iop_Min16Sx16:  op = Asse_MIN16S;   goto do_SseReRg;
4079       case Iop_Min8Ux32:   op = Asse_MIN8U;    goto do_SseReRg;
4080       case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4081       case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4082       case Iop_Mul16x16:   op = Asse_MUL16;    goto do_SseReRg;
4083       case Iop_Sub8x32:    op = Asse_SUB8;     goto do_SseReRg;
4084       case Iop_Sub16x16:   op = Asse_SUB16;    goto do_SseReRg;
4085       case Iop_Sub32x8:    op = Asse_SUB32;    goto do_SseReRg;
4086       case Iop_Sub64x4:    op = Asse_SUB64;    goto do_SseReRg;
4087       case Iop_QSub8Sx32:  op = Asse_QSUB8S;   goto do_SseReRg;
4088       case Iop_QSub16Sx16: op = Asse_QSUB16S;  goto do_SseReRg;
4089       case Iop_QSub8Ux32:  op = Asse_QSUB8U;   goto do_SseReRg;
4090       case Iop_QSub16Ux16: op = Asse_QSUB16U;  goto do_SseReRg;
4091       do_SseReRg:
4092       {
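              /* Each of the above lane-wise ops is handled by applying the
                 corresponding 128-bit SSE operation independently to the
                 high and low halves of the 256-bit value. */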
4093          HReg argLhi, argLlo, argRhi, argRlo;
4094          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4095          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4096          HReg dstHi = newVRegV(env);
4097          HReg dstLo = newVRegV(env);
4098          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4099          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4100          addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4101          addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4102          *rHi = dstHi;
4103          *rLo = dstLo;
4104          return;
4105       }
4106 
4107       case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4108       case Iop_ShlN32x8:  laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4109       case Iop_ShlN64x4:  laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4110       case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4111       case Iop_SarN32x8:  laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4112       case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4113       case Iop_ShrN32x8:  laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4114       case Iop_ShrN64x4:  laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4115       do_SseShift: {
4116          HReg dstHi = newVRegV(env);
4117          HReg dstLo = newVRegV(env);
4118          HReg gregHi, gregLo;
4119          iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4120          /* If it's a shift by an in-range immediate, generate two single
4121             instructions. */
4122          if (e->Iex.Binop.arg2->tag == Iex_Const) {
4123             IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4124             vassert(c->tag == Ico_U8);
4125             UInt shift = c->Ico.U8;
4126             if (shift < laneBits) {
4127                addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4128                addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4129                addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4130                addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4131                *rHi = dstHi;
4132                *rLo = dstLo;
4133                return;
4134             }
4135          }
4136          /* Otherwise we have to do it the longwinded way. */
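              /* Push a zero qword and then the shift amount, so the count
                 sits in the low 64 bits of the 16 bytes at %rsp; load that
                 into an XMM register, since the register forms of the SSE
                 shifts take their count from the low 64 bits. */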
4137          AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4138          AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
4139          HReg        ereg  = newVRegV(env);
4140          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4141          addInstr(env, AMD64Instr_Push(rmi));
4142          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4143          addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4144          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4145          addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4146          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4147          add_to_rsp(env, 16);
4148          *rHi = dstHi;
4149          *rLo = dstLo;
4150          return;
4151       }
4152 
4153       case Iop_V128HLtoV256: {
4154          // Curiously, there doesn't seem to be any benefit to be had here by
4155          // checking whether arg1 and arg2 are the same, in the style of how
4156          // (eg) 64HLtoV128 is handled elsewhere in this file.
4157          *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4158          *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4159          return;
4160       }
4161 
4162       case Iop_Mul32x8:    fn = (HWord)h_generic_calc_Mul32x4;
4163                            goto do_SseAssistedBinary;
4164       case Iop_Max32Sx8:   fn = (HWord)h_generic_calc_Max32Sx4;
4165                            goto do_SseAssistedBinary;
4166       case Iop_Min32Sx8:   fn = (HWord)h_generic_calc_Min32Sx4;
4167                            goto do_SseAssistedBinary;
4168       case Iop_Max32Ux8:   fn = (HWord)h_generic_calc_Max32Ux4;
4169                            goto do_SseAssistedBinary;
4170       case Iop_Min32Ux8:   fn = (HWord)h_generic_calc_Min32Ux4;
4171                            goto do_SseAssistedBinary;
4172       case Iop_Max16Ux16:  fn = (HWord)h_generic_calc_Max16Ux8;
4173                            goto do_SseAssistedBinary;
4174       case Iop_Min16Ux16:  fn = (HWord)h_generic_calc_Min16Ux8;
4175                            goto do_SseAssistedBinary;
4176       case Iop_Max8Sx32:   fn = (HWord)h_generic_calc_Max8Sx16;
4177                            goto do_SseAssistedBinary;
4178       case Iop_Min8Sx32:   fn = (HWord)h_generic_calc_Min8Sx16;
4179                            goto do_SseAssistedBinary;
4180       case Iop_CmpEQ64x4:  fn = (HWord)h_generic_calc_CmpEQ64x2;
4181                            goto do_SseAssistedBinary;
4182       case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4183                            goto do_SseAssistedBinary;
4184       do_SseAssistedBinary: {
4185          /* RRRufff!  RRRufff code is what we're generating here.  Oh
4186             well. */
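              /* Call the 128-bit helper twice, once per 256-bit half.  The
                 16-aligned scratch area is laid out as: result-hi at
                 0(%r_argp), argL-hi at 16, argR-hi at 32, result-lo at 48,
                 argL-lo at 64, argR-lo at 80. */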
4187          vassert(fn != 0);
4188          HReg dstHi = newVRegV(env);
4189          HReg dstLo = newVRegV(env);
4190          HReg argLhi, argLlo, argRhi, argRlo;
4191          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4192          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4193          HReg argp = newVRegI(env);
4194          /* subq $160, %rsp         -- make a space */
4195          sub_from_rsp(env, 160);
4196          /* leaq 48(%rsp), %r_argp  -- point into it */
4197          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4198                                         argp));
4199          /* andq $-16, %r_argp      -- 16-align the pointer */
4200          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4201                                          AMD64RMI_Imm( ~(UInt)15 ),
4202                                          argp));
4203          /* Prepare 3 arg regs:
4204             leaq 0(%r_argp), %rdi
4205             leaq 16(%r_argp), %rsi
4206             leaq 32(%r_argp), %rdx
4207          */
4208          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4209                                         hregAMD64_RDI()));
4210          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4211                                         hregAMD64_RSI()));
4212          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4213                                         hregAMD64_RDX()));
4214          /* Store the two high args, at (%rsi) and (%rdx):
4215             movupd  %argLhi, 0(%rsi)
4216             movupd  %argRhi, 0(%rdx)
4217          */
4218          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4219                                           AMD64AMode_IR(0, hregAMD64_RSI())));
4220          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4221                                           AMD64AMode_IR(0, hregAMD64_RDX())));
4222          /* Store the two low args, at 48(%rsi) and 48(%rdx):
4223             movupd  %argLlo, 48(%rsi)
4224             movupd  %argRlo, 48(%rdx)
4225          */
4226          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4227                                           AMD64AMode_IR(48, hregAMD64_RSI())));
4228          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4229                                           AMD64AMode_IR(48, hregAMD64_RDX())));
4230          /* call the helper */
4231          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4232                                         mk_RetLoc_simple(RLPri_None) ));
4233          /* Prepare 3 arg regs:
4234             leaq 48(%r_argp), %rdi
4235             leaq 64(%r_argp), %rsi
4236             leaq 80(%r_argp), %rdx
4237          */
4238          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4239                                         hregAMD64_RDI()));
4240          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4241                                         hregAMD64_RSI()));
4242          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4243                                         hregAMD64_RDX()));
4244          /* call the helper */
4245          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4246                                         mk_RetLoc_simple(RLPri_None) ));
4247          /* fetch the result from memory, using %r_argp, which the
4248             register allocator will keep alive across the call. */
4249          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4250                                           AMD64AMode_IR(0, argp)));
4251          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4252                                           AMD64AMode_IR(48, argp)));
4253          /* and finally, clear the space */
4254          add_to_rsp(env, 160);
4255          *rHi = dstHi;
4256          *rLo = dstLo;
4257          return;
4258       }
4259 
4260       case Iop_Perm32x8:   fn = (HWord)h_generic_calc_Perm32x8;
4261                            goto do_SseAssistedBinary256;
4262       do_SseAssistedBinary256: {
4263          /* RRRufff!  RRRufff code is what we're generating here.  Oh
4264             well. */
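              /* Unlike the 128-bit-helper scheme above, this helper takes
                 whole V256 values (low half at offset 0, high half at 16),
                 so a single call suffices: result at 0(%r_argp), argL at
                 32, argR at 64. */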
4265          vassert(fn != 0);
4266          HReg dstHi = newVRegV(env);
4267          HReg dstLo = newVRegV(env);
4268          HReg argLhi, argLlo, argRhi, argRlo;
4269          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4270          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4271          HReg argp = newVRegI(env);
4272          /* subq $160, %rsp         -- make a space */
4273          sub_from_rsp(env, 160);
4274          /* leaq 48(%rsp), %r_argp  -- point into it */
4275          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4276                                         argp));
4277          /* andq $-16, %r_argp      -- 16-align the pointer */
4278          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4279                                          AMD64RMI_Imm( ~(UInt)15 ),
4280                                          argp));
4281          /* Prepare 3 arg regs:
4282             leaq 0(%r_argp), %rdi
4283             leaq 32(%r_argp), %rsi
4284             leaq 64(%r_argp), %rdx
4285          */
4286          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4287                                         hregAMD64_RDI()));
4288          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4289                                         hregAMD64_RSI()));
4290          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4291                                         hregAMD64_RDX()));
4292          /* Store the two args, at (%rsi) and (%rdx):
4293             movupd  %argLlo, 0(%rsi)
4294             movupd  %argLhi, 16(%rsi)
4295             movupd  %argRlo, 0(%rdx)
4296             movupd  %argRhi, 16(%rdx)
4297          */
4298          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4299                                           AMD64AMode_IR(0, hregAMD64_RSI())));
4300          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4301                                           AMD64AMode_IR(16, hregAMD64_RSI())));
4302          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4303                                           AMD64AMode_IR(0, hregAMD64_RDX())));
4304          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4305                                           AMD64AMode_IR(16, hregAMD64_RDX())));
4306          /* call the helper */
4307          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4308                                         mk_RetLoc_simple(RLPri_None) ));
4309          /* fetch the result from memory, using %r_argp, which the
4310             register allocator will keep alive across the call. */
4311          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4312                                           AMD64AMode_IR(0, argp)));
4313          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4314                                           AMD64AMode_IR(16, argp)));
4315          /* and finally, clear the space */
4316          add_to_rsp(env, 160);
4317          *rHi = dstHi;
4318          *rLo = dstLo;
4319          return;
4320       }
4321 
4322       case Iop_I32StoF32x8:
4323       case Iop_F32toI32Sx8: {
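              /* arg1 is the IR rounding mode, arg2 the V256 operand: set
                 the requested SSE rounding mode, convert each 128-bit half,
                 then restore the default mode. */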
4324          HReg argHi, argLo;
4325          iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4326          HReg dstHi = newVRegV(env);
4327          HReg dstLo = newVRegV(env);
4328          AMD64SseOp mop
4329             = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4330          set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4331          addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4332          addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4333          set_SSE_rounding_default(env);
4334          *rHi = dstHi;
4335          *rLo = dstLo;
4336          return;
4337       }
4338 
4339       default:
4340          break;
4341    } /* switch (e->Iex.Binop.op) */
4342    } /* if (e->tag == Iex_Binop) */
4343 
4344    if (e->tag == Iex_Triop) {
4345    IRTriop *triop = e->Iex.Triop.details;
4346    switch (triop->op) {
4347 
4348       case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4349       case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4350       case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4351       case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4352       do_64Fx4_w_rm:
4353       {
4354          HReg argLhi, argLlo, argRhi, argRlo;
4355          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4356          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4357          HReg dstHi = newVRegV(env);
4358          HReg dstLo = newVRegV(env);
4359          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4360          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4361          /* XXXROUNDINGFIXME */
4362          /* set roundingmode here */
4363          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4364          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4365          *rHi = dstHi;
4366          *rLo = dstLo;
4367          return;
4368       }
4369 
4370       case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4371       case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4372       case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4373       case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4374       do_32Fx8_w_rm:
4375       {
4376          HReg argLhi, argLlo, argRhi, argRlo;
4377          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4378          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4379          HReg dstHi = newVRegV(env);
4380          HReg dstLo = newVRegV(env);
4381          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4382          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4383          /* XXXROUNDINGFIXME */
4384          /* set roundingmode here */
4385          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4386          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4387          *rHi = dstHi;
4388          *rLo = dstLo;
4389          return;
4390       }
4391 
4392       default:
4393          break;
4394    } /* switch (triop->op) */
4395    } /* if (e->tag == Iex_Triop) */
4396 
4397 
4398    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4399       const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4400       const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4401       const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4402       const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4403       // If the args are trivially the same (tmp or const), use the same
4404       // source register for all four, and only one movq since those are
4405       // (relatively) expensive.
4406       if (areAtomsAndEqual(arg1, arg2)
4407           && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
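              /* All four lanes hold the same 64-bit value, so build q:q in
                 a single XMM register and return it as both halves. */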
4408          HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4409          HReg tmp = newVRegV(env);
4410          HReg dst = newVRegV(env);
4411          addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4412          addInstr(env, mk_vMOVsd_RR(dst, tmp));
4413          addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4414          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4415          *rHi = dst;
4416          *rLo = dst;
4417       } else {
4418          /* arg1 is the most significant (Q3), arg4 the least (Q0) */
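              /* Each half is built as: movq the high qword into an XMM
                 register, shift it left by 64, then OR in the low qword,
                 giving dstHi = q3:q2 and dstLo = q1:q0. */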
4419          HReg q3 = iselIntExpr_R(env, arg1);
4420          HReg q2 = iselIntExpr_R(env, arg2);
4421          HReg q1 = iselIntExpr_R(env, arg3);
4422          HReg q0 = iselIntExpr_R(env, arg4);
4423          HReg tmp = newVRegV(env);
4424          HReg dstHi = newVRegV(env);
4425          HReg dstLo = newVRegV(env);
4426          addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4427          addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4428          addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4429          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4430          addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4431          addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4432          addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4433          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4434          *rHi = dstHi;
4435          *rLo = dstLo;
4436       }
4437       return;
4438    }
4439 
4440    if (e->tag == Iex_ITE) {
4441       HReg r1Hi, r1Lo, r0Hi, r0Lo;
4442       iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4443       iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4444       HReg dstHi = newVRegV(env);
4445       HReg dstLo = newVRegV(env);
4446       addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4447       addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4448       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
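           /* cc ^ 1 is the complementary condition, so the iffalse halves
              are moved in only when the guard is false. */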
4449       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4450       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4451       *rHi = dstHi;
4452       *rLo = dstLo;
4453       return;
4454    }
4455 
4456    //avx_fail:
4457    vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4458               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4459    ppIRExpr(e);
4460    vpanic("iselDVecExpr_wrk");
4461 }
4462 
4463 
4464 /*---------------------------------------------------------*/
4465 /*--- ISEL: Statements                                  ---*/
4466 /*---------------------------------------------------------*/
4467 
4468 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4469 {
4470    if (vex_traceflags & VEX_TRACE_VCODE) {
4471       vex_printf("\n-- ");
4472       ppIRStmt(stmt);
4473       vex_printf("\n");
4474    }
4475 
4476    switch (stmt->tag) {
4477 
4478    /* --------- LOADG (guarded load) --------- */
4479    case Ist_LoadG: {
4480       IRLoadG* lg = stmt->Ist.LoadG.details;
4481       if (lg->end != Iend_LE)
4482          goto stmt_fail;
4483 
4484       UChar szB = 0; /* invalid */
4485       switch (lg->cvt) {
4486          case ILGop_Ident32:   szB = 4;  break;
4487          case ILGop_Ident64:   szB = 8;  break;
4488          case ILGop_IdentV128: szB = 16; break;
4489          default: break;
4490       }
4491       if (szB == 0)
4492          goto stmt_fail;
4493 
4494       AMD64AMode* amAddr
4495          = iselIntExpr_AMode(env, lg->addr);
4496       HReg rAlt
4497          = szB == 16 ? iselVecExpr(env, lg->alt)
4498                      : iselIntExpr_R(env, lg->alt);
4499       HReg rDst
4500          = lookupIRTemp(env, lg->dst);
4501 
4502       /* Get the alt value into the dst.  We'll do a conditional load
4503          which overwrites it -- or not -- with loaded data. */
4504       if (szB == 16) {
4505          addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4506       } else {
4507          addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4508       }
4509       AMD64CondCode cc = iselCondCode(env, lg->guard);
4510       if (szB == 16) {
4511          addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4512       } else {
4513          addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4514       }
4515       return;
4516    }
4517 
4518    /* --------- STOREG (guarded store) --------- */
4519    case Ist_StoreG: {
4520       IRStoreG* sg = stmt->Ist.StoreG.details;
4521       if (sg->end != Iend_LE)
4522          goto stmt_fail;
4523 
4524       UChar szB = 0; /* invalid */
4525       switch (typeOfIRExpr(env->type_env, sg->data)) {
4526          case Ity_I32:  szB = 4; break;
4527          case Ity_I64:  szB = 8; break;
4528          case Ity_V128: szB = 16; break;
4529          default: break;
4530       }
4531       if (szB == 0)
4532          goto stmt_fail;
4533 
4534       AMD64AMode* amAddr
4535          = iselIntExpr_AMode(env, sg->addr);
4536       HReg rSrc
4537          = szB == 16 ? iselVecExpr(env, sg->data)
4538                      : iselIntExpr_R(env, sg->data);
4539       AMD64CondCode cc
4540          = iselCondCode(env, sg->guard);
4541       if (szB == 16) {
4542          addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4543       } else {
4544          addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4545       }
4546       return;
4547    }
4548 
4549    /* --------- STORE --------- */
4550    case Ist_Store: {
4551       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4552       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4553       IREndness end   = stmt->Ist.Store.end;
4554 
4555       if (tya != Ity_I64 || end != Iend_LE)
4556          goto stmt_fail;
4557 
4558       if (tyd == Ity_I64) {
4559          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4560          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4561          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4562          return;
4563       }
4564       if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4565          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4566          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4567          addInstr(env, AMD64Instr_Store(
4568                           toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4569                           r,am));
4570          return;
4571       }
4572       if (tyd == Ity_F64) {
4573          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4574          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4575          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4576          return;
4577       }
4578       if (tyd == Ity_F32) {
4579          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4580          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4581          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4582          return;
4583       }
4584       if (tyd == Ity_V128) {
4585          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4586          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4587          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4588          return;
4589       }
4590       if (tyd == Ity_V256) {
4591          HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
4592          AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
4593          AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4594          HReg vHi, vLo;
4595          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4596          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4597          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4598          return;
4599       }
4600       break;
4601    }
4602 
4603    /* --------- PUT --------- */
4604    case Ist_Put: {
4605       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4606       if (ty == Ity_I64) {
4607          /* We're going to write to memory, so compute the RHS into an
4608             AMD64RI. */
4609          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4610          addInstr(env,
4611                   AMD64Instr_Alu64M(
4612                      Aalu_MOV,
4613                      ri,
4614                      AMD64AMode_IR(stmt->Ist.Put.offset,
4615                                    hregAMD64_RBP())
4616                  ));
4617          return;
4618       }
4619       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4620          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4621          addInstr(env, AMD64Instr_Store(
4622                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4623                           r,
4624                           AMD64AMode_IR(stmt->Ist.Put.offset,
4625                                         hregAMD64_RBP())));
4626          return;
4627       }
4628       if (ty == Ity_F32) {
4629          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4630          AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4631          set_SSE_rounding_default(env); /* paranoia */
4632          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4633          return;
4634       }
4635       if (ty == Ity_F64) {
4636          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4637          AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4638                                          hregAMD64_RBP() );
4639          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4640          return;
4641       }
4642       if (ty == Ity_V128) {
4643          HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
4644          AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
4645                                          hregAMD64_RBP());
4646          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4647          return;
4648       }
4649       if (ty == Ity_V256) {
4650          HReg vHi, vLo;
4651          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4652          HReg        rbp  = hregAMD64_RBP();
4653          AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
4654          AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4655          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4656          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4657          return;
4658       }
4659       break;
4660    }
4661 
4662    /* --------- Indexed PUT --------- */
4663    case Ist_PutI: {
4664       IRPutI *puti = stmt->Ist.PutI.details;
4665 
4666       AMD64AMode* am
4667          = genGuestArrayOffset(
4668               env, puti->descr,
4669                    puti->ix, puti->bias );
4670 
4671       IRType ty = typeOfIRExpr(env->type_env, puti->data);
4672       if (ty == Ity_F64) {
4673          HReg val = iselDblExpr(env, puti->data);
4674          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4675          return;
4676       }
4677       if (ty == Ity_I8) {
4678          HReg r = iselIntExpr_R(env, puti->data);
4679          addInstr(env, AMD64Instr_Store( 1, r, am ));
4680          return;
4681       }
4682       if (ty == Ity_I64) {
4683          AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4684          addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4685          return;
4686       }
4687       break;
4688    }
4689 
4690    /* --------- TMP --------- */
4691    case Ist_WrTmp: {
4692       IRTemp tmp = stmt->Ist.WrTmp.tmp;
4693       IRType ty = typeOfIRTemp(env->type_env, tmp);
4694 
4695       /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4696          compute it into an AMode and then use LEA.  This usually
4697          produces fewer instructions, often because (for memcheck
4698          created IR) we get t = address-expression, (t is later used
4699          twice) and so doing this naturally turns address-expression
4700          back into an AMD64 amode. */
4701       if (ty == Ity_I64
4702           && stmt->Ist.WrTmp.data->tag == Iex_Binop
4703           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4704          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4705          HReg dst = lookupIRTemp(env, tmp);
4706          if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4707             /* Hmm, iselIntExpr_AMode wimped out and just computed the
4708                value into a register.  Just emit a normal reg-reg move
4709                so reg-alloc can coalesce it away in the usual way. */
4710             HReg src = am->Aam.IR.reg;
4711             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4712          } else {
4713             addInstr(env, AMD64Instr_Lea64(am,dst));
4714          }
4715          return;
4716       }
4717 
4718       if (ty == Ity_I64 || ty == Ity_I32
4719           || ty == Ity_I16 || ty == Ity_I8) {
4720          AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4721          HReg dst = lookupIRTemp(env, tmp);
4722          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4723          return;
4724       }
4725       if (ty == Ity_I128) {
4726          HReg rHi, rLo, dstHi, dstLo;
4727          iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4728          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4729          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4730          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4731          return;
4732       }
4733       if (ty == Ity_I1) {
4734          AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4735          HReg dst = lookupIRTemp(env, tmp);
4736          addInstr(env, AMD64Instr_Set64(cond, dst));
4737          return;
4738       }
4739       if (ty == Ity_F64) {
4740          HReg dst = lookupIRTemp(env, tmp);
4741          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4742          addInstr(env, mk_vMOVsd_RR(src, dst));
4743          return;
4744       }
4745       if (ty == Ity_F32) {
4746          HReg dst = lookupIRTemp(env, tmp);
4747          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4748          addInstr(env, mk_vMOVsd_RR(src, dst));
4749          return;
4750       }
4751       if (ty == Ity_V128) {
4752          HReg dst = lookupIRTemp(env, tmp);
4753          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4754          addInstr(env, mk_vMOVsd_RR(src, dst));
4755          return;
4756       }
4757       if (ty == Ity_V256) {
4758          HReg rHi, rLo, dstHi, dstLo;
4759          iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4760          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4761          addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4762          addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4763          return;
4764       }
4765       break;
4766    }
4767 
4768    /* --------- Call to DIRTY helper --------- */
4769    case Ist_Dirty: {
4770       IRDirty* d = stmt->Ist.Dirty.details;
4771 
4772       /* Figure out the return type, if any. */
4773       IRType retty = Ity_INVALID;
4774       if (d->tmp != IRTemp_INVALID)
4775          retty = typeOfIRTemp(env->type_env, d->tmp);
4776 
4777       /* Throw out any return types we don't know about. */
4778       Bool retty_ok = False;
4779       switch (retty) {
4780          case Ity_INVALID: /* function doesn't return anything */
4781          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4782          case Ity_V128: case Ity_V256:
4783             retty_ok = True; break;
4784          default:
4785             break;
4786       }
4787       if (!retty_ok)
4788          break; /* will go to stmt_fail: */
4789 
4790       /* Marshal args, do the call, and set the return value to
4791          0x555..555 if this is a conditional call that returns a value
4792          and the call is skipped. */
4793       UInt   addToSp = 0;
4794       RetLoc rloc    = mk_RetLoc_INVALID();
4795       doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4796       vassert(is_sane_RetLoc(rloc));
4797 
4798       /* Now figure out what to do with the returned value, if any. */
4799       switch (retty) {
4800          case Ity_INVALID: {
4801             /* No return value.  Nothing to do. */
4802             vassert(d->tmp == IRTemp_INVALID);
4803             vassert(rloc.pri == RLPri_None);
4804             vassert(addToSp == 0);
4805             return;
4806          }
4807          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4808             /* The returned value is in %rax.  Park it in the register
4809                associated with tmp. */
4810             vassert(rloc.pri == RLPri_Int);
4811             vassert(addToSp == 0);
4812             HReg dst = lookupIRTemp(env, d->tmp);
4813             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4814             return;
4815          }
4816          case Ity_V128: {
4817             /* The returned value is on the stack, and rloc.spOff
4818                tells us where.  Fish it off the stack and then move
4819                the stack pointer upwards to clear it, as directed by
4820                doHelperCall. */
4821             vassert(rloc.pri == RLPri_V128SpRel);
4822             vassert(addToSp >= 16);
4823             HReg        dst = lookupIRTemp(env, d->tmp);
4824             AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4825             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4826             add_to_rsp(env, addToSp);
4827             return;
4828          }
4829          case Ity_V256: {
4830             /* See comments for Ity_V128. */
4831             vassert(rloc.pri == RLPri_V256SpRel);
4832             vassert(addToSp >= 32);
4833             HReg        dstLo, dstHi;
4834             lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4835             AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4836             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4837             AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4838             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4839             add_to_rsp(env, addToSp);
4840             return;
4841          }
4842          default:
4843             /*NOTREACHED*/
4844             vassert(0);
4845       }
4846       break;
4847    }
4848 
4849    /* --------- MEM FENCE --------- */
4850    case Ist_MBE:
4851       switch (stmt->Ist.MBE.event) {
4852          case Imbe_Fence:
4853             addInstr(env, AMD64Instr_MFence());
4854             return;
4855          default:
4856             break;
4857       }
4858       break;
4859 
4860    /* --------- ACAS --------- */
4861    case Ist_CAS:
4862       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4863          /* "normal" singleton CAS */
4864          UChar  sz;
4865          IRCAS* cas = stmt->Ist.CAS.details;
4866          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4867          /* get: cas->expd into %rax, and cas->data into %rbx */
4868          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4869          HReg rData = iselIntExpr_R(env, cas->dataLo);
4870          HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4871          HReg rOld  = lookupIRTemp(env, cas->oldLo);
4872          vassert(cas->expdHi == NULL);
4873          vassert(cas->dataHi == NULL);
4874          addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4875          addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4876          addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4877          switch (ty) {
4878             case Ity_I64: sz = 8; break;
4879             case Ity_I32: sz = 4; break;
4880             case Ity_I16: sz = 2; break;
4881             case Ity_I8:  sz = 1; break;
4882             default: goto unhandled_cas;
4883          }
4884          addInstr(env, AMD64Instr_ACAS(am, sz));
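              /* cmpxchg sets ZF on success.  On failure it leaves the value
                 observed in memory in %rax, so copy that into rOld; on
                 success rOld already holds the expected (== old) value. */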
4885          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4886          return;
4887       } else {
4888          /* double CAS */
4889          UChar  sz;
4890          IRCAS* cas = stmt->Ist.CAS.details;
4891          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4892          /* only 32-bit and 64-bit allowed in this case */
4893          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4894          /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4895          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4896          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4897          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4898          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4899          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4900          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4901          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4902          switch (ty) {
4903             case Ity_I64:
4904                if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4905                   goto unhandled_cas; /* we'd have to generate
4906                                          cmpxchg16b, but the host
4907                                          doesn't support that */
4908                sz = 8;
4909                break;
4910             case Ity_I32:
4911                sz = 4;
4912                break;
4913             default:
4914                goto unhandled_cas;
4915          }
4916          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4917          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4918          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4919          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4920          addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4921          addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4922          addInstr(env, AMD64Instr_DACAS(am, sz));
4923          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
4924          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
4925          return;
4926       }
4927       unhandled_cas:
4928       break;
4929 
4930    /* --------- INSTR MARK --------- */
4931    /* Doesn't generate any executable code ... */
4932    case Ist_IMark:
4933        return;
4934 
4935    /* --------- ABI HINT --------- */
4936    /* These have no meaning (denotation in the IR) and so we ignore
4937       them ... if any actually made it this far. */
4938    case Ist_AbiHint:
4939        return;
4940 
4941    /* --------- NO-OP --------- */
4942    case Ist_NoOp:
4943        return;
4944 
4945    /* --------- EXIT --------- */
4946    case Ist_Exit: {
4947       if (stmt->Ist.Exit.dst->tag != Ico_U64)
4948          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
4949 
4950       AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4951       AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
4952                                           hregAMD64_RBP());
4953 
4954       /* Case: boring transfer to known address */
4955       if (stmt->Ist.Exit.jk == Ijk_Boring) {
4956          if (env->chainingAllowed) {
4957             /* .. almost always true .. */
4958             /* Skip the event check at the dst if this is a forwards
4959                edge. */
4960             Bool toFastEP
4961                = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
4962             if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4963             addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
4964                                              amRIP, cc, toFastEP));
4965          } else {
4966             /* .. very occasionally .. */
4967             /* We can't use chaining, so ask for an assisted transfer,
4968                as that's the only alternative that is allowable. */
4969             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4970             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
4971          }
4972          return;
4973       }
4974 
4975       /* Case: assisted transfer to arbitrary address */
4976       switch (stmt->Ist.Exit.jk) {
4977          /* Keep this list in sync with that in iselNext below */
4978          case Ijk_ClientReq:
4979          case Ijk_EmWarn:
4980          case Ijk_NoDecode:
4981          case Ijk_NoRedir:
4982          case Ijk_SigSEGV:
4983          case Ijk_SigTRAP:
4984          case Ijk_Sys_syscall:
4985          case Ijk_Sys_int210:
4986          case Ijk_InvalICache:
4987          case Ijk_Yield:
4988          {
4989             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4990             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
4991             return;
4992          }
4993          default:
4994             break;
4995       }
4996 
4997       /* Do we ever expect to see any other kind? */
4998       goto stmt_fail;
4999    }
5000 
5001    default: break;
5002    }
5003   stmt_fail:
5004    ppIRStmt(stmt);
5005    vpanic("iselStmt(amd64)");
5006 }
5007 
5008 
5009 /*---------------------------------------------------------*/
5010 /*--- ISEL: Basic block terminators (Nexts)             ---*/
5011 /*---------------------------------------------------------*/
5012 
5013 static void iselNext ( ISelEnv* env,
5014                        IRExpr* next, IRJumpKind jk, Int offsIP )
5015 {
5016    if (vex_traceflags & VEX_TRACE_VCODE) {
5017       vex_printf( "\n-- PUT(%d) = ", offsIP);
5018       ppIRExpr( next );
5019       vex_printf( "; exit-");
5020       ppIRJumpKind(jk);
5021       vex_printf( "\n");
5022    }
5023 
5024    /* Case: boring transfer to known address */
5025    if (next->tag == Iex_Const) {
5026       IRConst* cdst = next->Iex.Const.con;
5027       vassert(cdst->tag == Ico_U64);
5028       if (jk == Ijk_Boring || jk == Ijk_Call) {
5029          /* Boring transfer to known address */
5030          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5031          if (env->chainingAllowed) {
5032             /* .. almost always true .. */
5033             /* Skip the event check at the dst if this is a forwards
5034                edge. */
5035             Bool toFastEP
5036                = ((Addr64)cdst->Ico.U64) > env->max_ga;
5037             if (0) vex_printf("%s", toFastEP ? "X" : ".");
5038             addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
5039                                              amRIP, Acc_ALWAYS,
5040                                              toFastEP));
5041          } else {
5042             /* .. very occasionally .. */
5043             /* We can't use chaining, so ask for an assisted transfer,
5044                as that's the only alternative that is
5045                allowable. */
5046             HReg r = iselIntExpr_R(env, next);
5047             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5048                                                Ijk_Boring));
5049          }
5050          return;
5051       }
5052    }
5053 
5054    /* Case: call/return (==boring) transfer to any address */
5055    switch (jk) {
5056       case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
5057          HReg        r     = iselIntExpr_R(env, next);
5058          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5059          if (env->chainingAllowed) {
5060             addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
5061          } else {
5062             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5063                                                Ijk_Boring));
5064          }
5065          return;
5066       }
5067       default:
5068          break;
5069    }
5070 
5071    /* Case: assisted transfer to arbitrary address */
5072    switch (jk) {
5073       /* Keep this list in sync with that for Ist_Exit above */
5074       case Ijk_ClientReq:
5075       case Ijk_EmWarn:
5076       case Ijk_NoDecode:
5077       case Ijk_NoRedir:
5078       case Ijk_SigSEGV:
5079       case Ijk_SigTRAP:
5080       case Ijk_Sys_syscall:
5081       case Ijk_Sys_int210:
5082       case Ijk_InvalICache:
5083       case Ijk_Yield: {
5084          HReg        r     = iselIntExpr_R(env, next);
5085          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5086          addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
5087          return;
5088       }
5089       default:
5090          break;
5091    }
5092 
5093    vex_printf( "\n-- PUT(%d) = ", offsIP);
5094    ppIRExpr( next );
5095    vex_printf( "; exit-");
5096    ppIRJumpKind(jk);
5097    vex_printf( "\n");
5098    vassert(0); // are we expecting any other kind?
5099 }
5100 
5101 
5102 /*---------------------------------------------------------*/
5103 /*--- Insn selector top-level                           ---*/
5104 /*---------------------------------------------------------*/
5105 
5106 /* Translate an entire SB to amd64 code. */
5107 
5108 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
5109                             VexArch      arch_host,
5110                             const VexArchInfo* archinfo_host,
5111                             const VexAbiInfo*  vbi/*UNUSED*/,
5112                             Int offs_Host_EvC_Counter,
5113                             Int offs_Host_EvC_FailAddr,
5114                             Bool chainingAllowed,
5115                             Bool addProfInc,
5116                             Addr max_ga )
5117 {
5118    Int        i, j;
5119    HReg       hreg, hregHI;
5120    ISelEnv*   env;
5121    UInt       hwcaps_host = archinfo_host->hwcaps;
5122    AMD64AMode *amCounter, *amFailAddr;
5123 
5124    /* sanity ... */
5125    vassert(arch_host == VexArchAMD64);
5126    vassert(0 == (hwcaps_host
5127                  & ~(VEX_HWCAPS_AMD64_SSE3
5128                      | VEX_HWCAPS_AMD64_SSSE3
5129                      | VEX_HWCAPS_AMD64_CX16
5130                      | VEX_HWCAPS_AMD64_LZCNT
5131                      | VEX_HWCAPS_AMD64_AVX
5132                      | VEX_HWCAPS_AMD64_RDTSCP
5133                      | VEX_HWCAPS_AMD64_BMI
5134                      | VEX_HWCAPS_AMD64_AVX2)));
5135 
5136    /* Check that the host's endianness is as expected. */
5137    vassert(archinfo_host->endness == VexEndnessLE);
5138 
5139    /* Make up an initial environment to use. */
5140    env = LibVEX_Alloc_inline(sizeof(ISelEnv));
5141    env->vreg_ctr = 0;
5142 
5143    /* Set up output code array. */
5144    env->code = newHInstrArray();
5145 
5146    /* Copy BB's type env. */
5147    env->type_env = bb->tyenv;
5148 
5149    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
5150       change as we go along. */
5151    env->n_vregmap = bb->tyenv->types_used;
5152    env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5153    env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
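        /* vregmapHI is used only for IRTemps that need a second register:
           the high 64 bits of an Ity_I128 and the high 128 bits of an
           Ity_V256. */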
5154 
5155    /* and finally ... */
5156    env->chainingAllowed = chainingAllowed;
5157    env->hwcaps          = hwcaps_host;
5158    env->max_ga          = max_ga;
5159 
5160    /* For each IR temporary, allocate a suitably-kinded virtual
5161       register. */
5162    j = 0;
5163    for (i = 0; i < env->n_vregmap; i++) {
5164       hregHI = hreg = INVALID_HREG;
5165       switch (bb->tyenv->types[i]) {
5166          case Ity_I1:
5167          case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
5168             hreg = mkHReg(True, HRcInt64, 0, j++);
5169             break;
5170          case Ity_I128:
5171             hreg   = mkHReg(True, HRcInt64, 0, j++);
5172             hregHI = mkHReg(True, HRcInt64, 0, j++);
5173             break;
5174          case Ity_F32:
5175          case Ity_F64:
5176          case Ity_V128:
5177             hreg = mkHReg(True, HRcVec128, 0, j++);
5178             break;
5179          case Ity_V256:
5180             hreg   = mkHReg(True, HRcVec128, 0, j++);
5181             hregHI = mkHReg(True, HRcVec128, 0, j++);
5182             break;
5183          default:
5184             ppIRType(bb->tyenv->types[i]);
5185             vpanic("iselBB(amd64): IRTemp type");
5186       }
5187       env->vregmap[i]   = hreg;
5188       env->vregmapHI[i] = hregHI;
5189    }
5190    env->vreg_ctr = j;
5191 
5192    /* The very first instruction must be an event check. */
5193    amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
5194    amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
5195    addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
5196 
5197    /* Possibly a block counter increment (for profiling).  At this
5198       point we don't know the address of the counter, so just pretend
5199       it is zero.  It will have to be patched later, but before this
5200       translation is used, by a call to LibVEX_PatchProfInc. */
5201    if (addProfInc) {
5202       addInstr(env, AMD64Instr_ProfInc());
5203    }
5204 
5205    /* Ok, finally we can iterate over the statements. */
5206    for (i = 0; i < bb->stmts_used; i++)
5207       if (bb->stmts[i])
5208          iselStmt(env, bb->stmts[i]);
5209 
5210    iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5211 
5212    /* record the number of vregs we used. */
5213    env->code->n_vregs = env->vreg_ctr;
5214    return env->code;
5215 }
5216 
5217 
5218 /*---------------------------------------------------------------*/
5219 /*--- end                                   host_amd64_isel.c ---*/
5220 /*---------------------------------------------------------------*/
5221