
/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_generic_simd256.h"
#include "host_generic_maddf.h"
#include "host_amd64_defs.h"


/*---------------------------------------------------------*/
/*--- x87/SSE control word stuff                         ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to an FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged at exit.
*/

#define DEFAULT_FPUCW 0x027F

#define DEFAULT_MXCSR 0x1F80
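
/* For reference, the layouts of these two values:
   - FPUCW 0x027F: bits 5:0 = 0x3F (all six x87 exceptions masked),
     bits 9:8 (PC) = 10b (53-bit precision), bits 11:10 (RC) = 00b
     (round to nearest).
   - MXCSR 0x1F80: bits 12:7 all set (all six SSE exceptions masked),
     bits 14:13 (RC) = 00b (round to nearest), FTZ and DAZ clear.
   The debug-only value 0x037F below differs only in selecting
   64-bit (extended) precision, PC = 11b. */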

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */


/*---------------------------------------------------------*/
/*--- misc helpers                                       ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-amd64/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}


/*---------------------------------------------------------*/
/*--- ISelEnv                                            ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register is associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

     - vregmap   holds the primary register for the IRTemp.
     - vregmapHI is only used for 128-bit integer-typed
       IRTemps.  It holds the identity of a second 64-bit
       virtual HReg, which holds the high half of the value.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - A Bool for indicating whether we may generate chain-me
     instructions for control flow transfers, or whether we must use
     XAssisted.

   - The maximum guest address of any guest insn in this block.
     Actually, the address of the highest-addressed byte from any insn
     in this block.  Is set at the start and does not change.  This is
     used for detecting jumps which are definitely forward-edges from
     this block, and therefore can be made (chained) to the fast entry
     point of the destination, thereby avoiding the destination's
     event check.

   Note, this is all host-independent.  (JRS 20050201: well, kinda
   ... not completely.  Compare with ISelEnv for X86.)
*/

typedef
   struct {
      /* Constant -- set at the start and do not change. */
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      UInt         hwcaps;

      Bool         chainingAllowed;
      Addr64       max_ga;

      /* These are modified as we go along. */
      HInstrArray* code;
      Int          vreg_ctr;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
                               ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, AMD64Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppAMD64Instr(instr, True);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}


/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                         ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk do the real work, but are not to be called directly.
   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
   checks that all returned registers are virtual.
*/
static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );

static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );

static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );

static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );

static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );

static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );
static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );

static AMD64CondCode iselCondCode_wrk   ( ISelEnv* env, const IRExpr* e );
static AMD64CondCode iselCondCode       ( ISelEnv* env, const IRExpr* e );

static HReg          iselDblExpr_wrk    ( ISelEnv* env, const IRExpr* e );
static HReg          iselDblExpr        ( ISelEnv* env, const IRExpr* e );

static HReg          iselFltExpr_wrk    ( ISelEnv* env, const IRExpr* e );
static HReg          iselFltExpr        ( ISelEnv* env, const IRExpr* e );

static HReg          iselVecExpr_wrk    ( ISelEnv* env, const IRExpr* e );
static HReg          iselVecExpr        ( ISelEnv* env, const IRExpr* e );

static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                        ISelEnv* env, const IRExpr* e );
static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                        ISelEnv* env, const IRExpr* e );


/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                 ---*/
/*---------------------------------------------------------*/

static Bool sane_AMode ( AMD64AMode* am )
{
   switch (am->tag) {
      case Aam_IR:
         return
            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
                    && (hregIsVirtual(am->Aam.IR.reg)
                        || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
      case Aam_IRRS:
         return
            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.base)
                    && hregClass(am->Aam.IRRS.index) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.index) );
      default:
         vpanic("sane_AMode: unknown amd64 amode tag");
   }
}


/* Can the lower 32 bits be signedly widened to produce the whole
   64-bit value?  In other words, are the top 33 bits either all 0 or
   all 1 ? */
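/* Examples (illustrative): 0x000000007FFFFFFF and 0xFFFFFFFF80000000
   both fit, since their top 33 bits are all-0 and all-1 respectively;
   0x0000000080000000 does not, since sign-extending its low 32 bits
   would give 0xFFFFFFFF80000000. */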
static Bool fitsIn32Bits ( ULong x )
{
   Long y1;
   y1 = x << 32;
   y1 >>=/*s*/ 32;
   return toBool(x == y1);
}

/* Is this a 64-bit zero expression? */

static Bool isZeroU64 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}

static Bool isZeroU32 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

/* Are both args atoms and the same?  This is a copy of eqIRAtom
   that omits the assertions that the args are indeed atoms. */

static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
{
   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
      return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
      return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
   return False;
}

/* Make an int reg-reg move. */

static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt64);
   vassert(hregClass(dst) == HRcInt64);
   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
}

/* Make a vector (128 bit) reg-reg move. */

static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
}

/* Advance/retreat %rsp by n. */

static void add_to_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
                              hregAMD64_RSP()));
}

static void sub_from_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
                              hregAMD64_RSP()));
}

/* Push a 64-bit constant on the stack. */
static void push_uimm64( ISelEnv* env, ULong uimm64 )
{
   /* If uimm64 can be expressed as the sign extension of its
      lower 32 bits, we can do it the easy way. */
   Long simm64 = (Long)uimm64;
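   /* (pushq sign-extends a 32-bit immediate to 64 bits, so a single
      push suffices exactly when uimm64 equals the sign extension of
      its low 32 bits -- e.g. 0xFFFFFFFF80000000, but not
      0x0000000080000000.) */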
   if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
   } else {
      HReg tmp = newVRegI(env);
      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
   }
}


/* Used only in doHelperCall.  If possible, produce a single
   instruction which computes 'e' into 'dst'.  If not possible, return
   NULL. */

static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
                                                    HReg     dst,
                                                    IRExpr*  e )
{
   /* Per comments in doHelperCall below, appearance of
      Iex_VECRET implies ill-formed IR. */
   vassert(e->tag != Iex_VECRET);

   /* In this case we give out a copy of the BaseBlock pointer. */
   if (UNLIKELY(e->tag == Iex_GSPTR)) {
      return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
   }

   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);

   if (e->tag == Iex_Const) {
      vassert(e->Iex.Const.con->tag == Ico_U64);
      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
         return AMD64Instr_Alu64R(
                   Aalu_MOV,
                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
                   dst
                );
      } else {
         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
      }
   }

   if (e->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      return mk_iMOVsd_RR(src, dst);
   }

   if (e->tag == Iex_Get) {
      vassert(e->Iex.Get.ty == Ity_I64);
      return AMD64Instr_Alu64R(
                Aalu_MOV,
                AMD64RMI_Mem(
                   AMD64AMode_IR(e->Iex.Get.offset,
                                 hregAMD64_RBP())),
                dst);
   }

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_32Uto64
       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
      return AMD64Instr_MovxLQ(False, src, dst);
   }

   if (0) { ppIRExpr(e); vex_printf("\n"); }

   return NULL;
}


/* Do a complete function call.  |guard| is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional.  |retloc| is set to indicate where the
   return value is after the call.  The caller (of this fn) must
   generate code to add |stackAdjustAfterCall| to the stack pointer
   after the call is done. */

static
void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
                    /*OUT*/RetLoc* retloc,
                    ISelEnv*  env,
                    IRExpr*   guard,
                    IRCallee* cee, IRType retTy, IRExpr** args )
{
   AMD64CondCode cc;
   HReg          argregs[6];
   HReg          tmpregs[6];
   AMD64Instr*   fastinstrs[6];
   UInt          n_args, i;

   /* Set default returns.  We'll update them later if needed. */
   *stackAdjustAfterCall = 0;
   *retloc               = mk_RetLoc_INVALID();

   /* These are used for cross-checking that IR-level constraints on
      the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
   UInt nVECRETs = 0;
   UInt nGSPTRs  = 0;

   /* Marshal args for a call and do the call.

      This function only deals with a tiny set of possibilities, which
      cover all helpers in practice.  The restrictions are that only
      arguments in registers are supported, hence only 6x64 integer
      bits in total can be passed.  In fact the only supported arg
      type is I64.

      The return type can be I{64,32,16,8} or V{128,256}.  In the
      latter two cases, it is expected that |args| will contain the
      special node IRExpr_VECRET(), in which case this routine
      generates code to allocate space on the stack for the vector
      return value.  Since we are not passing any scalars on the
      stack, it is enough to preallocate the return space before
      marshalling any arguments, in this case.

      |args| may also contain IRExpr_GSPTR(), in which case the
      value in %rbp is passed as the corresponding argument.

      Generating code which is both efficient and correct when
      parameters are to be passed in registers is difficult, for the
      reasons elaborated in detail in comments attached to
      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
      of the method described in those comments.

      The problem is split into two cases: the fast scheme and the
      slow scheme.  In the fast scheme, arguments are computed
      directly into the target (real) registers.  This is only safe
      when we can be sure that computation of each argument will not
      trash any real registers set by computation of any other
      argument.

      In the slow scheme, all args are first computed into vregs, and
      once they are all done, they are moved to the relevant real
      regs.  This always gives correct code, but it also gives a bunch
      of vreg-to-rreg moves which are usually redundant but are hard
      for the register allocator to get rid of.

      To decide which scheme to use, all argument expressions are
      first examined.  If they are all so simple that it is clear they
      will be evaluated without use of any fixed registers, use the
      fast scheme, else use the slow scheme.  Note also that only
      unconditional calls may use the fast scheme, since having to
      compute a condition expression could itself trash real
      registers.  Note that for simplicity, in the case where
      IRExpr_VECRET() is present, we use the slow scheme.  This is
      motivated by the desire to avoid any possible complexity
      w.r.t. nested calls.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this insn
      selector works.  Currently just the following 3 are regarded as
      safe -- hopefully they cover the majority of arguments in
      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */
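   /* For instance (illustrative): a call h(t1, 0x42, GET:I64(16))
      has all-simple args, each computable into its target register
      by one instruction, so the fast scheme applies; whereas
      h(Add64(t1,t2)) would force the slow scheme. */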

   /* Note that the cee->regparms field is meaningless on AMD64 host
      (since there is only one calling convention) and so we always
      ignore it. */
   n_args = 0;
   for (i = 0; args[i]; i++)
      n_args++;

   if (n_args > 6)
      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");

   argregs[0] = hregAMD64_RDI();
   argregs[1] = hregAMD64_RSI();
   argregs[2] = hregAMD64_RDX();
   argregs[3] = hregAMD64_RCX();
   argregs[4] = hregAMD64_R8();
   argregs[5] = hregAMD64_R9();

   tmpregs[0] = tmpregs[1] = tmpregs[2] =
   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;

   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;

   /* First decide which scheme (slow or fast) is to be used.  First
      assume the fast scheme, and select slow if any contraindications
      (wow) appear. */

   /* We'll need space on the stack for the return value.  Avoid
      possible complications with nested calls by using the slow
      scheme. */
   if (retTy == Ity_V128 || retTy == Ity_V256)
      goto slowscheme;

   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional */
      } else {
         /* Not manifestly unconditional -- be conservative. */
         goto slowscheme;
      }
   }

   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
      use the slow scheme.  Because this is tentative, we can't call
      addInstr (that is, commit to) any instructions until we've
      handled all the arguments.  So park the resulting instructions
      in a buffer and emit that if we're successful. */

   /* FAST SCHEME */
   /* In this loop, we process args that can be computed into the
      destination (real) register with a single instruction, without
      using any fixed regs.  That also includes IRExpr_GSPTR(), but
      not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
      never see IRExpr_VECRET() at this point, since the return-type
      check above should ensure all those cases use the slow scheme
      instead. */
   vassert(n_args >= 0 && n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
      }
      fastinstrs[i]
         = iselIntExpr_single_instruction( env, argregs[i], args[i] );
      if (fastinstrs[i] == NULL)
         goto slowscheme;
   }

   /* Looks like we're in luck.  Emit the accumulated instructions and
      move on to doing the call itself. */
   for (i = 0; i < n_args; i++)
      addInstr(env, fastinstrs[i]);

   /* Fast scheme only applies for unconditional calls.  Hence: */
   cc = Acc_ALWAYS;

   goto handle_call;


   /* SLOW SCHEME; move via temporaries */
  slowscheme:
   {}
#  if 0 /* debug only */
   if (n_args > 0) {for (i = 0; args[i]; i++) {
   ppIRExpr(args[i]); vex_printf(" "); }
   vex_printf("\n");}
#  endif

   /* If we have a vector return type, allocate a place for it on the
      stack and record its address. */
   HReg r_vecRetAddr = INVALID_HREG;
   if (retTy == Ity_V128) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 16);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }
   else if (retTy == Ity_V256) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 32);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }

   vassert(n_args >= 0 && n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (UNLIKELY(arg->tag == Iex_GSPTR)) {
         tmpregs[i] = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
         nGSPTRs++;
      }
      else if (UNLIKELY(arg->tag == Iex_VECRET)) {
         /* We stashed the address of the return slot earlier, so just
            retrieve it now. */
         vassert(!hregIsInvalid(r_vecRetAddr));
         tmpregs[i] = r_vecRetAddr;
         nVECRETs++;
      }
      else {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
         tmpregs[i] = iselIntExpr_R(env, args[i]);
      }
   }

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Acc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode( env, guard );
      }
   }

   /* Move the args to their final destinations. */
   for (i = 0; i < n_args; i++) {
      /* None of these insns, including any spill code that might
         be generated, may alter the condition codes. */
      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
   }


   /* Do final checks, set the return values, and generate the call
      instruction proper. */
  handle_call:

   if (retTy == Ity_V128 || retTy == Ity_V256) {
      vassert(nVECRETs == 1);
   } else {
      vassert(nVECRETs == 0);
   }

   vassert(nGSPTRs == 0 || nGSPTRs == 1);

   vassert(*stackAdjustAfterCall == 0);
   vassert(is_RetLoc_INVALID(*retloc));
   switch (retTy) {
      case Ity_INVALID:
         /* Function doesn't return a value. */
         *retloc = mk_RetLoc_simple(RLPri_None);
         break;
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
         *retloc = mk_RetLoc_simple(RLPri_Int);
         break;
      case Ity_V128:
         *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
         *stackAdjustAfterCall = 16;
         break;
      case Ity_V256:
         *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
         *stackAdjustAfterCall = 32;
         break;
      default:
         /* IR can denote other possible return types, but we don't
            handle those here. */
         vassert(0);
   }

   /* Finally, generate the call itself.  This needs the *retloc value
      set in the switch above, which is why it's at the end. */
   addInstr(env,
            AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
}


/* Given a guest-state array descriptor, an index expression and a
   bias, generate an AMD64AMode holding the relevant guest state
   offset. */

static
AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                  IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;

   /* Throw out any cases not generated by an amd64 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-amd64-guest on amd64 host. */

   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
      vpanic("genGuestArrayOffset(amd64 host)");

   /* Compute off into a reg, %off.  Then return:

        movq %off, %tmp
        addq $bias, %tmp  (if bias != 0)
        andq $7, %tmp
        ... base(%rbp, %tmp, shift) ...
   */
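   /* (The cases allowed through above correspond to the guest's x87
      register and tag-word arrays -- 8 elements of 8 bytes or 1 byte
      each -- hence the mask-with-7 and the scale of 8 (shift 3) or
      1 (shift 0) below.) */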
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      /* Make sure the bias is sane, in the sense that there are
         no significant bits above bit 30 in it. */
      vassert(-10000 < bias && bias < 10000);
      addInstr(env,
               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
   }
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
   vassert(elemSz == 1 || elemSz == 8);
   return
      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
                       elemSz==8 ? 3 : 0);
}


/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
static
void set_SSE_rounding_default ( ISelEnv* env )
{
   /* pushq $DEFAULT_MXCSR
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}

/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* movq  $DEFAULT_FPUCW, -8(%rsp)
      fldcw -8(%rsp)
   */
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64M(
                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}


/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
   have the same rounding.
*/
static
void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
      both rounding bits == 0.  If that wasn't the case, we couldn't
      create a new rounding field simply by ORing the new value into
      place. */

   /* movq $3, %reg
      andq [[mode]], %reg  -- shouldn't be needed; paranoia
      shlq $13, %reg
      orq  $DEFAULT_MXCSR, %reg
      pushq %reg
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   HReg        reg      = newVRegI(env);
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                   iselIntExpr_RMI(env, mode), reg));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
   addInstr(env, AMD64Instr_Alu64R(
                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}


/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());

   /* movq  %rrm, %rrm2
      andq  $3, %rrm2   -- shouldn't be needed; paranoia
      shlq  $10, %rrm2
      orq   $DEFAULT_FPUCW, %rrm2
      movq  %rrm2, -8(%rsp)
      fldcw -8(%rsp)
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
                                   AMD64RI_Reg(rrm2), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}


/* Generate all-zeroes into a new vector register.
*/
static HReg generate_zeroes_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
   return dst;
}

/* Generate all-ones into a new vector register.
*/
static HReg generate_ones_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
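   /* CMPEQ32 of a register with itself sets all 128 bits, since every
      lane compares equal to itself; the register's previous contents
      are irrelevant. */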
   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
   return dst;
}


/* Generate !src into a new vector register.  Amazing that there isn't
   a less crappy way to do this.
*/
static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
{
   HReg dst = generate_ones_V128(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
   return dst;
}


/* Expand the given byte into a 64-bit word, by cloning each bit
   8 times. */
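/* Example: w8 == 0xA5 (binary 10100101) gives 0xFF00FF0000FF00FF. */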
static ULong bitmask8_to_bytemask64 ( UShort w8 )
{
   vassert(w8 == (w8 & 0xFF));
   ULong w64 = 0;
   Int i;
   for (i = 0; i < 8; i++) {
      if (w8 & (1<<i))
         w64 |= (0xFFULL << (8 * i));
   }
   return w64;
}
891
892
893 /*---------------------------------------------------------*/
894 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
895 /*---------------------------------------------------------*/
896
897 /* Select insns for an integer-typed expression, and add them to the
898 code list. Return a reg holding the result. This reg will be a
899 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
900 want to modify it, ask for a new vreg, copy it in there, and modify
901 the copy. The register allocator will do its best to map both
902 vregs to the same real register, so the copies will often disappear
903 later in the game.
904
905 This should handle expressions of 64, 32, 16 and 8-bit type. All
906 results are returned in a 64-bit register. For 32-, 16- and 8-bit
907 expressions, the upper 32/48/56 bits are arbitrary, so you should
908 mask or sign extend partial values if necessary.
909 */
910
iselIntExpr_R(ISelEnv * env,const IRExpr * e)911 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
912 {
913 HReg r = iselIntExpr_R_wrk(env, e);
914 /* sanity checks ... */
915 # if 0
916 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
917 # endif
918 vassert(hregClass(r) == HRcInt64);
919 vassert(hregIsVirtual(r));
920 return r;
921 }
922
923 /* DO NOT CALL THIS DIRECTLY ! */
iselIntExpr_R_wrk(ISelEnv * env,const IRExpr * e)924 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
925 {
926 MatchInfo mi;
927 DECLARE_PATTERN(p_1Uto8_64to1);
928 DECLARE_PATTERN(p_LDle8_then_8Uto64);
929 DECLARE_PATTERN(p_LDle16_then_16Uto64);
930
931 IRType ty = typeOfIRExpr(env->type_env,e);
932 switch (ty) {
933 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
934 default: vassert(0);
935 }
936
937 switch (e->tag) {
938
939 /* --------- TEMP --------- */
940 case Iex_RdTmp: {
941 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
942 }
943
944 /* --------- LOAD --------- */
945 case Iex_Load: {
946 HReg dst = newVRegI(env);
947 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
948
949 /* We can't handle big-endian loads, nor load-linked. */
950 if (e->Iex.Load.end != Iend_LE)
951 goto irreducible;
952
953 if (ty == Ity_I64) {
954 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
955 AMD64RMI_Mem(amode), dst) );
956 return dst;
957 }
958 if (ty == Ity_I32) {
959 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
960 return dst;
961 }
962 if (ty == Ity_I16) {
963 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
964 return dst;
965 }
966 if (ty == Ity_I8) {
967 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
968 return dst;
969 }
970 break;
971 }
972
973 /* --------- BINARY OP --------- */
974 case Iex_Binop: {
975 AMD64AluOp aluOp;
976 AMD64ShiftOp shOp;
977
978 /* Pattern: Sub64(0,x) */
979 /* and: Sub32(0,x) */
980 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
981 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
982 HReg dst = newVRegI(env);
983 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
984 addInstr(env, mk_iMOVsd_RR(reg,dst));
985 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
986 return dst;
987 }
988
989 /* Is it an addition or logical style op? */
990 switch (e->Iex.Binop.op) {
991 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
992 aluOp = Aalu_ADD; break;
993 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
994 aluOp = Aalu_SUB; break;
995 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
996 aluOp = Aalu_AND; break;
997 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
998 aluOp = Aalu_OR; break;
999 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
1000 aluOp = Aalu_XOR; break;
1001 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1002 aluOp = Aalu_MUL; break;
1003 default:
1004 aluOp = Aalu_INVALID; break;
1005 }
1006 /* For commutative ops we assume any literal
1007 values are on the second operand. */
1008 if (aluOp != Aalu_INVALID) {
1009 HReg dst = newVRegI(env);
1010 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1011 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1012 addInstr(env, mk_iMOVsd_RR(reg,dst));
1013 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1014 return dst;
1015 }
1016
1017 /* Perhaps a shift op? */
1018 switch (e->Iex.Binop.op) {
1019 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1020 shOp = Ash_SHL; break;
1021 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1022 shOp = Ash_SHR; break;
1023 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1024 shOp = Ash_SAR; break;
1025 default:
1026 shOp = Ash_INVALID; break;
1027 }
1028 if (shOp != Ash_INVALID) {
1029 HReg dst = newVRegI(env);
1030
1031 /* regL = the value to be shifted */
1032 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1033 addInstr(env, mk_iMOVsd_RR(regL,dst));
1034
1035 /* Do any necessary widening for 32/16/8 bit operands */
1036 switch (e->Iex.Binop.op) {
1037 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1038 break;
1039 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1040 break;
1041 case Iop_Shr8:
1042 addInstr(env, AMD64Instr_Alu64R(
1043 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1044 break;
1045 case Iop_Shr16:
1046 addInstr(env, AMD64Instr_Alu64R(
1047 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1048 break;
1049 case Iop_Shr32:
1050 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1051 break;
1052 case Iop_Sar8:
1053 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1054 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1055 break;
1056 case Iop_Sar16:
1057 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1058 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1059 break;
1060 case Iop_Sar32:
1061 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1062 break;
1063 default:
1064 ppIROp(e->Iex.Binop.op);
1065 vassert(0);
1066 }
1067
1068 /* Now consider the shift amount. If it's a literal, we
1069 can do a much better job than the general case. */
1070 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1071 /* assert that the IR is well-typed */
1072 Int nshift;
1073 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1074 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1075 vassert(nshift >= 0);
1076 if (nshift > 0)
1077 /* Can't allow nshift==0 since that means %cl */
1078 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1079 } else {
1080 /* General case; we have to force the amount into %cl. */
1081 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1082 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1083 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1084 }
1085 return dst;
1086 }
1087
1088 /* Handle misc other scalar ops. */
1089 if (e->Iex.Binop.op == Iop_Max32U) {
1090 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1091 HReg dst = newVRegI(env);
1092 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1093 addInstr(env, mk_iMOVsd_RR(src1, dst));
1094 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1095 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1096 return dst;
1097 }
1098
1099 if (e->Iex.Binop.op == Iop_DivModS64to32
1100 || e->Iex.Binop.op == Iop_DivModU64to32) {
1101 /* 64 x 32 -> (32(rem),32(div)) division */
1102 /* Get the 64-bit operand into edx:eax, and the other into
1103 any old R/M. */
1104 HReg rax = hregAMD64_RAX();
1105 HReg rdx = hregAMD64_RDX();
1106 HReg dst = newVRegI(env);
1107 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1108 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1109 /* Compute the left operand into a reg, and then
1110 put the top half in edx and the bottom in eax. */
1111 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1112 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1113 addInstr(env, mk_iMOVsd_RR(left64, rax));
1114 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1115 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1116 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1117 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1118 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1119 addInstr(env, mk_iMOVsd_RR(rax, dst));
1120 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1121 return dst;
1122 }
1123
1124 if (e->Iex.Binop.op == Iop_32HLto64) {
1125 HReg hi32 = newVRegI(env);
1126 HReg lo32 = newVRegI(env);
1127 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1128 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1129 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1130 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1131 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1132 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1133 addInstr(env, AMD64Instr_Alu64R(
1134 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1135 return hi32;
1136 }
1137
1138 if (e->Iex.Binop.op == Iop_16HLto32) {
1139 HReg hi16 = newVRegI(env);
1140 HReg lo16 = newVRegI(env);
1141 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1142 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1143 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1144 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1145 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1146 addInstr(env, AMD64Instr_Alu64R(
1147 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1148 addInstr(env, AMD64Instr_Alu64R(
1149 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1150 return hi16;
1151 }
1152
1153 if (e->Iex.Binop.op == Iop_8HLto16) {
1154 HReg hi8 = newVRegI(env);
1155 HReg lo8 = newVRegI(env);
1156 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1157 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1158 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1159 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1160 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1161 addInstr(env, AMD64Instr_Alu64R(
1162 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1163 addInstr(env, AMD64Instr_Alu64R(
1164 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1165 return hi8;
1166 }
1167
1168 if (e->Iex.Binop.op == Iop_MullS32
1169 || e->Iex.Binop.op == Iop_MullS16
1170 || e->Iex.Binop.op == Iop_MullS8
1171 || e->Iex.Binop.op == Iop_MullU32
1172 || e->Iex.Binop.op == Iop_MullU16
1173 || e->Iex.Binop.op == Iop_MullU8) {
1174 HReg a32 = newVRegI(env);
1175 HReg b32 = newVRegI(env);
1176 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1177 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1178 Int shift = 0;
1179 AMD64ShiftOp shr_op = Ash_SHR;
1180 switch (e->Iex.Binop.op) {
1181 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1182 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1183 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1184 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1185 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1186 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1187 default: vassert(0);
1188 }
1189
1190 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1191 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1192 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1193 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1194 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1195 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1196 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1197 return b32;
1198 }
1199
1200 if (e->Iex.Binop.op == Iop_CmpF64) {
1201 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1202 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1203 HReg dst = newVRegI(env);
1204 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1205 /* Mask out irrelevant parts of the result so as to conform
1206 to the CmpF64 definition. */
1207 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1208 return dst;
1209 }
1210
1211 if (e->Iex.Binop.op == Iop_F64toI32S
1212 || e->Iex.Binop.op == Iop_F64toI64S) {
1213 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1214 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1215 HReg dst = newVRegI(env);
1216 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1217 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1218 set_SSE_rounding_default(env);
1219 return dst;
1220 }
1221
1222 /* Deal with 64-bit SIMD binary ops. For the most part these are doable
1223 by using the equivalent 128-bit operation and ignoring the upper half
1224 of the result. */
1225 AMD64SseOp op = Asse_INVALID;
1226 Bool arg1isEReg = False;
1227 Bool preShift32R = False;
1228 switch (e->Iex.Binop.op) {
1229 // The following 3 could be done with 128 bit insns too, but
1230 // first require the inputs to be reformatted.
1231 //case Iop_QNarrowBin32Sto16Sx4:
1232 //op = Asse_PACKSSD; arg1isEReg = True; break;
1233 //case Iop_QNarrowBin16Sto8Sx8:
1234 //op = Asse_PACKSSW; arg1isEReg = True; break;
1235 //case Iop_QNarrowBin16Sto8Ux8:
1236 //op = Asse_PACKUSW; arg1isEReg = True; break;
1237
1238 case Iop_InterleaveHI8x8:
1239 op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1240 break;
1241 case Iop_InterleaveHI16x4:
1242 op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1243 break;
1244 case Iop_InterleaveHI32x2:
1245 op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1246 break;
1247 case Iop_InterleaveLO8x8:
1248 op = Asse_UNPCKLB; arg1isEReg = True;
1249 break;
1250 case Iop_InterleaveLO16x4:
1251 op = Asse_UNPCKLW; arg1isEReg = True;
1252 break;
1253 case Iop_InterleaveLO32x2:
1254 op = Asse_UNPCKLD; arg1isEReg = True;
1255 break;
1256
1257 case Iop_Add8x8: op = Asse_ADD8; break;
1258 case Iop_Add16x4: op = Asse_ADD16; break;
1259 case Iop_Add32x2: op = Asse_ADD32; break;
1260 case Iop_QAdd8Sx8: op = Asse_QADD8S; break;
1261 case Iop_QAdd16Sx4: op = Asse_QADD16S; break;
1262 case Iop_QAdd8Ux8: op = Asse_QADD8U; break;
1263 case Iop_QAdd16Ux4: op = Asse_QADD16U; break;
1264 case Iop_Avg8Ux8: op = Asse_AVG8U; break;
1265 case Iop_Avg16Ux4: op = Asse_AVG16U; break;
1266 case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break;
1267 case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break;
1268 case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break;
1269 case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break;
1270 case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1271 case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1272 case Iop_Max16Sx4: op = Asse_MAX16S; break;
1273 case Iop_Max8Ux8: op = Asse_MAX8U; break;
1274 case Iop_Min16Sx4: op = Asse_MIN16S; break;
1275 case Iop_Min8Ux8: op = Asse_MIN8U; break;
1276 case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1277 case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1278 case Iop_Mul16x4: op = Asse_MUL16; break;
1279 case Iop_Sub8x8: op = Asse_SUB8; break;
1280 case Iop_Sub16x4: op = Asse_SUB16; break;
1281 case Iop_Sub32x2: op = Asse_SUB32; break;
1282 case Iop_QSub8Sx8: op = Asse_QSUB8S; break;
1283 case Iop_QSub16Sx4: op = Asse_QSUB16S; break;
1284 case Iop_QSub8Ux8: op = Asse_QSUB8U; break;
1285 case Iop_QSub16Ux4: op = Asse_QSUB16U; break;
1286 default: break;
1287 }
1288 if (op != Asse_INVALID) {
1289 /* This isn't pretty, but .. move each arg to the low half of an XMM
1290 register, do the operation on the whole register, and move the
1291 result back to an integer register. */
1292 const IRExpr* arg1 = e->Iex.Binop.arg1;
1293 const IRExpr* arg2 = e->Iex.Binop.arg2;
1294 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1295 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1296 HReg iarg1 = iselIntExpr_R(env, arg1);
1297 HReg iarg2 = iselIntExpr_R(env, arg2);
1298 HReg varg1 = newVRegV(env);
1299 HReg varg2 = newVRegV(env);
1300 HReg idst = newVRegI(env);
1301 addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1302 addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1303 if (arg1isEReg) {
1304 if (preShift32R) {
1305 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1306 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1307 }
1308 addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1309 addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1310 } else {
1311 vassert(!preShift32R);
1312 addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1313 addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1314 }
1315 return idst;
1316 }
1317
1318 UInt laneBits = 0;
1319 op = Asse_INVALID;
1320 switch (e->Iex.Binop.op) {
1321 case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1322 case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1323 case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1324 case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1325 case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1326 case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1327 default: break;
1328 }
1329 if (op != Asse_INVALID) {
1330 const IRExpr* arg1 = e->Iex.Binop.arg1;
1331 const IRExpr* arg2 = e->Iex.Binop.arg2;
1332 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1333 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1334 HReg igreg = iselIntExpr_R(env, arg1);
1335 HReg vgreg = newVRegV(env);
1336 HReg idst = newVRegI(env);
1337 addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1338 /* If it's a shift by an in-range immediate, generate a single
1339 instruction. */
1340 if (arg2->tag == Iex_Const) {
1341 IRConst* c = arg2->Iex.Const.con;
1342 vassert(c->tag == Ico_U8);
1343 UInt shift = c->Ico.U8;
1344 if (shift < laneBits) {
1345 addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1346 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1347 return idst;
1348 }
1349 }
1350 /* Otherwise we have to do it the longwinded way. */
1351 HReg ishift = iselIntExpr_R(env, arg2);
1352 HReg vshift = newVRegV(env);
1353 addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1354 addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1355 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1356 return idst;
1357 }
1358
1359 if (e->Iex.Binop.op == Iop_Mul32x2) {
1360 const IRExpr* arg1 = e->Iex.Binop.arg1;
1361 const IRExpr* arg2 = e->Iex.Binop.arg2;
1362 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1363 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1364 HReg s1 = iselIntExpr_R(env, arg1);
1365 HReg s2 = iselIntExpr_R(env, arg2);
1366 HReg resLo = newVRegI(env);
1367 // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1368 addInstr(env, mk_iMOVsd_RR(s1, resLo));
1369 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1370 addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1371
1372 // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1373 HReg resHi = newVRegI(env);
1374 addInstr(env, mk_iMOVsd_RR(s1, resHi));
1375 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1376 HReg tmp = newVRegI(env);
1377 addInstr(env, mk_iMOVsd_RR(s2, tmp));
1378 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1379 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1380 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1381
1382 // final result = resHi | resLo
1383 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1384 return resLo;
1385 }
1386
1387 // A few remaining SIMD64 ops require helper functions, at least for
1388 // now.
1389 Bool second_is_UInt = False;
1390 HWord fn = 0;
1391 switch (e->Iex.Binop.op) {
1392 case Iop_CatOddLanes16x4:
1393 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1394 case Iop_CatEvenLanes16x4:
1395 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1396 case Iop_PermOrZero8x8:
1397 fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1398
1399 case Iop_QNarrowBin32Sto16Sx4:
1400 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1401 case Iop_QNarrowBin16Sto8Sx8:
1402 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1403 case Iop_QNarrowBin16Sto8Ux8:
1404 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1405
1406 case Iop_NarrowBin16to8x8:
1407 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1408 case Iop_NarrowBin32to16x4:
1409 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1410
1411 case Iop_SarN8x8:
1412 fn = (HWord)h_generic_calc_SarN8x8;
1413 second_is_UInt = True;
1414 break;
1415
1416 default:
1417 fn = (HWord)0; break;
1418 }
1419 if (fn != (HWord)0) {
1420 /* Note: the following assumes all helpers are of signature
1421 ULong fn ( ULong, ULong ), and they are
1422 not marked as regparm functions.
1423 */
1424 HReg dst = newVRegI(env);
1425 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1426 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1427 if (second_is_UInt)
1428 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1429 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1430 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1431 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1432 mk_RetLoc_simple(RLPri_Int) ));
1433 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1434 return dst;
1435 }
1436
1437 break;
1438 }
1439
1440 /* --------- UNARY OP --------- */
1441 case Iex_Unop: {
1442
1443 /* 1Uto8(64to1(expr64)) */
1444 {
1445 DEFINE_PATTERN( p_1Uto8_64to1,
1446 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1447 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1448 const IRExpr* expr64 = mi.bindee[0];
1449 HReg dst = newVRegI(env);
1450 HReg src = iselIntExpr_R(env, expr64);
1451 addInstr(env, mk_iMOVsd_RR(src,dst) );
1452 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1453 AMD64RMI_Imm(1), dst));
1454 return dst;
1455 }
1456 }
1457
1458 /* 8Uto64(LDle(expr64)) */
1459 {
1460 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1461 unop(Iop_8Uto64,
1462 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1463 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1464 HReg dst = newVRegI(env);
1465 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1466 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1467 return dst;
1468 }
1469 }
1470
1471 /* 16Uto64(LDle(expr64)) */
1472 {
1473 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1474 unop(Iop_16Uto64,
1475 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1476 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1477 HReg dst = newVRegI(env);
1478 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1479 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1480 return dst;
1481 }
1482 }
1483
1484 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1485 Use 32 bit arithmetic and let the default zero-extend rule
1486 do the 32Uto64 for free. */
1487 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1488 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1489 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1490 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1491 AMD64AluOp aluOp = Aalu_INVALID;
1492 switch (opi) {
1493 case Iop_Add32: aluOp = Aalu_ADD; break;
1494 case Iop_Sub32: aluOp = Aalu_SUB; break;
1495 case Iop_And32: aluOp = Aalu_AND; break;
1496 case Iop_Or32: aluOp = Aalu_OR; break;
1497 case Iop_Xor32: aluOp = Aalu_XOR; break;
1498 default: break;
1499 }
1500 if (aluOp != Aalu_INVALID) {
1501 /* For commutative ops we assume any literal values are on
1502 the second operand. */
1503 HReg dst = newVRegI(env);
1504 HReg reg = iselIntExpr_R(env, argL);
1505 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1506 addInstr(env, mk_iMOVsd_RR(reg,dst));
1507 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1508 return dst;
1509 }
1510 /* just fall through to normal handling for Iop_32Uto64 */
1511 }
1512
1513 /* Fallback cases */
1514 switch (e->Iex.Unop.op) {
1515 case Iop_32Uto64:
1516 case Iop_32Sto64: {
1517 HReg dst = newVRegI(env);
1518 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1519 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1520 src, dst) );
1521 return dst;
1522 }
1523 case Iop_128HIto64: {
1524 HReg rHi, rLo;
1525 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1526 return rHi; /* and abandon rLo */
1527 }
1528 case Iop_128to64: {
1529 HReg rHi, rLo;
1530 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1531 return rLo; /* and abandon rHi */
1532 }
1533 case Iop_8Uto16:
1534 case Iop_8Uto32:
1535 case Iop_8Uto64:
1536 case Iop_16Uto64:
1537 case Iop_16Uto32: {
1538 HReg dst = newVRegI(env);
1539 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1540 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1541 || e->Iex.Unop.op==Iop_16Uto64 );
1542 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1543 addInstr(env, mk_iMOVsd_RR(src,dst) );
1544 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1545 AMD64RMI_Imm(mask), dst));
1546 return dst;
1547 }
1548 case Iop_8Sto16:
1549 case Iop_8Sto64:
1550 case Iop_8Sto32:
1551 case Iop_16Sto32:
1552 case Iop_16Sto64: {
1553 HReg dst = newVRegI(env);
1554 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1555 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1556 || e->Iex.Unop.op==Iop_16Sto64 );
1557 UInt amt = srcIs16 ? 48 : 56;
1558 addInstr(env, mk_iMOVsd_RR(src,dst) );
1559 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1560 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1561 return dst;
1562 }
1563 case Iop_Not8:
1564 case Iop_Not16:
1565 case Iop_Not32:
1566 case Iop_Not64: {
1567 HReg dst = newVRegI(env);
1568 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1569 addInstr(env, mk_iMOVsd_RR(src,dst) );
1570 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1571 return dst;
1572 }
1573 case Iop_16HIto8:
1574 case Iop_32HIto16:
1575 case Iop_64HIto32: {
1576 HReg dst = newVRegI(env);
1577 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1578 Int shift = 0;
1579 switch (e->Iex.Unop.op) {
1580 case Iop_16HIto8: shift = 8; break;
1581 case Iop_32HIto16: shift = 16; break;
1582 case Iop_64HIto32: shift = 32; break;
1583 default: vassert(0);
1584 }
1585 addInstr(env, mk_iMOVsd_RR(src,dst) );
1586 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1587 return dst;
1588 }
1589 case Iop_1Uto64:
1590 case Iop_1Uto32:
1591 case Iop_1Uto8: {
1592 HReg dst = newVRegI(env);
1593 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1594 addInstr(env, AMD64Instr_Set64(cond,dst));
1595 return dst;
1596 }
1597 case Iop_1Sto8:
1598 case Iop_1Sto16:
1599 case Iop_1Sto32:
1600 case Iop_1Sto64: {
1601 /* could do better than this, but for now ... */
1602 HReg dst = newVRegI(env);
1603 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1604 addInstr(env, AMD64Instr_Set64(cond,dst));
1605 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1606 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1607 return dst;
1608 }
1609 case Iop_Ctz64: {
1610 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1611 HReg dst = newVRegI(env);
1612 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1613 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1614 return dst;
1615 }
1616 case Iop_Clz64: {
1617 /* Count leading zeroes. Do 'bsrq' to establish the index
1618 of the highest set bit, and subtract that value from
1619 63. */
1620 HReg tmp = newVRegI(env);
1621 HReg dst = newVRegI(env);
1622 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1623 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1624 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1625 AMD64RMI_Imm(63), dst));
1626 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1627 AMD64RMI_Reg(tmp), dst));
1628 return dst;
1629 }
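         /* E.g. src = 1: bsrq yields index 0, and 63 - 0 = 63, the
            correct count of leading zeroes. */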
1630
1631 case Iop_CmpwNEZ64: {
1632 HReg dst = newVRegI(env);
1633 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1634 addInstr(env, mk_iMOVsd_RR(src,dst));
1635 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1636 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1637 AMD64RMI_Reg(src), dst));
1638 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1639 return dst;
1640 }
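         /* This computes (src | -src) >>s 63: for any nonzero src,
            src | -src has the sign bit set, so the arithmetic shift
            yields all-ones; for src == 0 it yields zero. */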
1641
1642 case Iop_CmpwNEZ32: {
1643 HReg src = newVRegI(env);
1644 HReg dst = newVRegI(env);
1645 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1646 addInstr(env, mk_iMOVsd_RR(pre,src));
1647 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1648 addInstr(env, mk_iMOVsd_RR(src,dst));
1649 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1650 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1651 AMD64RMI_Reg(src), dst));
1652 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1653 return dst;
1654 }
1655
1656 case Iop_Left8:
1657 case Iop_Left16:
1658 case Iop_Left32:
1659 case Iop_Left64: {
1660 HReg dst = newVRegI(env);
1661 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1662 addInstr(env, mk_iMOVsd_RR(src, dst));
1663 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1664 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1665 return dst;
1666 }
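         /* The NEG/OR pair computes src | -src, which sets every bit
            at and above the lowest set bit of src -- the defined
            meaning of the Iop_Left* ops. */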
1667
1668 case Iop_V128to32: {
1669 HReg dst = newVRegI(env);
1670 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1671 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1672 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1673 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1674 return dst;
1675 }
1676
1677 /* V128{HI}to64 */
1678 case Iop_V128to64: {
1679 HReg dst = newVRegI(env);
1680 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1681 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1682 return dst;
1683 }
1684 case Iop_V128HIto64: {
1685 HReg dst = newVRegI(env);
1686 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1687 HReg vec2 = newVRegV(env);
1688 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1689 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1690 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1691 return dst;
1692 }
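         /* The 128-bit logical right shift by 64 moves the upper
            qword of the copy into the lower qword, from where
            SseMOVQ extracts it. */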
1693
1694 /* V256to64_{3,2,1,0} */
1695 case Iop_V256to64_0: case Iop_V256to64_1:
1696 case Iop_V256to64_2: case Iop_V256to64_3: {
1697 HReg vHi, vLo, vec;
1698 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1699             /* Do the first part of the selection by deciding which of
1700                the 128-bit registers to look at, and the second part
1701                using the same scheme as for V128{HI}to64 above. */
1702 Bool low64of128 = True;
1703 switch (e->Iex.Unop.op) {
1704 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1705 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1706 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1707 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1708 default: vassert(0);
1709 }
1710 HReg dst = newVRegI(env);
1711 if (low64of128) {
1712 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1713 } else {
1714 HReg vec2 = newVRegV(env);
1715 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1716 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1717 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1718 }
1719 return dst;
1720 }
1721
1722 /* ReinterpF64asI64(e) */
1723 /* Given an IEEE754 double, produce an I64 with the same bit
1724 pattern. */
1725 case Iop_ReinterpF64asI64: {
1726 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1727 HReg dst = newVRegI(env);
1728 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1729 /* paranoia */
1730 set_SSE_rounding_default(env);
1731 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1732 addInstr(env, AMD64Instr_Alu64R(
1733 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1734 return dst;
1735 }
1736
1737 /* ReinterpF32asI32(e) */
1738 /* Given an IEEE754 single, produce an I64 with the same bit
1739 pattern in the lower half. */
1740 case Iop_ReinterpF32asI32: {
1741 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1742 HReg dst = newVRegI(env);
1743 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1744 /* paranoia */
1745 set_SSE_rounding_default(env);
1746 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1747 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1748 return dst;
1749 }
1750
1751 case Iop_16to8:
1752 case Iop_32to8:
1753 case Iop_64to8:
1754 case Iop_32to16:
1755 case Iop_64to16:
1756 case Iop_64to32:
1757 /* These are no-ops. */
1758 return iselIntExpr_R(env, e->Iex.Unop.arg);
1759
1760 case Iop_GetMSBs8x8: {
1761 /* Note: the following assumes the helper is of
1762 signature
1763 UInt fn ( ULong ), and is not a regparm fn.
1764 */
1765 HReg dst = newVRegI(env);
1766 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1767 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1768 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1769 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1770 1, mk_RetLoc_simple(RLPri_Int) ));
1771 /* MovxLQ is not exactly the right thing here. We just
1772 need to get the bottom 8 bits of RAX into dst, and zero
1773 out everything else. Assuming that the helper returns
1774 a UInt with the top 24 bits zeroed out, it'll do,
1775 though. */
1776 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1777 return dst;
1778 }
1779
1780 case Iop_GetMSBs8x16: {
1781 /* Note: the following assumes the helper is of signature
1782 UInt fn ( ULong w64hi, ULong w64Lo ),
1783 and is not a regparm fn. */
1784 HReg dst = newVRegI(env);
1785 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1786 HReg rsp = hregAMD64_RSP();
1787 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1788 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1789 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1790 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1791 16, vec, m16_rsp));
1792 /* hi 64 bits into RDI -- the first arg */
1793 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1794 AMD64RMI_Mem(m8_rsp),
1795 hregAMD64_RDI() )); /* 1st arg */
1796 /* lo 64 bits into RSI -- the 2nd arg */
1797 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1798 AMD64RMI_Mem(m16_rsp),
1799 hregAMD64_RSI() )); /* 2nd arg */
1800 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1801 2, mk_RetLoc_simple(RLPri_Int) ));
1802 /* MovxLQ is not exactly the right thing here. We just
1803 need to get the bottom 16 bits of RAX into dst, and zero
1804 out everything else. Assuming that the helper returns
1805 a UInt with the top 16 bits zeroed out, it'll do,
1806 though. */
1807 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1808 return dst;
1809 }
1810
1811 default:
1812 break;
1813 }
1814
1815 /* Deal with unary 64-bit SIMD ops. */
1816 HWord fn = 0;
1817 switch (e->Iex.Unop.op) {
1818 case Iop_CmpNEZ32x2:
1819 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1820 case Iop_CmpNEZ16x4:
1821 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1822 case Iop_CmpNEZ8x8:
1823 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1824 default:
1825 fn = (HWord)0; break;
1826 }
1827 if (fn != (HWord)0) {
1828 /* Note: the following assumes all helpers are of
1829 signature
1830 ULong fn ( ULong ), and they are
1831 not marked as regparm functions.
1832 */
1833 HReg dst = newVRegI(env);
1834 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1835 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1836 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1837 mk_RetLoc_simple(RLPri_Int) ));
1838 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1839 return dst;
1840 }
1841
1842 break;
1843 }
1844
1845 /* --------- GET --------- */
1846 case Iex_Get: {
1847 if (ty == Ity_I64) {
1848 HReg dst = newVRegI(env);
1849 addInstr(env, AMD64Instr_Alu64R(
1850 Aalu_MOV,
1851 AMD64RMI_Mem(
1852 AMD64AMode_IR(e->Iex.Get.offset,
1853 hregAMD64_RBP())),
1854 dst));
1855 return dst;
1856 }
1857 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1858 HReg dst = newVRegI(env);
1859 addInstr(env, AMD64Instr_LoadEX(
1860 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1861 False,
1862 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1863 dst));
1864 return dst;
1865 }
1866 break;
1867 }
1868
1869 case Iex_GetI: {
1870 AMD64AMode* am
1871 = genGuestArrayOffset(
1872 env, e->Iex.GetI.descr,
1873 e->Iex.GetI.ix, e->Iex.GetI.bias );
1874 HReg dst = newVRegI(env);
1875 if (ty == Ity_I8) {
1876 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1877 return dst;
1878 }
1879 if (ty == Ity_I64) {
1880 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1881 return dst;
1882 }
1883 break;
1884 }
1885
1886 /* --------- CCALL --------- */
1887 case Iex_CCall: {
1888 HReg dst = newVRegI(env);
1889 vassert(ty == e->Iex.CCall.retty);
1890
1891 /* be very restrictive for now. Only 64-bit ints allowed for
1892 args, and 64 or 32 bits for return type. */
1893 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1894 goto irreducible;
1895
1896 /* Marshal args, do the call. */
1897 UInt addToSp = 0;
1898 RetLoc rloc = mk_RetLoc_INVALID();
1899 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1900 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1901 vassert(is_sane_RetLoc(rloc));
1902 vassert(rloc.pri == RLPri_Int);
1903 vassert(addToSp == 0);
1904
1905 /* Move to dst, and zero out the top 32 bits if the result type is
1906 Ity_I32. Probably overkill, but still .. */
1907 if (e->Iex.CCall.retty == Ity_I64)
1908 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1909 else
1910 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1911
1912 return dst;
1913 }
1914
1915 /* --------- LITERAL --------- */
1916 /* 64/32/16/8-bit literals */
1917 case Iex_Const:
1918 if (ty == Ity_I64) {
1919 HReg r = newVRegI(env);
1920 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1921 return r;
1922 } else {
1923 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1924 HReg r = newVRegI(env);
1925 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1926 return r;
1927 }
1928
1929 /* --------- MULTIPLEX --------- */
1930 case Iex_ITE: { // VFD
1931 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1932 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1933 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1934 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1935 HReg dst = newVRegI(env);
1936 addInstr(env, mk_iMOVsd_RR(r1,dst));
1937 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1938 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1939 return dst;
1940 }
1941 break;
1942 }
1943
1944 /* --------- TERNARY OP --------- */
1945 case Iex_Triop: {
1946 IRTriop *triop = e->Iex.Triop.details;
1947 /* C3210 flags following FPU partial remainder (fprem), both
1948 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1949 if (triop->op == Iop_PRemC3210F64
1950 || triop->op == Iop_PRem1C3210F64) {
1951 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1952 HReg arg1 = iselDblExpr(env, triop->arg2);
1953 HReg arg2 = iselDblExpr(env, triop->arg3);
1954 HReg dst = newVRegI(env);
1955 addInstr(env, AMD64Instr_A87Free(2));
1956
1957 /* one arg -> top of x87 stack */
1958 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1959 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1960
1961 /* other arg -> top of x87 stack */
1962 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1963 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1964
1965 switch (triop->op) {
1966 case Iop_PRemC3210F64:
1967 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1968 break;
1969 case Iop_PRem1C3210F64:
1970 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1971 break;
1972 default:
1973 vassert(0);
1974 }
1975 /* Ignore the result, and instead make off with the FPU's
1976 C3210 flags (in the status word). */
1977 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1978 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1979 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1980 return dst;
1981 }
1982 break;
1983 }
1984
1985 default:
1986 break;
1987 } /* switch (e->tag) */
1988
1989 /* We get here if no pattern matched. */
1990 irreducible:
1991 ppIRExpr(e);
1992 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1993 }
1994
1995
1996 /*---------------------------------------------------------*/
1997 /*--- ISEL: Integer expression auxiliaries ---*/
1998 /*---------------------------------------------------------*/
1999
2000 /* --------------------- AMODEs --------------------- */
2001
2002 /* Return an AMode which computes the value of the specified
2003 expression, possibly also adding insns to the code list as a
2004    result.  The expression may only be a 64-bit one.
2005 */
2006
2007 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2008 {
2009 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2010 vassert(sane_AMode(am));
2011 return am;
2012 }
2013
2014 /* DO NOT CALL THIS DIRECTLY ! */
2015 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2016 {
2017 MatchInfo mi;
2018 DECLARE_PATTERN(p_complex);
2019 IRType ty = typeOfIRExpr(env->type_env,e);
2020 vassert(ty == Ity_I64);
2021
2022 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2023 /* bind0 bind1 bind2 bind3 */
2024 DEFINE_PATTERN(p_complex,
2025 binop( Iop_Add64,
2026 binop( Iop_Add64,
2027 bind(0),
2028 binop(Iop_Shl64, bind(1), bind(2))
2029 ),
2030 bind(3)
2031 )
2032 );
2033 if (matchIRExpr(&mi, p_complex, e)) {
2034 const IRExpr* expr1 = mi.bindee[0];
2035 const IRExpr* expr2 = mi.bindee[1];
2036 const IRExpr* imm8 = mi.bindee[2];
2037 const IRExpr* simm32 = mi.bindee[3];
2038 if (imm8->tag == Iex_Const
2039 && imm8->Iex.Const.con->tag == Ico_U8
2040 && imm8->Iex.Const.con->Ico.U8 < 4
2041 /* imm8 is OK, now check simm32 */
2042 && simm32->tag == Iex_Const
2043 && simm32->Iex.Const.con->tag == Ico_U64
2044 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2045 UInt shift = imm8->Iex.Const.con->Ico.U8;
2046 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2047 HReg r1 = iselIntExpr_R(env, expr1);
2048 HReg r2 = iselIntExpr_R(env, expr2);
2049 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2050 return AMD64AMode_IRRS(offset, r1, r2, shift);
2051 }
2052 }
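   /* For example, Add64(Add64(rb, Shl64(ri, 3)), 16) matches here
      and turns into the single addressing mode 16(rb,ri,8). */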
2053
2054 /* Add64(expr1, Shl64(expr2, imm)) */
2055 if (e->tag == Iex_Binop
2056 && e->Iex.Binop.op == Iop_Add64
2057 && e->Iex.Binop.arg2->tag == Iex_Binop
2058 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2059 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2060 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2061 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2062 if (shift == 1 || shift == 2 || shift == 3) {
2063 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2064 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2065 return AMD64AMode_IRRS(0, r1, r2, shift);
2066 }
2067 }
2068
2069 /* Add64(expr,i) */
2070 if (e->tag == Iex_Binop
2071 && e->Iex.Binop.op == Iop_Add64
2072 && e->Iex.Binop.arg2->tag == Iex_Const
2073 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2074 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2075 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2076 return AMD64AMode_IR(
2077 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2078 r1
2079 );
2080 }
2081
2082 /* Doesn't match anything in particular. Generate it into
2083 a register and use that. */
2084 {
2085 HReg r1 = iselIntExpr_R(env, e);
2086 return AMD64AMode_IR(0, r1);
2087 }
2088 }
2089
2090
2091 /* --------------------- RMIs --------------------- */
2092
2093 /* Similarly, calculate an expression into an AMD64RMI operand.  As
2094    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2095
2096 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2097 {
2098 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2099 /* sanity checks ... */
2100 switch (rmi->tag) {
2101 case Armi_Imm:
2102 return rmi;
2103 case Armi_Reg:
2104 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2105 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2106 return rmi;
2107 case Armi_Mem:
2108 vassert(sane_AMode(rmi->Armi.Mem.am));
2109 return rmi;
2110 default:
2111 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2112 }
2113 }
2114
2115 /* DO NOT CALL THIS DIRECTLY ! */
2116 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2117 {
2118 IRType ty = typeOfIRExpr(env->type_env,e);
2119 vassert(ty == Ity_I64 || ty == Ity_I32
2120 || ty == Ity_I16 || ty == Ity_I8);
2121
2122 /* special case: immediate 64/32/16/8 */
2123 if (e->tag == Iex_Const) {
2124 switch (e->Iex.Const.con->tag) {
2125 case Ico_U64:
2126 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2127 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2128 }
2129 break;
2130          case Ico_U32:
2131             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
2132          case Ico_U16:
2133             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2134          case Ico_U8:
2135             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2136 default:
2137 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2138 }
2139 }
2140
2141 /* special case: 64-bit GET */
2142 if (e->tag == Iex_Get && ty == Ity_I64) {
2143 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2144 hregAMD64_RBP()));
2145 }
2146
2147 /* special case: 64-bit load from memory */
2148 if (e->tag == Iex_Load && ty == Ity_I64
2149 && e->Iex.Load.end == Iend_LE) {
2150 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2151 return AMD64RMI_Mem(am);
2152 }
2153
2154 /* default case: calculate into a register and return that */
2155 {
2156 HReg r = iselIntExpr_R ( env, e );
2157 return AMD64RMI_Reg(r);
2158 }
2159 }
2160
2161
2162 /* --------------------- RIs --------------------- */
2163
2164 /* Calculate an expression into an AMD64RI operand. As with
2165 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2166 bits. */
2167
2168 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2169 {
2170 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2171 /* sanity checks ... */
2172 switch (ri->tag) {
2173 case Ari_Imm:
2174 return ri;
2175 case Ari_Reg:
2176 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2177 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2178 return ri;
2179 default:
2180 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2181 }
2182 }
2183
2184 /* DO NOT CALL THIS DIRECTLY ! */
2185 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2186 {
2187 IRType ty = typeOfIRExpr(env->type_env,e);
2188 vassert(ty == Ity_I64 || ty == Ity_I32
2189 || ty == Ity_I16 || ty == Ity_I8);
2190
2191 /* special case: immediate */
2192 if (e->tag == Iex_Const) {
2193 switch (e->Iex.Const.con->tag) {
2194 case Ico_U64:
2195 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2196 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2197 }
2198 break;
2199 case Ico_U32:
2200 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2201 case Ico_U16:
2202 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2203 case Ico_U8:
2204 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2205 default:
2206          vpanic("iselIntExpr_RI.Iex_Const(amd64)");
2207 }
2208 }
2209
2210 /* default case: calculate into a register and return that */
2211 {
2212 HReg r = iselIntExpr_R ( env, e );
2213 return AMD64RI_Reg(r);
2214 }
2215 }
2216
2217
2218 /* --------------------- RMs --------------------- */
2219
2220 /* Similarly, calculate an expression into an AMD64RM operand. As
2221 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2222 bits. */
2223
2224 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2225 {
2226 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2227 /* sanity checks ... */
2228 switch (rm->tag) {
2229 case Arm_Reg:
2230 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2231 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2232 return rm;
2233 case Arm_Mem:
2234 vassert(sane_AMode(rm->Arm.Mem.am));
2235 return rm;
2236 default:
2237 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2238 }
2239 }
2240
2241 /* DO NOT CALL THIS DIRECTLY ! */
2242 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2243 {
2244 IRType ty = typeOfIRExpr(env->type_env,e);
2245 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2246
2247 /* special case: 64-bit GET */
2248 if (e->tag == Iex_Get && ty == Ity_I64) {
2249 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2250 hregAMD64_RBP()));
2251 }
2252
2253    /* special case: load from memory -- not handled here; such loads
2254       fall through to the default case below */
2254
2255 /* default case: calculate into a register and return that */
2256 {
2257 HReg r = iselIntExpr_R ( env, e );
2258 return AMD64RM_Reg(r);
2259 }
2260 }
2261
2262
2263 /* --------------------- CONDCODE --------------------- */
2264
2265 /* Generate code to evaluate a bit-typed expression, returning the
2266    condition code which corresponds to the expression notionally
2267    having returned 1. */
2268
2269 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2270 {
2271 /* Uh, there's nothing we can sanity check here, unfortunately. */
2272 return iselCondCode_wrk(env,e);
2273 }
2274
2275 /* DO NOT CALL THIS DIRECTLY ! */
2276 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2277 {
2278 vassert(e);
2279 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2280
2281 /* var */
2282 if (e->tag == Iex_RdTmp) {
2283 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2284 HReg dst = newVRegI(env);
2285 addInstr(env, mk_iMOVsd_RR(r64,dst));
2286 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2287 return Acc_NZ;
2288 }
2289
2290 /* Constant 1:Bit */
2291 if (e->tag == Iex_Const) {
2292 HReg r;
2293 vassert(e->Iex.Const.con->tag == Ico_U1);
2294 vassert(e->Iex.Const.con->Ico.U1 == True
2295 || e->Iex.Const.con->Ico.U1 == False);
2296 r = newVRegI(env);
2297 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2298 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2299 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2300 }
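   /* Note: the XOR of r with itself sets the Z flag, so returning
      Acc_Z here gives an always-true condition and Acc_NZ an
      always-false one. */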
2301
2302 /* Not1(...) */
2303 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2304 /* Generate code for the arg, and negate the test condition */
2305 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2306 }
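   /* This works because amd64 condition codes come in complementary
      pairs differing only in the bottom bit, e.g.
      Acc_Z ^ 1 == Acc_NZ. */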
2307
2308 /* --- patterns rooted at: 64to1 --- */
2309
2310 /* 64to1 */
2311 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2312 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2313 addInstr(env, AMD64Instr_Test64(1,reg));
2314 return Acc_NZ;
2315 }
2316
2317 /* --- patterns rooted at: 32to1 --- */
2318
2319 /* 32to1 */
2320 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2321 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2322 addInstr(env, AMD64Instr_Test64(1,reg));
2323 return Acc_NZ;
2324 }
2325
2326 /* --- patterns rooted at: CmpNEZ8 --- */
2327
2328 /* CmpNEZ8(x) */
2329 if (e->tag == Iex_Unop
2330 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2331 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2332 addInstr(env, AMD64Instr_Test64(0xFF,r));
2333 return Acc_NZ;
2334 }
2335
2336 /* --- patterns rooted at: CmpNEZ16 --- */
2337
2338 /* CmpNEZ16(x) */
2339 if (e->tag == Iex_Unop
2340 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2341 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2342 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2343 return Acc_NZ;
2344 }
2345
2346 /* --- patterns rooted at: CmpNEZ32 --- */
2347
2348 if (e->tag == Iex_Unop
2349 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2350 IRExpr* arg = e->Iex.Unop.arg;
2351 if (arg->tag == Iex_Binop
2352 && (arg->Iex.Binop.op == Iop_Or32
2353 || arg->Iex.Binop.op == Iop_And32)) {
2354 /* CmpNEZ32(Or32(x,y)) */
2355 /* CmpNEZ32(And32(x,y)) */
2356 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2357 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2358 HReg tmp = newVRegI(env);
2359 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2360 addInstr(env, AMD64Instr_Alu32R(
2361 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2362 rmi1, tmp));
2363 return Acc_NZ;
2364 }
2365 /* CmpNEZ32(x) */
2366 HReg r1 = iselIntExpr_R(env, arg);
2367 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2368 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2369 return Acc_NZ;
2370 }
2371
2372 /* --- patterns rooted at: CmpNEZ64 --- */
2373
2374 if (e->tag == Iex_Unop
2375 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2376 IRExpr* arg = e->Iex.Unop.arg;
2377 if (arg->tag == Iex_Binop
2378 && (arg->Iex.Binop.op == Iop_Or64
2379 || arg->Iex.Binop.op == Iop_And64)) {
2380 /* CmpNEZ64(Or64(x,y)) */
2381 /* CmpNEZ64(And64(x,y)) */
2382 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2383 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2384 HReg tmp = newVRegI(env);
2385 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2386 addInstr(env, AMD64Instr_Alu64R(
2387 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2388 rmi1, tmp));
2389 return Acc_NZ;
2390 }
2391 /* CmpNEZ64(x) */
2392 HReg r1 = iselIntExpr_R(env, arg);
2393 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2394 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2395 return Acc_NZ;
2396 }
2397
2398 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2399
2400 /* CmpEQ8 / CmpNE8 */
2401 if (e->tag == Iex_Binop
2402 && (e->Iex.Binop.op == Iop_CmpEQ8
2403 || e->Iex.Binop.op == Iop_CmpNE8
2404 || e->Iex.Binop.op == Iop_CasCmpEQ8
2405 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2406 if (isZeroU8(e->Iex.Binop.arg2)) {
2407 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2408 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2409 switch (e->Iex.Binop.op) {
2410 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2411 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2412 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2413 }
2414 } else {
2415 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2416 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2417 HReg r = newVRegI(env);
2418 addInstr(env, mk_iMOVsd_RR(r1,r));
2419 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2420 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2421 switch (e->Iex.Binop.op) {
2422 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2423 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2424 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2425 }
2426 }
2427 }
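   /* In the general case above, the XOR leaves zero in the bottom 8
      bits iff the operands agree there, and the AND with 0xFF sets
      the flags from just those bits. */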
2428
2429 /* CmpEQ16 / CmpNE16 */
2430 if (e->tag == Iex_Binop
2431 && (e->Iex.Binop.op == Iop_CmpEQ16
2432 || e->Iex.Binop.op == Iop_CmpNE16
2433 || e->Iex.Binop.op == Iop_CasCmpEQ16
2434 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2435 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2436 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2437 HReg r = newVRegI(env);
2438 addInstr(env, mk_iMOVsd_RR(r1,r));
2439 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2440 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2441 switch (e->Iex.Binop.op) {
2442 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2443 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2444 default: vpanic("iselCondCode(amd64): CmpXX16");
2445 }
2446 }
2447
2448 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2449 Saves a "movq %rax, %tmp" compared to the default route. */
2450 if (e->tag == Iex_Binop
2451 && e->Iex.Binop.op == Iop_CmpNE64
2452 && e->Iex.Binop.arg1->tag == Iex_CCall
2453 && e->Iex.Binop.arg2->tag == Iex_Const) {
2454 IRExpr* cal = e->Iex.Binop.arg1;
2455 IRExpr* con = e->Iex.Binop.arg2;
2456 HReg tmp = newVRegI(env);
2457 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2458 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2459 vassert(con->Iex.Const.con->tag == Ico_U64);
2460 /* Marshal args, do the call. */
2461 UInt addToSp = 0;
2462 RetLoc rloc = mk_RetLoc_INVALID();
2463 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2464 cal->Iex.CCall.cee,
2465 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2466 vassert(is_sane_RetLoc(rloc));
2467 vassert(rloc.pri == RLPri_Int);
2468 vassert(addToSp == 0);
2469 /* */
2470 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2471 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2472 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2473 return Acc_NZ;
2474 }
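   /* Here the constant goes into tmp and is compared directly against
      %rax, the call's return register, avoiding the extra
      "movq %rax, %tmp" that the generic Cmp*64* path below would
      emit. */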
2475
2476 /* Cmp*64*(x,y) */
2477 if (e->tag == Iex_Binop
2478 && (e->Iex.Binop.op == Iop_CmpEQ64
2479 || e->Iex.Binop.op == Iop_CmpNE64
2480 || e->Iex.Binop.op == Iop_CmpLT64S
2481 || e->Iex.Binop.op == Iop_CmpLT64U
2482 || e->Iex.Binop.op == Iop_CmpLE64S
2483 || e->Iex.Binop.op == Iop_CmpLE64U
2484 || e->Iex.Binop.op == Iop_CasCmpEQ64
2485 || e->Iex.Binop.op == Iop_CasCmpNE64
2486 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2487 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2488 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2489 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2490 switch (e->Iex.Binop.op) {
2491 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2492 case Iop_CmpNE64:
2493 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2494 case Iop_CmpLT64S: return Acc_L;
2495 case Iop_CmpLT64U: return Acc_B;
2496 case Iop_CmpLE64S: return Acc_LE;
2497 case Iop_CmpLE64U: return Acc_BE;
2498 default: vpanic("iselCondCode(amd64): CmpXX64");
2499 }
2500 }
2501
2502 /* Cmp*32*(x,y) */
2503 if (e->tag == Iex_Binop
2504 && (e->Iex.Binop.op == Iop_CmpEQ32
2505 || e->Iex.Binop.op == Iop_CmpNE32
2506 || e->Iex.Binop.op == Iop_CmpLT32S
2507 || e->Iex.Binop.op == Iop_CmpLT32U
2508 || e->Iex.Binop.op == Iop_CmpLE32S
2509 || e->Iex.Binop.op == Iop_CmpLE32U
2510 || e->Iex.Binop.op == Iop_CasCmpEQ32
2511 || e->Iex.Binop.op == Iop_CasCmpNE32
2512 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2513 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2514 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2515 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2516 switch (e->Iex.Binop.op) {
2517 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2518 case Iop_CmpNE32:
2519 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2520 case Iop_CmpLT32S: return Acc_L;
2521 case Iop_CmpLT32U: return Acc_B;
2522 case Iop_CmpLE32S: return Acc_LE;
2523 case Iop_CmpLE32U: return Acc_BE;
2524 default: vpanic("iselCondCode(amd64): CmpXX32");
2525 }
2526 }
2527
2528 ppIRExpr(e);
2529 vpanic("iselCondCode(amd64)");
2530 }
2531
2532
2533 /*---------------------------------------------------------*/
2534 /*--- ISEL: Integer expressions (128 bit) ---*/
2535 /*---------------------------------------------------------*/
2536
2537 /* Compute a 128-bit value into a register pair, which is returned as
2538    the first two parameters.  As with iselIntExpr_R, the returned
2539    regs will be virtual; in any case they must not be changed
2540    by subsequent code emitted by the caller. */
2541
2542 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2543 ISelEnv* env, const IRExpr* e )
2544 {
2545 iselInt128Expr_wrk(rHi, rLo, env, e);
2546 # if 0
2547 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2548 # endif
2549 vassert(hregClass(*rHi) == HRcInt64);
2550 vassert(hregIsVirtual(*rHi));
2551 vassert(hregClass(*rLo) == HRcInt64);
2552 vassert(hregIsVirtual(*rLo));
2553 }
2554
2555 /* DO NOT CALL THIS DIRECTLY ! */
2556 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2557 ISelEnv* env, const IRExpr* e )
2558 {
2559 vassert(e);
2560 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2561
2562 /* read 128-bit IRTemp */
2563 if (e->tag == Iex_RdTmp) {
2564 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2565 return;
2566 }
2567
2568 /* --------- BINARY ops --------- */
2569 if (e->tag == Iex_Binop) {
2570 switch (e->Iex.Binop.op) {
2571 /* 64 x 64 -> 128 multiply */
2572 case Iop_MullU64:
2573 case Iop_MullS64: {
2574          /* get one operand into %rax, and the other into a R/M.
2575             Need to make an educated guess about which operand is
2576             better placed where. */
2577 HReg tLo = newVRegI(env);
2578 HReg tHi = newVRegI(env);
2579 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2580 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2581 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2582 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2583 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2584 /* Result is now in RDX:RAX. Tell the caller. */
2585 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2586 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2587 *rHi = tHi;
2588 *rLo = tLo;
2589 return;
2590 }
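         /* In other words, arg2 is moved into %rax and mulq/imulq is
            issued with arg1 as the R/M operand, leaving the 128-bit
            product in %rdx:%rax. */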
2591
2592 /* 128 x 64 -> (64(rem),64(div)) division */
2593 case Iop_DivModU128to64:
2594 case Iop_DivModS128to64: {
2595 /* Get the 128-bit operand into rdx:rax, and the other into
2596 any old R/M. */
2597 HReg sHi, sLo;
2598 HReg tLo = newVRegI(env);
2599 HReg tHi = newVRegI(env);
2600 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2601 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2602 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2603 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2604 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2605 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2606 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2607 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2608 *rHi = tHi;
2609 *rLo = tLo;
2610 return;
2611 }
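         /* divq/idivq divides %rdx:%rax by the R/M operand, leaving
            the quotient in %rax and the remainder in %rdx; hence
            tHi = remainder and tLo = quotient, matching the
            (64(rem),64(div)) result contract. */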
2612
2613 /* 64HLto128(e1,e2) */
2614 case Iop_64HLto128:
2615 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2616 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2617 return;
2618
2619 default:
2620 break;
2621 }
2622 } /* if (e->tag == Iex_Binop) */
2623
2624 ppIRExpr(e);
2625 vpanic("iselInt128Expr");
2626 }
2627
2628
2629 /*---------------------------------------------------------*/
2630 /*--- ISEL: Floating point expressions (32 bit) ---*/
2631 /*---------------------------------------------------------*/
2632
2633 /* Nothing interesting here; really just wrappers for
2634 64-bit stuff. */
2635
2636 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2637 {
2638 HReg r = iselFltExpr_wrk( env, e );
2639 # if 0
2640 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2641 # endif
2642 vassert(hregClass(r) == HRcVec128);
2643 vassert(hregIsVirtual(r));
2644 return r;
2645 }
2646
2647 /* DO NOT CALL THIS DIRECTLY */
2648 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2649 {
2650 IRType ty = typeOfIRExpr(env->type_env,e);
2651 vassert(ty == Ity_F32);
2652
2653 if (e->tag == Iex_RdTmp) {
2654 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2655 }
2656
2657 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2658 AMD64AMode* am;
2659 HReg res = newVRegV(env);
2660 vassert(e->Iex.Load.ty == Ity_F32);
2661 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2662 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2663 return res;
2664 }
2665
2666 if (e->tag == Iex_Binop
2667 && e->Iex.Binop.op == Iop_F64toF32) {
2668 /* Although the result is still held in a standard SSE register,
2669 we need to round it to reflect the loss of accuracy/range
2670 entailed in casting it to a 32-bit float. */
2671 HReg dst = newVRegV(env);
2672 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2673 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2674 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2675 set_SSE_rounding_default( env );
2676 return dst;
2677 }
2678
2679 if (e->tag == Iex_Get) {
2680 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2681 hregAMD64_RBP() );
2682 HReg res = newVRegV(env);
2683 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2684 return res;
2685 }
2686
2687 if (e->tag == Iex_Unop
2688 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2689 /* Given an I32, produce an IEEE754 float with the same bit
2690 pattern. */
2691 HReg dst = newVRegV(env);
2692 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2693 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2694 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2695 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2696 return dst;
2697 }
2698
2699 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2700 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2701 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2702 HReg dst = newVRegV(env);
2703
2704       /* arg now holds the value to be rounded.  The first thing to
2705          do is set the FPU's rounding mode accordingly. */
2706
2707 /* Set host x87 rounding mode */
2708 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2709
2710 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2711 addInstr(env, AMD64Instr_A87Free(1));
2712 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2713 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2714 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2715 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2716
2717 /* Restore default x87 rounding. */
2718 set_FPU_rounding_default( env );
2719
2720 return dst;
2721 }
2722
2723 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2724 /* Sigh ... very rough code. Could do much better. */
2725 /* Get the 128-bit literal 00---0 10---0 into a register
2726 and xor it with the value to be negated. */
2727 HReg r1 = newVRegI(env);
2728 HReg dst = newVRegV(env);
2729 HReg tmp = newVRegV(env);
2730 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2731 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2732 addInstr(env, mk_vMOVsd_RR(src,tmp));
2733 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2734 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2735 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2736 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2737 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2738 add_to_rsp(env, 16);
2739 return dst;
2740 }
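   /* The two pushes build a 128-bit constant at 0(%rsp) whose only
      set bit is bit 31 of the low lane; XORing it in flips exactly
      the sign bit of the F32 value. */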
2741
2742 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2743 IRQop *qop = e->Iex.Qop.details;
2744 HReg dst = newVRegV(env);
2745 HReg argX = iselFltExpr(env, qop->arg2);
2746 HReg argY = iselFltExpr(env, qop->arg3);
2747 HReg argZ = iselFltExpr(env, qop->arg4);
2748 /* XXXROUNDINGFIXME */
2749 /* set roundingmode here */
2750 /* subq $16, %rsp -- make a space*/
2751 sub_from_rsp(env, 16);
2752 /* Prepare 4 arg regs:
2753 leaq 0(%rsp), %rdi
2754 leaq 4(%rsp), %rsi
2755 leaq 8(%rsp), %rdx
2756 leaq 12(%rsp), %rcx
2757 */
2758 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2759 hregAMD64_RDI()));
2760 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2761 hregAMD64_RSI()));
2762 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2763 hregAMD64_RDX()));
2764 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2765 hregAMD64_RCX()));
2766 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2767 movss %argX, 0(%rsi)
2768 movss %argY, 0(%rdx)
2769 movss %argZ, 0(%rcx)
2770 */
2771 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2772 AMD64AMode_IR(0, hregAMD64_RSI())));
2773 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2774 AMD64AMode_IR(0, hregAMD64_RDX())));
2775 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2776 AMD64AMode_IR(0, hregAMD64_RCX())));
2777 /* call the helper */
2778 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2779 (ULong)(HWord)h_generic_calc_MAddF32,
2780 4, mk_RetLoc_simple(RLPri_None) ));
2781       /* fetch the result from the scratch space at 0(%rsp), which
2782          was reserved above and is freed again just below. */
2783 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2784 AMD64AMode_IR(0, hregAMD64_RSP())));
2785 /* and finally, clear the space */
2786 add_to_rsp(env, 16);
2787 return dst;
2788 }
2789
2790 ppIRExpr(e);
2791 vpanic("iselFltExpr_wrk");
2792 }
2793
2794
2795 /*---------------------------------------------------------*/
2796 /*--- ISEL: Floating point expressions (64 bit) ---*/
2797 /*---------------------------------------------------------*/
2798
2799 /* Compute a 64-bit floating point value into the lower half of an xmm
2800 register, the identity of which is returned. As with
2801 iselIntExpr_R, the returned reg will be virtual, and it must not be
2802 changed by subsequent code emitted by the caller.
2803 */
2804
2805 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2806
2807 Type S (1 bit) E (11 bits) F (52 bits)
2808 ---- --------- ----------- -----------
2809 signalling NaN u 2047 (max) .0uuuuu---u
2810 (with at least
2811 one 1 bit)
2812 quiet NaN u 2047 (max) .1uuuuu---u
2813
2814 negative infinity 1 2047 (max) .000000---0
2815
2816 positive infinity 0 2047 (max) .000000---0
2817
2818 negative zero 1 0 .000000---0
2819
2820 positive zero 0 0 .000000---0
2821 */
2822
2823 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2824 {
2825 HReg r = iselDblExpr_wrk( env, e );
2826 # if 0
2827 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2828 # endif
2829 vassert(hregClass(r) == HRcVec128);
2830 vassert(hregIsVirtual(r));
2831 return r;
2832 }
2833
2834 /* DO NOT CALL THIS DIRECTLY */
2835 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2836 {
2837 IRType ty = typeOfIRExpr(env->type_env,e);
2838 vassert(e);
2839 vassert(ty == Ity_F64);
2840
2841 if (e->tag == Iex_RdTmp) {
2842 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2843 }
2844
2845 if (e->tag == Iex_Const) {
2846 union { ULong u64; Double f64; } u;
2847 HReg res = newVRegV(env);
2848 HReg tmp = newVRegI(env);
2849 vassert(sizeof(u) == 8);
2850 vassert(sizeof(u.u64) == 8);
2851 vassert(sizeof(u.f64) == 8);
2852
2853 if (e->Iex.Const.con->tag == Ico_F64) {
2854 u.f64 = e->Iex.Const.con->Ico.F64;
2855 }
2856 else if (e->Iex.Const.con->tag == Ico_F64i) {
2857 u.u64 = e->Iex.Const.con->Ico.F64i;
2858 }
2859 else
2860 vpanic("iselDblExpr(amd64): const");
2861
2862 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2863 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2864 addInstr(env, AMD64Instr_SseLdSt(
2865 True/*load*/, 8, res,
2866 AMD64AMode_IR(0, hregAMD64_RSP())
2867 ));
2868 add_to_rsp(env, 8);
2869 return res;
2870 }
2871
2872 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2873 AMD64AMode* am;
2874 HReg res = newVRegV(env);
2875 vassert(e->Iex.Load.ty == Ity_F64);
2876 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2877 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2878 return res;
2879 }
2880
2881 if (e->tag == Iex_Get) {
2882 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2883 hregAMD64_RBP() );
2884 HReg res = newVRegV(env);
2885 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2886 return res;
2887 }
2888
2889 if (e->tag == Iex_GetI) {
2890 AMD64AMode* am
2891 = genGuestArrayOffset(
2892 env, e->Iex.GetI.descr,
2893 e->Iex.GetI.ix, e->Iex.GetI.bias );
2894 HReg res = newVRegV(env);
2895 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2896 return res;
2897 }
2898
2899 if (e->tag == Iex_Triop) {
2900 IRTriop *triop = e->Iex.Triop.details;
2901 AMD64SseOp op = Asse_INVALID;
2902 switch (triop->op) {
2903 case Iop_AddF64: op = Asse_ADDF; break;
2904 case Iop_SubF64: op = Asse_SUBF; break;
2905 case Iop_MulF64: op = Asse_MULF; break;
2906 case Iop_DivF64: op = Asse_DIVF; break;
2907 default: break;
2908 }
2909 if (op != Asse_INVALID) {
2910 HReg dst = newVRegV(env);
2911 HReg argL = iselDblExpr(env, triop->arg2);
2912 HReg argR = iselDblExpr(env, triop->arg3);
2913 addInstr(env, mk_vMOVsd_RR(argL, dst));
2914 /* XXXROUNDINGFIXME */
2915 /* set roundingmode here */
2916 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2917 return dst;
2918 }
2919 }
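   /* argL is copied into dst first because these SSE arithmetic ops
      are two-operand: addsd/subsd/mulsd/divsd overwrite their
      destination with (dst op src). */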
2920
2921 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2922 IRQop *qop = e->Iex.Qop.details;
2923 HReg dst = newVRegV(env);
2924 HReg argX = iselDblExpr(env, qop->arg2);
2925 HReg argY = iselDblExpr(env, qop->arg3);
2926 HReg argZ = iselDblExpr(env, qop->arg4);
2927 /* XXXROUNDINGFIXME */
2928 /* set roundingmode here */
2929 /* subq $32, %rsp -- make a space*/
2930 sub_from_rsp(env, 32);
2931 /* Prepare 4 arg regs:
2932 leaq 0(%rsp), %rdi
2933 leaq 8(%rsp), %rsi
2934 leaq 16(%rsp), %rdx
2935 leaq 24(%rsp), %rcx
2936 */
2937 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2938 hregAMD64_RDI()));
2939 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2940 hregAMD64_RSI()));
2941 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2942 hregAMD64_RDX()));
2943 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2944 hregAMD64_RCX()));
2945 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2946 movsd %argX, 0(%rsi)
2947 movsd %argY, 0(%rdx)
2948 movsd %argZ, 0(%rcx)
2949 */
2950 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2951 AMD64AMode_IR(0, hregAMD64_RSI())));
2952 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2953 AMD64AMode_IR(0, hregAMD64_RDX())));
2954 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2955 AMD64AMode_IR(0, hregAMD64_RCX())));
2956 /* call the helper */
2957 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2958 (ULong)(HWord)h_generic_calc_MAddF64,
2959 4, mk_RetLoc_simple(RLPri_None) ));
2960       /* fetch the result from the scratch space at 0(%rsp), which
2961          was reserved above and is freed again just below. */
2962 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2963 AMD64AMode_IR(0, hregAMD64_RSP())));
2964 /* and finally, clear the space */
2965 add_to_rsp(env, 32);
2966 return dst;
2967 }
2968
2969 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2970 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2971 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
2972 HReg dst = newVRegV(env);
2973
2974       /* arg now holds the value to be rounded.  The first thing to
2975          do is set the FPU's rounding mode accordingly. */
2976
2977 /* Set host x87 rounding mode */
2978 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2979
2980 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2981 addInstr(env, AMD64Instr_A87Free(1));
2982 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2983 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2984 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2985 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2986
2987 /* Restore default x87 rounding. */
2988 set_FPU_rounding_default( env );
2989
2990 return dst;
2991 }
2992
2993 IRTriop *triop = e->Iex.Triop.details;
2994 if (e->tag == Iex_Triop
2995 && (triop->op == Iop_ScaleF64
2996 || triop->op == Iop_AtanF64
2997 || triop->op == Iop_Yl2xF64
2998 || triop->op == Iop_Yl2xp1F64
2999 || triop->op == Iop_PRemF64
3000 || triop->op == Iop_PRem1F64)
3001 ) {
3002 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3003 HReg arg1 = iselDblExpr(env, triop->arg2);
3004 HReg arg2 = iselDblExpr(env, triop->arg3);
3005 HReg dst = newVRegV(env);
3006 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3007 || triop->op == Iop_PRemF64
3008 || triop->op == Iop_PRem1F64);
3009 addInstr(env, AMD64Instr_A87Free(2));
3010
3011 /* one arg -> top of x87 stack */
3012 addInstr(env, AMD64Instr_SseLdSt(
3013 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3014 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3015
3016 /* other arg -> top of x87 stack */
3017 addInstr(env, AMD64Instr_SseLdSt(
3018 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3019 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3020
3021 /* do it */
3022 /* XXXROUNDINGFIXME */
3023 /* set roundingmode here */
3024 switch (triop->op) {
3025 case Iop_ScaleF64:
3026 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3027 break;
3028 case Iop_AtanF64:
3029 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3030 break;
3031 case Iop_Yl2xF64:
3032 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3033 break;
3034 case Iop_Yl2xp1F64:
3035 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3036 break;
3037 case Iop_PRemF64:
3038 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3039 break;
3040 case Iop_PRem1F64:
3041 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3042 break;
3043 default:
3044 vassert(0);
3045 }
3046
3047 /* save result */
3048 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3049 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3050 return dst;
3051 }
3052
3053 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3054 HReg dst = newVRegV(env);
3055 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3056 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3057 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3058 set_SSE_rounding_default( env );
3059 return dst;
3060 }
3061
3062 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3063 HReg dst = newVRegV(env);
3064 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3065 set_SSE_rounding_default( env );
3066 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3067 return dst;
3068 }
3069
3070 if (e->tag == Iex_Unop
3071 && (e->Iex.Unop.op == Iop_NegF64
3072 || e->Iex.Unop.op == Iop_AbsF64)) {
3073 /* Sigh ... very rough code. Could do much better. */
3074 /* Get the 128-bit literal 00---0 10---0 into a register
3075 and xor/nand it with the value to be negated. */
3076 HReg r1 = newVRegI(env);
3077 HReg dst = newVRegV(env);
3078 HReg tmp = newVRegV(env);
3079 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3080 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3081 addInstr(env, mk_vMOVsd_RR(src,tmp));
3082 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3083 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3084 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3085 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3086
3087 if (e->Iex.Unop.op == Iop_NegF64)
3088 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3089 else
3090 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3091
3092 add_to_rsp(env, 16);
3093 return dst;
3094 }
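   /* dst now holds a constant with only bit 63 of the low lane set.
      XOR with it flips the sign bit (negation), while ANDN computes
      ~dst & tmp, clearing the sign bit (absolute value). */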
3095
3096 if (e->tag == Iex_Binop) {
3097 A87FpOp fpop = Afp_INVALID;
3098 switch (e->Iex.Binop.op) {
3099 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3100 case Iop_SinF64: fpop = Afp_SIN; break;
3101 case Iop_CosF64: fpop = Afp_COS; break;
3102 case Iop_TanF64: fpop = Afp_TAN; break;
3103 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3104 default: break;
3105 }
3106 if (fpop != Afp_INVALID) {
3107 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3108 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3109 HReg dst = newVRegV(env);
3110 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3111 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3112 addInstr(env, AMD64Instr_A87Free(nNeeded));
3113 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3114 /* XXXROUNDINGFIXME */
3115 /* set roundingmode here */
3116 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3117 codes. I don't think that matters, since this insn
3118 selector never generates such an instruction intervening
3119             between a flag-setting instruction and a flag-using
3120 instruction. */
3121 addInstr(env, AMD64Instr_A87FpOp(fpop));
3122 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3123 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3124 return dst;
3125 }
3126 }
3127
3128 if (e->tag == Iex_Unop) {
3129 switch (e->Iex.Unop.op) {
3130 //.. case Iop_I32toF64: {
3131 //.. HReg dst = newVRegF(env);
3132 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3133 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3134 //.. set_FPU_rounding_default(env);
3135 //.. addInstr(env, X86Instr_FpLdStI(
3136 //.. True/*load*/, 4, dst,
3137 //.. X86AMode_IR(0, hregX86_ESP())));
3138 //.. add_to_esp(env, 4);
3139 //.. return dst;
3140 //.. }
3141 case Iop_ReinterpI64asF64: {
3142 /* Given an I64, produce an IEEE754 double with the same
3143 bit pattern. */
3144 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3145 HReg dst = newVRegV(env);
3146 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3147 /* paranoia */
3148 set_SSE_rounding_default(env);
3149 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3150 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3151 return dst;
3152 }
3153 case Iop_F32toF64: {
3154 HReg f32;
3155 HReg f64 = newVRegV(env);
3156 /* this shouldn't be necessary, but be paranoid ... */
3157 set_SSE_rounding_default(env);
3158 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3159 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3160 return f64;
3161 }
3162 default:
3163 break;
3164 }
3165 }
3166
3167 /* --------- MULTIPLEX --------- */
3168 if (e->tag == Iex_ITE) { // VFD
3169 HReg r1, r0, dst;
3170 vassert(ty == Ity_F64);
3171 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3172 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3173 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3174 dst = newVRegV(env);
3175 addInstr(env, mk_vMOVsd_RR(r1,dst));
3176 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3177 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3178 return dst;
3179 }
3180
3181 ppIRExpr(e);
3182 vpanic("iselDblExpr_wrk");
3183 }
3184
3185
3186 /*---------------------------------------------------------*/
3187 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3188 /*---------------------------------------------------------*/
3189
3190 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3191 {
3192 HReg r = iselVecExpr_wrk( env, e );
3193 # if 0
3194 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3195 # endif
3196 vassert(hregClass(r) == HRcVec128);
3197 vassert(hregIsVirtual(r));
3198 return r;
3199 }
3200
3201
3202 /* DO NOT CALL THIS DIRECTLY */
3203 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3204 {
3205 HWord fn = 0; /* address of helper fn, if required */
3206 Bool arg1isEReg = False;
3207 AMD64SseOp op = Asse_INVALID;
3208 vassert(e);
3209 IRType ty = typeOfIRExpr(env->type_env, e);
3210 vassert(ty == Ity_V128);
3211 UInt laneBits = 0;
3212
3213 if (e->tag == Iex_RdTmp) {
3214 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3215 }
3216
3217 if (e->tag == Iex_Get) {
3218 HReg dst = newVRegV(env);
3219 addInstr(env, AMD64Instr_SseLdSt(
3220 True/*load*/,
3221 16,
3222 dst,
3223 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3224 )
3225 );
3226 return dst;
3227 }
3228
3229 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3230 HReg dst = newVRegV(env);
3231 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3232 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3233 return dst;
3234 }
3235
3236 if (e->tag == Iex_Const) {
3237 HReg dst = newVRegV(env);
3238 vassert(e->Iex.Const.con->tag == Ico_V128);
3239 switch (e->Iex.Const.con->Ico.V128) {
3240 case 0x0000:
3241 dst = generate_zeroes_V128(env);
3242 break;
3243 case 0xFFFF:
3244 dst = generate_ones_V128(env);
3245 break;
3246 default: {
3247 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3248 /* do push_uimm64 twice, first time for the high-order half. */
3249 push_uimm64(env, bitmask8_to_bytemask64(
3250 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3251 ));
3252 push_uimm64(env, bitmask8_to_bytemask64(
3253 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3254 ));
3255 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3256 add_to_rsp(env, 16);
3257 break;
3258 }
3259 }
3260 return dst;
3261 }
3262
3263 if (e->tag == Iex_Unop) {
3264 switch (e->Iex.Unop.op) {
3265
3266 case Iop_NotV128: {
3267 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3268 return do_sse_NotV128(env, arg);
3269 }
3270
3271 case Iop_CmpNEZ64x2: {
3272 /* We can use SSE2 instructions for this. */
3273 /* Ideally, we want to do a 64Ix2 comparison against zero of
3274 the operand. Problem is no such insn exists. Solution
3275 therefore is to do a 32Ix4 comparison instead, and bitwise-
3276 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3277 let the not'd result of this initial comparison be a:b:c:d.
3278 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3279 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3280 giving the required result.
3281
3282 The required selection sequence is 2,3,0,1, which
3283 according to Intel's documentation means the pshufd
3284 literal value is 0xB1, that is,
3285 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3286 */
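         /* Worked sketch of the sequence emitted below, lanes written
            as 32-bit a:b:c:d:
              cmpeq32 vs zero, then NOT  -> a:b:c:d
              shuffle $0xB1              -> b:a:d:c
              or                         -> (a|b):(a|b):(c|d):(c|d)
            so each 64-bit lane ends up all-ones iff the original
            64-bit lane was nonzero. */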
3287 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3288 HReg tmp = generate_zeroes_V128(env);
3289 HReg dst = newVRegV(env);
3290 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3291 tmp = do_sse_NotV128(env, tmp);
3292 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3293 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3294 return dst;
3295 }
3296
3297 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3298 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3299 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3300 do_CmpNEZ_vector:
3301 {
3302 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3303 HReg tmp = newVRegV(env);
3304 HReg zero = generate_zeroes_V128(env);
3305 HReg dst;
3306 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3307 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3308 dst = do_sse_NotV128(env, tmp);
3309 return dst;
3310 }
3311
3312 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3313 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3314 do_32Fx4_unary:
3315 {
3316 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3317 HReg dst = newVRegV(env);
3318 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3319 return dst;
3320 }
3321
3322 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3323 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3324 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3325 do_32F0x4_unary:
3326 {
3327 /* A bit subtle. We have to copy the arg to the result
3328 register first, because actually doing the SSE scalar insn
3329 leaves the upper 3/4 of the destination register
3330 unchanged. Whereas the required semantics of these
3331 primops is that the upper 3/4 is simply copied in from the
3332 argument. */
3333 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3334 HReg dst = newVRegV(env);
3335 addInstr(env, mk_vMOVsd_RR(arg, dst));
3336 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3337 return dst;
3338 }
3339
3340 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3341 do_64F0x2_unary:
3342 {
3343 /* A bit subtle. We have to copy the arg to the result
3344 register first, because actually doing the SSE scalar insn
3345 leaves the upper half of the destination register
3346 unchanged. Whereas the required semantics of these
3347 primops is that the upper half is simply copied in from the
3348 argument. */
3349 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3350 HReg dst = newVRegV(env);
3351 addInstr(env, mk_vMOVsd_RR(arg, dst));
3352 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3353 return dst;
3354 }
3355
3356 case Iop_32UtoV128: {
3357 HReg dst = newVRegV(env);
3358 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
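         /* Note (an inference, not stated in the source): the 4-byte
            scratch slot sits just below %rsp with no stack adjustment,
            i.e. in the red zone, which the generated code apparently
            treats as safe scratch space. */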
3359 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3360 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3361 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3362 return dst;
3363 }
3364
3365 case Iop_64UtoV128: {
3366 HReg dst = newVRegV(env);
3367 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3368 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3369 addInstr(env, AMD64Instr_Push(rmi));
3370 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3371 add_to_rsp(env, 8);
3372 return dst;
3373 }
3374
3375 case Iop_V256toV128_0:
3376 case Iop_V256toV128_1: {
3377 HReg vHi, vLo;
3378 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3379 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3380 }
3381
3382 default:
3383 break;
3384 } /* switch (e->Iex.Unop.op) */
3385 } /* if (e->tag == Iex_Unop) */
3386
3387 if (e->tag == Iex_Binop) {
3388 switch (e->Iex.Binop.op) {
3389
3390 case Iop_Sqrt64Fx2:
3391 case Iop_Sqrt32Fx4: {
3392 /* :: (rmode, vec) -> vec */
3393 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3394 HReg dst = newVRegV(env);
3395 /* XXXROUNDINGFIXME */
3396 /* set roundingmode here */
3397 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3398 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3399 (Asse_SQRTF, arg, dst));
3400 return dst;
3401 }
3402
3403 /* FIXME: could we generate MOVQ here? */
3404 case Iop_SetV128lo64: {
3405 HReg dst = newVRegV(env);
3406 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3407 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3408 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3409 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3410 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3411 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3412 return dst;
3413 }
3414
3415 /* FIXME: could we generate MOVD here? */
3416 case Iop_SetV128lo32: {
3417 HReg dst = newVRegV(env);
3418 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3419 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3420 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3421 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3422 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3423 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3424 return dst;
3425 }
3426
3427 case Iop_64HLtoV128: {
3428 const IRExpr* arg1 = e->Iex.Binop.arg1;
3429 const IRExpr* arg2 = e->Iex.Binop.arg2;
3430 HReg dst = newVRegV(env);
3431 HReg tmp = newVRegV(env);
3432 HReg qHi = iselIntExpr_R(env, arg1);
3433 // If the args are trivially the same (tmp or const), use the same
3434 // source register for both, and only one movq since those are
3435 // (relatively) expensive.
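         // Sketch of the equal-args path, with q = the common value:
         //   dst = movq q        ; dst = 0:q
         //   tmp = dst           ; tmp = 0:q
         //   dst = dst << 64     ; dst = q:0
         //   dst = dst | tmp     ; dst = q:q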
3436 if (areAtomsAndEqual(arg1, arg2)) {
3437 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3438 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3439 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3440 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3441 } else {
3442 HReg qLo = iselIntExpr_R(env, arg2);
3443 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3444 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3445 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3446 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3447 }
3448 return dst;
3449 }
3450
3451 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3452 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3453 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3454 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3455 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3456 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3457 do_32Fx4:
3458 {
3459 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3460 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3461 HReg dst = newVRegV(env);
3462 addInstr(env, mk_vMOVsd_RR(argL, dst));
3463 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3464 return dst;
3465 }
3466
3467 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3468 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3469 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3470 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3471 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3472 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3473 do_64Fx2:
3474 {
3475 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3476 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3477 HReg dst = newVRegV(env);
3478 addInstr(env, mk_vMOVsd_RR(argL, dst));
3479 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3480 return dst;
3481 }
3482
3483 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3484 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3485 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3486 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3487 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3488 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3489 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3490 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3491 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3492 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3493 do_32F0x4: {
3494 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3495 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3496 HReg dst = newVRegV(env);
3497 addInstr(env, mk_vMOVsd_RR(argL, dst));
3498 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3499 return dst;
3500 }
3501
3502 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3503 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3504 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3505 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3506 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3507 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3508 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3509 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3510 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3511 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3512 do_64F0x2: {
3513 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3514 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3515 HReg dst = newVRegV(env);
3516 addInstr(env, mk_vMOVsd_RR(argL, dst));
3517 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3518 return dst;
3519 }
3520
3521 case Iop_PermOrZero8x16:
3522 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3523 op = Asse_PSHUFB;
3524 goto do_SseReRg;
3525 }
3526 // Otherwise we'll have to generate a call to
3527 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3528 // host which doesn't have SSSE3, in which case we don't expect this
3529 // IROp to enter the compilation pipeline in the first place.
3530 break;
3531
3532 case Iop_QNarrowBin32Sto16Sx8:
3533 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3534 case Iop_QNarrowBin16Sto8Sx16:
3535 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3536 case Iop_QNarrowBin16Sto8Ux16:
3537 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3538
3539 case Iop_InterleaveHI8x16:
3540 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3541 case Iop_InterleaveHI16x8:
3542 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3543 case Iop_InterleaveHI32x4:
3544 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3545 case Iop_InterleaveHI64x2:
3546 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3547
3548 case Iop_InterleaveLO8x16:
3549 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3550 case Iop_InterleaveLO16x8:
3551 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3552 case Iop_InterleaveLO32x4:
3553 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3554 case Iop_InterleaveLO64x2:
3555 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3556
3557 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3558 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3559 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3560 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3561 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3562 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3563 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3564 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3565 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3566 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3567 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3568 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3569 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3570 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3571 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3572 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3573 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3574 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3575 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3576 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3577 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3578 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3579 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3580 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3581 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3582 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3583 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3584 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3585 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3586 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3587 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3588 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3589 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3590 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3591 do_SseReRg: {
3592 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3593 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3594 HReg dst = newVRegV(env);
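         /* For the non-commutative cases above (the packs and
            unpacks), arg1isEReg is set so that, presumably, IR arg1
            ends up as the E (r/m) operand of the instruction while
            arg2 seeds dst; for the commutative ops the natural
            ordering is used. */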
3595 if (arg1isEReg) {
3596 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3597 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3598 } else {
3599 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3600 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3601 }
3602 return dst;
3603 }
3604
3605 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3606 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3607 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3608 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3609 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3610 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3611 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3612 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3613 do_SseShift: {
3614 HReg dst = newVRegV(env);
3615 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3616 /* If it's a shift by an in-range immediate, generate a single
3617 instruction. */
3618 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3619 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3620 vassert(c->tag == Ico_U8);
3621 UInt shift = c->Ico.U8;
3622 if (shift < laneBits) {
3623 addInstr(env, mk_vMOVsd_RR(greg, dst));
3624 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3625 return dst;
3626 }
3627 }
3628 /* Otherwise we have to do it the longwinded way. */
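         /* Layout after the two pushes: (%rsp) holds the 64-bit shift
            amount and 8(%rsp) holds zero, so the 16-byte load leaves
            the amount in the low half of ereg, which is where the SSE
            shift-by-register forms expect it. */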
3629 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3630 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3631 HReg ereg = newVRegV(env);
3632 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3633 addInstr(env, AMD64Instr_Push(rmi));
3634 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3635 addInstr(env, mk_vMOVsd_RR(greg, dst));
3636 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3637 add_to_rsp(env, 16);
3638 return dst;
3639 }
3640
3641 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3642 goto do_SseAssistedBinary;
3643 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3644 goto do_SseAssistedBinary;
3645 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3646 goto do_SseAssistedBinary;
3647 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3648 goto do_SseAssistedBinary;
3649 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3650 goto do_SseAssistedBinary;
3651 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3652 goto do_SseAssistedBinary;
3653 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3654 goto do_SseAssistedBinary;
3655 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3656 goto do_SseAssistedBinary;
3657 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3658 goto do_SseAssistedBinary;
3659 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3660 goto do_SseAssistedBinary;
3661 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3662 goto do_SseAssistedBinary;
3663 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3664 goto do_SseAssistedBinary;
3665 case Iop_QNarrowBin32Sto16Ux8:
3666 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3667 goto do_SseAssistedBinary;
3668 case Iop_NarrowBin16to8x16:
3669 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3670 goto do_SseAssistedBinary;
3671 case Iop_NarrowBin32to16x8:
3672 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3673 goto do_SseAssistedBinary;
3674 do_SseAssistedBinary: {
3675 /* RRRufff! RRRufff code is what we're generating here. Oh
3676 well. */
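         /* The helpers are assumed to follow the scheme
              void h_generic_calc_XXX ( V128* res, V128* argL, V128* argR );
            so all traffic goes through memory: the args are stored at
            (%rsi) and (%rdx), and the result is read back from (%rdi),
            which is the same address as 0(%r_argp). */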
3677 vassert(fn != 0);
3678 HReg dst = newVRegV(env);
3679 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3680 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3681 HReg argp = newVRegI(env);
3682          /* subq $112, %rsp -- make a space */
3683 sub_from_rsp(env, 112);
3684 /* leaq 48(%rsp), %r_argp -- point into it */
3685 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3686 argp));
3687 /* andq $-16, %r_argp -- 16-align the pointer */
3688 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3689 AMD64RMI_Imm( ~(UInt)15 ),
3690 argp));
3691 /* Prepare 3 arg regs:
3692 leaq 0(%r_argp), %rdi
3693 leaq 16(%r_argp), %rsi
3694 leaq 32(%r_argp), %rdx
3695 */
3696 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3697 hregAMD64_RDI()));
3698 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3699 hregAMD64_RSI()));
3700 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3701 hregAMD64_RDX()));
3702 /* Store the two args, at (%rsi) and (%rdx):
3703 movupd %argL, 0(%rsi)
3704 movupd %argR, 0(%rdx)
3705 */
3706 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3707 AMD64AMode_IR(0, hregAMD64_RSI())));
3708 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3709 AMD64AMode_IR(0, hregAMD64_RDX())));
3710 /* call the helper */
3711 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3712 3, mk_RetLoc_simple(RLPri_None) ));
3713 /* fetch the result from memory, using %r_argp, which the
3714 register allocator will keep alive across the call. */
3715 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3716 AMD64AMode_IR(0, argp)));
3717 /* and finally, clear the space */
3718 add_to_rsp(env, 112);
3719 return dst;
3720 }
3721
3722 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3723 goto do_SseAssistedVectorAndScalar;
3724 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3725 goto do_SseAssistedVectorAndScalar;
3726 do_SseAssistedVectorAndScalar: {
3727 /* RRRufff! RRRufff code is what we're generating here. Oh
3728 well. */
3729 vassert(fn != 0);
3730 HReg dst = newVRegV(env);
3731 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3732 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3733 HReg argp = newVRegI(env);
3734          /* subq $112, %rsp -- make a space */
3735 sub_from_rsp(env, 112);
3736 /* leaq 48(%rsp), %r_argp -- point into it */
3737 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3738 argp));
3739 /* andq $-16, %r_argp -- 16-align the pointer */
3740 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3741 AMD64RMI_Imm( ~(UInt)15 ),
3742 argp));
3743 /* Prepare 2 vector arg regs:
3744 leaq 0(%r_argp), %rdi
3745 leaq 16(%r_argp), %rsi
3746 */
3747 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3748 hregAMD64_RDI()));
3749 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3750 hregAMD64_RSI()));
3751 /* Store the vector arg, at (%rsi):
3752 movupd %argL, 0(%rsi)
3753 */
3754 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3755 AMD64AMode_IR(0, hregAMD64_RSI())));
3756 /* And get the scalar value into rdx */
3757 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3758
3759 /* call the helper */
3760 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3761 3, mk_RetLoc_simple(RLPri_None) ));
3762 /* fetch the result from memory, using %r_argp, which the
3763 register allocator will keep alive across the call. */
3764 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3765 AMD64AMode_IR(0, argp)));
3766 /* and finally, clear the space */
3767 add_to_rsp(env, 112);
3768 return dst;
3769 }
3770
3771 case Iop_I32StoF32x4:
3772 case Iop_F32toI32Sx4: {
3773 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3774 HReg dst = newVRegV(env);
3775 AMD64SseOp mop
3776 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3777 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3778 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3779 set_SSE_rounding_default(env);
3780 return dst;
3781 }
3782
3783 default:
3784 break;
3785 } /* switch (e->Iex.Binop.op) */
3786 } /* if (e->tag == Iex_Binop) */
3787
3788 if (e->tag == Iex_Triop) {
3789 IRTriop *triop = e->Iex.Triop.details;
3790 switch (triop->op) {
3791
3792 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3793 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3794 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3795 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3796 do_64Fx2_w_rm:
3797 {
3798 HReg argL = iselVecExpr(env, triop->arg2);
3799 HReg argR = iselVecExpr(env, triop->arg3);
3800 HReg dst = newVRegV(env);
3801 addInstr(env, mk_vMOVsd_RR(argL, dst));
3802 /* XXXROUNDINGFIXME */
3803 /* set roundingmode here */
3804 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3805 return dst;
3806 }
3807
3808 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3809 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3810 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3811 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3812 do_32Fx4_w_rm:
3813 {
3814 HReg argL = iselVecExpr(env, triop->arg2);
3815 HReg argR = iselVecExpr(env, triop->arg3);
3816 HReg dst = newVRegV(env);
3817 addInstr(env, mk_vMOVsd_RR(argL, dst));
3818 /* XXXROUNDINGFIXME */
3819 /* set roundingmode here */
3820 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3821 return dst;
3822 }
3823
3824 default:
3825 break;
3826 } /* switch (triop->op) */
3827 } /* if (e->tag == Iex_Triop) */
3828
3829 if (e->tag == Iex_ITE) { // VFD
3830 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3831 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3832 HReg dst = newVRegV(env);
3833 addInstr(env, mk_vMOVsd_RR(r1,dst));
3834 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3835 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3836 return dst;
3837 }
3838
3839 //vec_fail:
3840 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3841 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3842 ppIRExpr(e);
3843 vpanic("iselVecExpr_wrk");
3844 }
3845
3846
3847 /*---------------------------------------------------------*/
3848 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3849 /*---------------------------------------------------------*/
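
/* V256 values are represented as a pair of V128 virtual registers:
   rHi holds the upper 128 bits (lanes 255:128) and rLo the lower 128
   (lanes 127:0).  Every case below produces or consumes such pairs. */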
3850
3851 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3852 ISelEnv* env, const IRExpr* e )
3853 {
3854 iselDVecExpr_wrk( rHi, rLo, env, e );
3855 # if 0
3856 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3857 # endif
3858 vassert(hregClass(*rHi) == HRcVec128);
3859 vassert(hregClass(*rLo) == HRcVec128);
3860 vassert(hregIsVirtual(*rHi));
3861 vassert(hregIsVirtual(*rLo));
3862 }
3863
3864
3865 /* DO NOT CALL THIS DIRECTLY */
3866 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3867 ISelEnv* env, const IRExpr* e )
3868 {
3869 HWord fn = 0; /* address of helper fn, if required */
3870 vassert(e);
3871 IRType ty = typeOfIRExpr(env->type_env, e);
3872 vassert(ty == Ity_V256);
3873 UInt laneBits = 0;
3874
3875 AMD64SseOp op = Asse_INVALID;
3876
3877 /* read 256-bit IRTemp */
3878 if (e->tag == Iex_RdTmp) {
3879 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3880 return;
3881 }
3882
3883 if (e->tag == Iex_Get) {
3884 HReg vHi = newVRegV(env);
3885 HReg vLo = newVRegV(env);
3886 HReg rbp = hregAMD64_RBP();
3887 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3888 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3889 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3890 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3891 *rHi = vHi;
3892 *rLo = vLo;
3893 return;
3894 }
3895
3896 if (e->tag == Iex_Load) {
3897 HReg vHi = newVRegV(env);
3898 HReg vLo = newVRegV(env);
3899 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3900 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3901 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3902 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3903 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3904 *rHi = vHi;
3905 *rLo = vLo;
3906 return;
3907 }
3908
3909 if (e->tag == Iex_Const) {
3910 vassert(e->Iex.Const.con->tag == Ico_V256);
3911 switch (e->Iex.Const.con->Ico.V256) {
3912 case 0x00000000: {
3913 HReg vHi = generate_zeroes_V128(env);
3914 HReg vLo = newVRegV(env);
3915 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3916 *rHi = vHi;
3917 *rLo = vLo;
3918 return;
3919 }
3920 default:
3921             break; /* give up, until such time as handling other constants becomes necessary. */
3922 }
3923 }
3924
3925 if (e->tag == Iex_Unop) {
3926 switch (e->Iex.Unop.op) {
3927
3928 case Iop_NotV256: {
3929 HReg argHi, argLo;
3930 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3931 *rHi = do_sse_NotV128(env, argHi);
3932 *rLo = do_sse_NotV128(env, argLo);
3933 return;
3934 }
3935
3936 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
3937 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
3938 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3939 do_32Fx8_unary:
3940 {
3941 HReg argHi, argLo;
3942 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3943 HReg dstHi = newVRegV(env);
3944 HReg dstLo = newVRegV(env);
3945 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
3946 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
3947 *rHi = dstHi;
3948 *rLo = dstLo;
3949 return;
3950 }
3951
3952 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
3953 do_64Fx4_unary:
3954 {
3955 HReg argHi, argLo;
3956 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3957 HReg dstHi = newVRegV(env);
3958 HReg dstLo = newVRegV(env);
3959 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
3960 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
3961 *rHi = dstHi;
3962 *rLo = dstLo;
3963 return;
3964 }
3965
3966 case Iop_CmpNEZ64x4: {
3967 /* We can use SSE2 instructions for this. */
3968 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
3969 (obviously). See comment on Iop_CmpNEZ64x2 for
3970 explanation of what's going on here. */
3971 HReg argHi, argLo;
3972 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3973 HReg tmpHi = generate_zeroes_V128(env);
3974 HReg tmpLo = newVRegV(env);
3975 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
3976 HReg dstHi = newVRegV(env);
3977 HReg dstLo = newVRegV(env);
3978 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
3979 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
3980 tmpHi = do_sse_NotV128(env, tmpHi);
3981 tmpLo = do_sse_NotV128(env, tmpLo);
3982 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
3983 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
3984 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
3985 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
3986 *rHi = dstHi;
3987 *rLo = dstLo;
3988 return;
3989 }
3990
3991 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3992 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3993 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3994 do_CmpNEZ_vector:
3995 {
3996 HReg argHi, argLo;
3997 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3998 HReg tmpHi = newVRegV(env);
3999 HReg tmpLo = newVRegV(env);
4000 HReg zero = generate_zeroes_V128(env);
4001 HReg dstHi, dstLo;
4002 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4003 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4004 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4005 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4006 dstHi = do_sse_NotV128(env, tmpHi);
4007 dstLo = do_sse_NotV128(env, tmpLo);
4008 *rHi = dstHi;
4009 *rLo = dstLo;
4010 return;
4011 }
4012
4013 default:
4014 break;
4015 } /* switch (e->Iex.Unop.op) */
4016 } /* if (e->tag == Iex_Unop) */
4017
4018 if (e->tag == Iex_Binop) {
4019 switch (e->Iex.Binop.op) {
4020
4021 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4022 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4023 do_64Fx4:
4024 {
4025 HReg argLhi, argLlo, argRhi, argRlo;
4026 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4027 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4028 HReg dstHi = newVRegV(env);
4029 HReg dstLo = newVRegV(env);
4030 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4031 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4032 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4033 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4034 *rHi = dstHi;
4035 *rLo = dstLo;
4036 return;
4037 }
4038
4039 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4040 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4041 do_32Fx8:
4042 {
4043 HReg argLhi, argLlo, argRhi, argRlo;
4044 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4045 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4046 HReg dstHi = newVRegV(env);
4047 HReg dstLo = newVRegV(env);
4048 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4049 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4050 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4051 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4052 *rHi = dstHi;
4053 *rLo = dstLo;
4054 return;
4055 }
4056
4057 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4058 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4059 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4060 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4061 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4062 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4063 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4064 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4065 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4066 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4067 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4068 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4069 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4070 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4071 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4072 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4073 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4074 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4075 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4076 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4077 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4078 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4079 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4080 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4081 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4082 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4083 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4084 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4085 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4086 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4087 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4088 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4089 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4090 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4091 do_SseReRg:
4092 {
4093 HReg argLhi, argLlo, argRhi, argRlo;
4094 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4095 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4096 HReg dstHi = newVRegV(env);
4097 HReg dstLo = newVRegV(env);
4098 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4099 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4100 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4101 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4102 *rHi = dstHi;
4103 *rLo = dstLo;
4104 return;
4105 }
4106
4107 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4108 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4109 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4110 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4111 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4112 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4113 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4114 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4115 do_SseShift: {
4116 HReg dstHi = newVRegV(env);
4117 HReg dstLo = newVRegV(env);
4118 HReg gregHi, gregLo;
4119 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4120          /* If it's a shift by an in-range immediate, generate a single
4121             instruction for each of the two 128-bit halves. */
4122 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4123 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4124 vassert(c->tag == Ico_U8);
4125 UInt shift = c->Ico.U8;
4126 if (shift < laneBits) {
4127 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4128 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4129 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4130 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4131 *rHi = dstHi;
4132 *rLo = dstLo;
4133 return;
4134 }
4135 }
4136 /* Otherwise we have to do it the longwinded way. */
4137 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4138 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4139 HReg ereg = newVRegV(env);
4140 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4141 addInstr(env, AMD64Instr_Push(rmi));
4142 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4143 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4144 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4145 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4146 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4147 add_to_rsp(env, 16);
4148 *rHi = dstHi;
4149 *rLo = dstLo;
4150 return;
4151 }
4152
4153 case Iop_V128HLtoV256: {
4154 // Curiously, there doesn't seem to be any benefit to be had here by
4155 // checking whether arg1 and arg2 are the same, in the style of how
4156 // (eg) 64HLtoV128 is handled elsewhere in this file.
4157 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4158 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4159 return;
4160 }
4161
4162 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4163 goto do_SseAssistedBinary;
4164 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4165 goto do_SseAssistedBinary;
4166 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4167 goto do_SseAssistedBinary;
4168 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4169 goto do_SseAssistedBinary;
4170 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4171 goto do_SseAssistedBinary;
4172 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4173 goto do_SseAssistedBinary;
4174 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4175 goto do_SseAssistedBinary;
4176 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4177 goto do_SseAssistedBinary;
4178 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4179 goto do_SseAssistedBinary;
4180 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4181 goto do_SseAssistedBinary;
4182 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4183 goto do_SseAssistedBinary;
4184 do_SseAssistedBinary: {
4185 /* RRRufff! RRRufff code is what we're generating here. Oh
4186 well. */
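         /* Frame map relative to the 16-aligned %r_argp (a sketch,
            reconstructed from the stores and loads below):
              argp+ 0 .. +15   result, hi half
              argp+16 .. +31   argL,   hi half
              argp+32 .. +47   argR,   hi half
              argp+48 .. +63   result, lo half
              argp+64 .. +79   argL,   lo half
              argp+80 .. +95   argR,   lo half
            The 128-bit helper is simply invoked twice, once per
            half. */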
4187 vassert(fn != 0);
4188 HReg dstHi = newVRegV(env);
4189 HReg dstLo = newVRegV(env);
4190 HReg argLhi, argLlo, argRhi, argRlo;
4191 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4192 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4193 HReg argp = newVRegI(env);
4194          /* subq $160, %rsp -- make a space */
4195 sub_from_rsp(env, 160);
4196 /* leaq 48(%rsp), %r_argp -- point into it */
4197 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4198 argp));
4199 /* andq $-16, %r_argp -- 16-align the pointer */
4200 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4201 AMD64RMI_Imm( ~(UInt)15 ),
4202 argp));
4203 /* Prepare 3 arg regs:
4204 leaq 0(%r_argp), %rdi
4205 leaq 16(%r_argp), %rsi
4206 leaq 32(%r_argp), %rdx
4207 */
4208 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4209 hregAMD64_RDI()));
4210 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4211 hregAMD64_RSI()));
4212 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4213 hregAMD64_RDX()));
4214 /* Store the two high args, at (%rsi) and (%rdx):
4215 movupd %argLhi, 0(%rsi)
4216 movupd %argRhi, 0(%rdx)
4217 */
4218 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4219 AMD64AMode_IR(0, hregAMD64_RSI())));
4220 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4221 AMD64AMode_IR(0, hregAMD64_RDX())));
4222 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4223 movupd %argLlo, 48(%rsi)
4224 movupd %argRlo, 48(%rdx)
4225 */
4226 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4227 AMD64AMode_IR(48, hregAMD64_RSI())));
4228 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4229 AMD64AMode_IR(48, hregAMD64_RDX())));
4230 /* call the helper */
4231 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4232 mk_RetLoc_simple(RLPri_None) ));
4233 /* Prepare 3 arg regs:
4234 leaq 48(%r_argp), %rdi
4235 leaq 64(%r_argp), %rsi
4236 leaq 80(%r_argp), %rdx
4237 */
4238 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4239 hregAMD64_RDI()));
4240 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4241 hregAMD64_RSI()));
4242 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4243 hregAMD64_RDX()));
4244 /* call the helper */
4245 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4246 mk_RetLoc_simple(RLPri_None) ));
4247 /* fetch the result from memory, using %r_argp, which the
4248 register allocator will keep alive across the call. */
4249 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4250 AMD64AMode_IR(0, argp)));
4251 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4252 AMD64AMode_IR(48, argp)));
4253 /* and finally, clear the space */
4254 add_to_rsp(env, 160);
4255 *rHi = dstHi;
4256 *rLo = dstLo;
4257 return;
4258 }
4259
4260 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4261 goto do_SseAssistedBinary256;
4262 do_SseAssistedBinary256: {
4263 /* RRRufff! RRRufff code is what we're generating here. Oh
4264 well. */
4265 vassert(fn != 0);
4266 HReg dstHi = newVRegV(env);
4267 HReg dstLo = newVRegV(env);
4268 HReg argLhi, argLlo, argRhi, argRlo;
4269 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4270 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4271 HReg argp = newVRegI(env);
4272          /* subq $160, %rsp -- make a space */
4273 sub_from_rsp(env, 160);
4274 /* leaq 48(%rsp), %r_argp -- point into it */
4275 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4276 argp));
4277 /* andq $-16, %r_argp -- 16-align the pointer */
4278 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4279 AMD64RMI_Imm( ~(UInt)15 ),
4280 argp));
4281 /* Prepare 3 arg regs:
4282 leaq 0(%r_argp), %rdi
4283 leaq 32(%r_argp), %rsi
4284 leaq 64(%r_argp), %rdx
4285 */
4286 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4287 hregAMD64_RDI()));
4288 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4289 hregAMD64_RSI()));
4290 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4291 hregAMD64_RDX()));
4292 /* Store the two args, at (%rsi) and (%rdx):
4293 movupd %argLlo, 0(%rsi)
4294 movupd %argLhi, 16(%rsi)
4295 movupd %argRlo, 0(%rdx)
4296 movupd %argRhi, 16(%rdx)
4297 */
4298 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4299 AMD64AMode_IR(0, hregAMD64_RSI())));
4300 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4301 AMD64AMode_IR(16, hregAMD64_RSI())));
4302 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4303 AMD64AMode_IR(0, hregAMD64_RDX())));
4304 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4305 AMD64AMode_IR(16, hregAMD64_RDX())));
4306 /* call the helper */
4307 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4308 mk_RetLoc_simple(RLPri_None) ));
4309 /* fetch the result from memory, using %r_argp, which the
4310 register allocator will keep alive across the call. */
4311 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4312 AMD64AMode_IR(0, argp)));
4313 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4314 AMD64AMode_IR(16, argp)));
4315 /* and finally, clear the space */
4316 add_to_rsp(env, 160);
4317 *rHi = dstHi;
4318 *rLo = dstLo;
4319 return;
4320 }
4321
4322 case Iop_I32StoF32x8:
4323 case Iop_F32toI32Sx8: {
4324 HReg argHi, argLo;
4325 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4326 HReg dstHi = newVRegV(env);
4327 HReg dstLo = newVRegV(env);
4328 AMD64SseOp mop
4329 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4330 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4331 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4332 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4333 set_SSE_rounding_default(env);
4334 *rHi = dstHi;
4335 *rLo = dstLo;
4336 return;
4337 }
4338
4339 default:
4340 break;
4341 } /* switch (e->Iex.Binop.op) */
4342 } /* if (e->tag == Iex_Binop) */
4343
4344 if (e->tag == Iex_Triop) {
4345 IRTriop *triop = e->Iex.Triop.details;
4346 switch (triop->op) {
4347
4348 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4349 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4350 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4351 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4352 do_64Fx4_w_rm:
4353 {
4354 HReg argLhi, argLlo, argRhi, argRlo;
4355 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4356 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4357 HReg dstHi = newVRegV(env);
4358 HReg dstLo = newVRegV(env);
4359 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4360 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4361 /* XXXROUNDINGFIXME */
4362 /* set roundingmode here */
4363 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4364 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4365 *rHi = dstHi;
4366 *rLo = dstLo;
4367 return;
4368 }
4369
4370 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4371 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4372 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4373 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4374 do_32Fx8_w_rm:
4375 {
4376 HReg argLhi, argLlo, argRhi, argRlo;
4377 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4378 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4379 HReg dstHi = newVRegV(env);
4380 HReg dstLo = newVRegV(env);
4381 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4382 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4383 /* XXXROUNDINGFIXME */
4384 /* set roundingmode here */
4385 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4386 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4387 *rHi = dstHi;
4388 *rLo = dstLo;
4389 return;
4390 }
4391
4392 default:
4393 break;
4394 } /* switch (triop->op) */
4395 } /* if (e->tag == Iex_Triop) */
4396
4397
4398 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4399 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4400 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4401 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4402 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4403 // If the args are trivially the same (tmp or const), use the same
4404 // source register for all four, and only one movq since those are
4405 // (relatively) expensive.
4406 if (areAtomsAndEqual(arg1, arg2)
4407 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4408 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4409 HReg tmp = newVRegV(env);
4410 HReg dst = newVRegV(env);
4411 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4412 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4413 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4414 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4415 *rHi = dst;
4416 *rLo = dst;
4417 } else {
4418 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4419 HReg q3 = iselIntExpr_R(env, arg1);
4420 HReg q2 = iselIntExpr_R(env, arg2);
4421 HReg q1 = iselIntExpr_R(env, arg3);
4422 HReg q0 = iselIntExpr_R(env, arg4);
4423 HReg tmp = newVRegV(env);
4424 HReg dstHi = newVRegV(env);
4425 HReg dstLo = newVRegV(env);
4426 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4427 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4428 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4429 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4430 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4431 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4432 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4433 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4434 *rHi = dstHi;
4435 *rLo = dstLo;
4436 }
4437 return;
4438 }
4439
4440 if (e->tag == Iex_ITE) {
4441 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4442 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4443 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4444 HReg dstHi = newVRegV(env);
4445 HReg dstLo = newVRegV(env);
4446 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4447 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4448 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4449 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4450 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4451 *rHi = dstHi;
4452 *rLo = dstLo;
4453 return;
4454 }
4455
4456 //avx_fail:
4457 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4458 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4459 ppIRExpr(e);
4460 vpanic("iselDVecExpr_wrk");
4461 }
4462
4463
4464 /*---------------------------------------------------------*/
4465 /*--- ISEL: Statements ---*/
4466 /*---------------------------------------------------------*/
4467
4468 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4469 {
4470 if (vex_traceflags & VEX_TRACE_VCODE) {
4471 vex_printf("\n-- ");
4472 ppIRStmt(stmt);
4473 vex_printf("\n");
4474 }
4475
4476 switch (stmt->tag) {
4477
4478 /* --------- LOADG (guarded load) --------- */
4479 case Ist_LoadG: {
4480 IRLoadG* lg = stmt->Ist.LoadG.details;
4481 if (lg->end != Iend_LE)
4482 goto stmt_fail;
4483
4484 UChar szB = 0; /* invalid */
4485 switch (lg->cvt) {
4486 case ILGop_Ident32: szB = 4; break;
4487 case ILGop_Ident64: szB = 8; break;
4488 case ILGop_IdentV128: szB = 16; break;
4489 default: break;
4490 }
4491 if (szB == 0)
4492 goto stmt_fail;
4493
4494 AMD64AMode* amAddr
4495 = iselIntExpr_AMode(env, lg->addr);
4496 HReg rAlt
4497 = szB == 16 ? iselVecExpr(env, lg->alt)
4498 : iselIntExpr_R(env, lg->alt);
4499 HReg rDst
4500 = lookupIRTemp(env, lg->dst);
4501
4502 /* Get the alt value into the dst. We'll do a conditional load
4503 which overwrites it -- or not -- with loaded data. */
4504 if (szB == 16) {
4505 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4506 } else {
4507 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4508 }
4509 AMD64CondCode cc = iselCondCode(env, lg->guard);
4510 if (szB == 16) {
4511 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4512 } else {
4513 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4514 }
4515 return;
4516 }
4517
4518 /* --------- STOREG (guarded store) --------- */
4519 case Ist_StoreG: {
4520 IRStoreG* sg = stmt->Ist.StoreG.details;
4521 if (sg->end != Iend_LE)
4522 goto stmt_fail;
4523
4524 UChar szB = 0; /* invalid */
4525 switch (typeOfIRExpr(env->type_env, sg->data)) {
4526 case Ity_I32: szB = 4; break;
4527 case Ity_I64: szB = 8; break;
4528 case Ity_V128: szB = 16; break;
4529 default: break;
4530 }
4531 if (szB == 0)
4532 goto stmt_fail;
4533
4534 AMD64AMode* amAddr
4535 = iselIntExpr_AMode(env, sg->addr);
4536 HReg rSrc
4537 = szB == 16 ? iselVecExpr(env, sg->data)
4538 : iselIntExpr_R(env, sg->data);
4539 AMD64CondCode cc
4540 = iselCondCode(env, sg->guard);
4541 if (szB == 16) {
4542 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4543 } else {
4544 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4545 }
4546 return;
4547 }
4548
4549 /* --------- STORE --------- */
4550 case Ist_Store: {
4551 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4552 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4553 IREndness end = stmt->Ist.Store.end;
4554
4555 if (tya != Ity_I64 || end != Iend_LE)
4556 goto stmt_fail;
4557
4558 if (tyd == Ity_I64) {
4559 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4560 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4561 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4562 return;
4563 }
4564 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4565 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4566 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4567 addInstr(env, AMD64Instr_Store(
4568 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4569 r,am));
4570 return;
4571 }
4572 if (tyd == Ity_F64) {
4573 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4574 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4575 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4576 return;
4577 }
4578 if (tyd == Ity_F32) {
4579 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4580 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4581 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4582 return;
4583 }
4584 if (tyd == Ity_V128) {
4585 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4586 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4587 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4588 return;
4589 }
4590 if (tyd == Ity_V256) {
4591 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4592 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4593 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4594 HReg vHi, vLo;
4595 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4596 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4597 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4598 return;
4599 }
4600 break;
4601 }
4602
4603 /* --------- PUT --------- */
4604 case Ist_Put: {
4605 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4606 if (ty == Ity_I64) {
4607 /* We're going to write to memory, so compute the RHS into an
4608 AMD64RI. */
4609 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4610 addInstr(env,
4611 AMD64Instr_Alu64M(
4612 Aalu_MOV,
4613 ri,
4614 AMD64AMode_IR(stmt->Ist.Put.offset,
4615 hregAMD64_RBP())
4616 ));
4617 return;
4618 }
4619 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4620 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4621 addInstr(env, AMD64Instr_Store(
4622 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4623 r,
4624 AMD64AMode_IR(stmt->Ist.Put.offset,
4625 hregAMD64_RBP())));
4626 return;
4627 }
4628 if (ty == Ity_F32) {
4629 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4630 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4631 set_SSE_rounding_default(env); /* paranoia */
4632 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4633 return;
4634 }
4635 if (ty == Ity_F64) {
4636 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4637 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4638 hregAMD64_RBP() );
4639 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4640 return;
4641 }
4642 if (ty == Ity_V128) {
4643 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4644 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4645 hregAMD64_RBP());
4646 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4647 return;
4648 }
4649 if (ty == Ity_V256) {
4650 HReg vHi, vLo;
4651 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4652 HReg rbp = hregAMD64_RBP();
4653 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4654 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4655 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4656 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4657 return;
4658 }
4659 break;
4660 }
4661
4662 /* --------- Indexed PUT --------- */
4663 case Ist_PutI: {
4664 IRPutI *puti = stmt->Ist.PutI.details;
4665
4666 AMD64AMode* am
4667 = genGuestArrayOffset(
4668 env, puti->descr,
4669 puti->ix, puti->bias );
4670
4671 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4672 if (ty == Ity_F64) {
4673 HReg val = iselDblExpr(env, puti->data);
4674 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4675 return;
4676 }
4677 if (ty == Ity_I8) {
4678 HReg r = iselIntExpr_R(env, puti->data);
4679 addInstr(env, AMD64Instr_Store( 1, r, am ));
4680 return;
4681 }
4682 if (ty == Ity_I64) {
4683 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4684 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4685 return;
4686 }
4687 break;
4688 }
4689
4690 /* --------- TMP --------- */
4691 case Ist_WrTmp: {
4692 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4693 IRType ty = typeOfIRTemp(env->type_env, tmp);
4694
4695 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4696 compute it into an AMode and then use LEA. This usually
4697 produces fewer instructions, often because (for memcheck
4698 created IR) we get t = address-expression, (t is later used
4699 twice) and so doing this naturally turns address-expression
4700 back into an AMD64 amode. */
4701 if (ty == Ity_I64
4702 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4703 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4704 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4705 HReg dst = lookupIRTemp(env, tmp);
4706 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4707 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4708 value into a register. Just emit a normal reg-reg move
4709 so reg-alloc can coalesce it away in the usual way. */
4710 HReg src = am->Aam.IR.reg;
4711 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4712 } else {
4713 addInstr(env, AMD64Instr_Lea64(am,dst));
4714 }
4715 return;
4716 }
4717
         if (ty == Ity_I64 || ty == Ity_I32
             || ty == Ity_I16 || ty == Ity_I8) {
            AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
            HReg dst = lookupIRTemp(env, tmp);
            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
            return;
         }
         if (ty == Ity_I128) {
            HReg rHi, rLo, dstHi, dstLo;
            iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
            lookupIRTempPair( &dstHi, &dstLo, env, tmp);
            addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
            addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
            return;
         }
         if (ty == Ity_I1) {
            AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
            HReg dst = lookupIRTemp(env, tmp);
            addInstr(env, AMD64Instr_Set64(cond, dst));
            return;
         }
         if (ty == Ity_F64) {
            HReg dst = lookupIRTemp(env, tmp);
            HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
            addInstr(env, mk_vMOVsd_RR(src, dst));
            return;
         }
         if (ty == Ity_F32) {
            HReg dst = lookupIRTemp(env, tmp);
            HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
            addInstr(env, mk_vMOVsd_RR(src, dst));
            return;
         }
         if (ty == Ity_V128) {
            HReg dst = lookupIRTemp(env, tmp);
            HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
            addInstr(env, mk_vMOVsd_RR(src, dst));
            return;
         }
         if (ty == Ity_V256) {
            HReg rHi, rLo, dstHi, dstLo;
            iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
            lookupIRTempPair( &dstHi, &dstLo, env, tmp);
            addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
            addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
            return;
         }
         break;
      }

      /* --------- Call to DIRTY helper --------- */
      case Ist_Dirty: {
         IRDirty* d = stmt->Ist.Dirty.details;

         /* Figure out the return type, if any. */
         IRType retty = Ity_INVALID;
         if (d->tmp != IRTemp_INVALID)
            retty = typeOfIRTemp(env->type_env, d->tmp);

         /* Throw out any return types we don't know about. */
         Bool retty_ok = False;
         switch (retty) {
            case Ity_INVALID: /* function doesn't return anything */
            case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
            case Ity_V128: case Ity_V256:
               retty_ok = True; break;
            default:
               break;
         }
         if (!retty_ok)
            break; /* will go to stmt_fail: */

         /* Marshal args, do the call, and set the return value to
            0x555..555 if this is a conditional call that returns a value
            and the call is skipped. */
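         /* (For example: a helper returning Ity_V128 gets an
            RLPri_V128SpRel return location -- doHelperCall reserves
            stack space for the result, and rloc.spOff says where it
            lives relative to %rsp; see the Ity_V128 case below.) */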
         UInt   addToSp = 0;
         RetLoc rloc    = mk_RetLoc_INVALID();
         doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
         vassert(is_sane_RetLoc(rloc));

         /* Now figure out what to do with the returned value, if any. */
         switch (retty) {
            case Ity_INVALID: {
               /* No return value.  Nothing to do. */
               vassert(d->tmp == IRTemp_INVALID);
               vassert(rloc.pri == RLPri_None);
               vassert(addToSp == 0);
               return;
            }
            case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
               /* The returned value is in %rax.  Park it in the register
                  associated with tmp. */
               vassert(rloc.pri == RLPri_Int);
               vassert(addToSp == 0);
               HReg dst = lookupIRTemp(env, d->tmp);
               addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
               return;
            }
            case Ity_V128: {
               /* The returned value is on the stack, and rloc.spOff
                  tells us where.  Fish it off the stack and then move
                  the stack pointer upwards to clear it, as directed by
                  doHelperCall. */
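               /* A sketch of what this amounts to (the exact opcodes
                  are the instruction emitter's business):
                     <16-byte vector load from spOff(%rsp) into dst>
                     addq $addToSp, %rsp */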
               vassert(rloc.pri == RLPri_V128SpRel);
               vassert(addToSp >= 16);
               HReg        dst = lookupIRTemp(env, d->tmp);
               AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
               addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
               add_to_rsp(env, addToSp);
               return;
            }
            case Ity_V256: {
               /* See comments for Ity_V128. */
               vassert(rloc.pri == RLPri_V256SpRel);
               vassert(addToSp >= 32);
               HReg        dstLo, dstHi;
               lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
               AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
               addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
               AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
               addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
               add_to_rsp(env, addToSp);
               return;
            }
            default:
               /*NOTREACHED*/
               vassert(0);
         }
         break;
      }

      /* --------- MEM FENCE --------- */
      case Ist_MBE:
         switch (stmt->Ist.MBE.event) {
            case Imbe_Fence:
               addInstr(env, AMD64Instr_MFence());
               return;
            default:
               break;
         }
         break;

      /* --------- ACAS --------- */
      case Ist_CAS:
         if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
            /* "normal" singleton CAS */
            UChar  sz;
            IRCAS* cas = stmt->Ist.CAS.details;
            IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
            /* get: cas->expd into %rax, and cas->data into %rbx */
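            /* The emitted sequence is roughly this (64-bit case; a
               sketch, using the vreg names below for illustration):
                  movq rExpd, rOld
                  movq rExpd, %rax
                  movq rData, %rbx
                  lock cmpxchgq %rbx, (am)   -- AMD64Instr_ACAS
                  cmovnzq %rax, rOld         -- failure: %rax holds the
                                                value actually seen
               On success, rOld already holds the expected (== old)
               value, so no fix-up is needed. */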
            AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
            HReg rData = iselIntExpr_R(env, cas->dataLo);
            HReg rExpd = iselIntExpr_R(env, cas->expdLo);
            HReg rOld  = lookupIRTemp(env, cas->oldLo);
            vassert(cas->expdHi == NULL);
            vassert(cas->dataHi == NULL);
            addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
            addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
            addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
            switch (ty) {
               case Ity_I64: sz = 8; break;
               case Ity_I32: sz = 4; break;
               case Ity_I16: sz = 2; break;
               case Ity_I8:  sz = 1; break;
               default: goto unhandled_cas;
            }
            addInstr(env, AMD64Instr_ACAS(am, sz));
            addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
            return;
         } else {
            /* double CAS */
            UChar  sz;
            IRCAS* cas = stmt->Ist.CAS.details;
            IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
            /* only 32-bit and 64-bit allowed in this case */
            /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
            /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
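            /* This matches the fixed-register contract of
               cmpxchg8b/cmpxchg16b: expected value in %rdx:%rax, new
               value in %rcx:%rbx; on failure the value actually seen
               is left in %rdx:%rax and ZF is cleared. */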
            AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
            HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
            HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
            HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
            HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
            HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
            HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
            switch (ty) {
               case Ity_I64:
                  if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
                     goto unhandled_cas; /* we'd have to generate
                                            cmpxchg16b, but the host
                                            doesn't support that */
                  sz = 8;
                  break;
               case Ity_I32:
                  sz = 4;
                  break;
               default:
                  goto unhandled_cas;
            }
            addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
            addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
            addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
            addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
            addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
            addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
            addInstr(env, AMD64Instr_DACAS(am, sz));
            addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
            addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
            return;
         }
         unhandled_cas:
         break;

      /* --------- INSTR MARK --------- */
      /* Doesn't generate any executable code ... */
      case Ist_IMark:
         return;

      /* --------- ABI HINT --------- */
      /* These have no meaning (i.e. no denotation in the IR), so we
         ignore them ... if any actually made it this far. */
      case Ist_AbiHint:
         return;

      /* --------- NO-OP --------- */
      case Ist_NoOp:
         return;

      /* --------- EXIT --------- */
      case Ist_Exit: {
         if (stmt->Ist.Exit.dst->tag != Ico_U64)
            vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");

         AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
         AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
                                             hregAMD64_RBP());

         /* Case: boring transfer to known address */
         if (stmt->Ist.Exit.jk == Ijk_Boring) {
            if (env->chainingAllowed) {
               /* .. almost always true .. */
               /* Skip the event check at the dst if this is a forwards
                  edge. */
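               /* (Rationale, as a sketch of the chaining design: only
                  backward edges can close a loop, so they must enter
                  the destination at the slow entry point, which
                  performs an event check; forward edges may safely
                  skip it.) */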
               Bool toFastEP
                  = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
               if (0) vex_printf("%s", toFastEP ? "Y" : ",");
               addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
                                                amRIP, cc, toFastEP));
            } else {
               /* .. very occasionally .. */
               /* We can't use chaining, so ask for an assisted transfer,
                  as that's the only alternative that is allowable. */
               HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
               addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
            }
            return;
         }

         /* Case: assisted transfer to arbitrary address */
         switch (stmt->Ist.Exit.jk) {
            /* Keep this list in sync with that in iselNext below */
            case Ijk_ClientReq:
            case Ijk_EmWarn:
            case Ijk_NoDecode:
            case Ijk_NoRedir:
            case Ijk_SigSEGV:
            case Ijk_SigTRAP:
            case Ijk_Sys_syscall:
            case Ijk_Sys_int210:
            case Ijk_InvalICache:
            case Ijk_Yield:
            {
               HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
               addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
               return;
            }
            default:
               break;
         }

         /* Do we ever expect to see any other kind? */
         goto stmt_fail;
      }

      default: break;
   }

  stmt_fail:
   ppIRStmt(stmt);
   vpanic("iselStmt(amd64)");
}


/*---------------------------------------------------------*/
/*--- ISEL: Basic block terminators (Nexts)             ---*/
/*---------------------------------------------------------*/

static void iselNext ( ISelEnv* env,
                       IRExpr* next, IRJumpKind jk, Int offsIP )
{
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf( "\n-- PUT(%d) = ", offsIP);
      ppIRExpr( next );
      vex_printf( "; exit-");
      ppIRJumpKind(jk);
      vex_printf( "\n");
   }

   /* Case: boring transfer to known address */
   if (next->tag == Iex_Const) {
      IRConst* cdst = next->Iex.Const.con;
      vassert(cdst->tag == Ico_U64);
      if (jk == Ijk_Boring || jk == Ijk_Call) {
         /* Boring transfer to known address */
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)cdst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "X" : ".");
            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
                                             amRIP, Acc_ALWAYS,
                                             toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
            HReg r = iselIntExpr_R(env, next);
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
   }

   /* Case: call/return (==boring) transfer to any address */
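   /* (Design note: when chaining is allowed we use XIndir, which goes
      via the fast indirect-branch dispatch lookup; otherwise
      XAssisted, which always hands control back to the scheduler,
      carrying the jump kind as the reason.) */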
   switch (jk) {
      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
         } else {
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
      default:
         break;
   }

   /* Case: assisted transfer to arbitrary address */
   switch (jk) {
      /* Keep this list in sync with that for Ist_Exit above */
      case Ijk_ClientReq:
      case Ijk_EmWarn:
      case Ijk_NoDecode:
      case Ijk_NoRedir:
      case Ijk_SigSEGV:
      case Ijk_SigTRAP:
      case Ijk_Sys_syscall:
      case Ijk_Sys_int210:
      case Ijk_InvalICache:
      case Ijk_Yield: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
         return;
      }
      default:
         break;
   }

   vex_printf( "\n-- PUT(%d) = ", offsIP);
   ppIRExpr( next );
   vex_printf( "; exit-");
   ppIRJumpKind(jk);
   vex_printf( "\n");
   vassert(0); // are we expecting any other kind?
}


/*---------------------------------------------------------*/
/*--- Insn selector top-level                           ---*/
/*---------------------------------------------------------*/

/* Translate an entire SB to amd64 code. */

HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                            VexArch arch_host,
                            const VexArchInfo* archinfo_host,
                            const VexAbiInfo* vbi/*UNUSED*/,
                            Int offs_Host_EvC_Counter,
                            Int offs_Host_EvC_FailAddr,
                            Bool chainingAllowed,
                            Bool addProfInc,
                            Addr max_ga )
{
   Int        i, j;
   HReg       hreg, hregHI;
   ISelEnv*   env;
   UInt       hwcaps_host = archinfo_host->hwcaps;
   AMD64AMode *amCounter, *amFailAddr;

   /* sanity ... */
   vassert(arch_host == VexArchAMD64);
   vassert(0 == (hwcaps_host
                 & ~(VEX_HWCAPS_AMD64_SSE3
                     | VEX_HWCAPS_AMD64_SSSE3
                     | VEX_HWCAPS_AMD64_CX16
                     | VEX_HWCAPS_AMD64_LZCNT
                     | VEX_HWCAPS_AMD64_AVX
                     | VEX_HWCAPS_AMD64_RDTSCP
                     | VEX_HWCAPS_AMD64_BMI
                     | VEX_HWCAPS_AMD64_AVX2)));

   /* Check that the host's endianness is as expected. */
   vassert(archinfo_host->endness == VexEndnessLE);

   /* Make up an initial environment to use. */
   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
   env->vreg_ctr = 0;

   /* Set up output code array. */
   env->code = newHInstrArray();

   /* Copy BB's type env. */
   env->type_env = bb->tyenv;

   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
      change as we go along. */
   env->n_vregmap = bb->tyenv->types_used;
   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));

   /* and finally ... */
   env->chainingAllowed = chainingAllowed;
   env->hwcaps          = hwcaps_host;
   env->max_ga          = max_ga;

   /* For each IR temporary, allocate a suitably-kinded virtual
      register. */
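   /* (Kinding summary: every integer type, Ity_I1 included, gets one
      HRcInt64 vreg; Ity_I128 gets a low/high pair of them.  F32, F64
      and V128 each get a single HRcVec128 vreg, since this backend
      keeps floating point in SSE registers; Ity_V256 is represented
      as a pair of V128 halves.) */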
   j = 0;
   for (i = 0; i < env->n_vregmap; i++) {
      hregHI = hreg = INVALID_HREG;
      switch (bb->tyenv->types[i]) {
         case Ity_I1:
         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
            hreg = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_I128:
            hreg   = mkHReg(True, HRcInt64, 0, j++);
            hregHI = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_F32:
         case Ity_F64:
         case Ity_V128:
            hreg = mkHReg(True, HRcVec128, 0, j++);
            break;
         case Ity_V256:
            hreg   = mkHReg(True, HRcVec128, 0, j++);
            hregHI = mkHReg(True, HRcVec128, 0, j++);
            break;
         default:
            ppIRType(bb->tyenv->types[i]);
            vpanic("iselBB(amd64): IRTemp type");
      }
      env->vregmap[i]   = hreg;
      env->vregmapHI[i] = hregHI;
   }
   env->vreg_ctr = j;

   /* The very first instruction must be an event check. */
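   /* (Roughly: decrement the counter at offs_Host_EvC_Counter(%rbp)
      and, if it goes negative, jump to the address stored at
      offs_Host_EvC_FailAddr(%rbp), returning control to the
      scheduler.  A sketch of the effect, not the exact code.) */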
   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));

   /* Possibly a block counter increment (for profiling).  At this
      point we don't know the address of the counter, so just pretend
      it is zero.  It will have to be patched later, but before this
      translation is used, by a call to LibVEX_patchProfCtr. */
   if (addProfInc) {
      addInstr(env, AMD64Instr_ProfInc());
   }

   /* Ok, finally we can iterate over the statements. */
   for (i = 0; i < bb->stmts_used; i++)
      if (bb->stmts[i])
         iselStmt(env, bb->stmts[i]);

   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);

   /* record the number of vregs we used. */
   env->code->n_vregs = env->vreg_ctr;
   return env->code;
}


/*---------------------------------------------------------------*/
/*--- end                                   host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/