1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                                 host_amd64_defs.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2017 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 #include "libvex_basictypes.h"
37 #include "libvex.h"
38 #include "libvex_trc_values.h"
39 
40 #include "main_util.h"
41 #include "host_generic_regs.h"
42 #include "host_amd64_defs.h"
43 
44 
45 /* --------- Registers. --------- */
46 
getRRegUniverse_AMD64(void)47 const RRegUniverse* getRRegUniverse_AMD64 ( void )
48 {
49    /* The real-register universe is a big constant, so we just want to
50       initialise it once. */
51    static RRegUniverse rRegUniverse_AMD64;
52    static Bool         rRegUniverse_AMD64_initted = False;
53 
54    /* Handy shorthand, nothing more */
55    RRegUniverse* ru = &rRegUniverse_AMD64;
56 
57    /* This isn't thread-safe.  Sigh. */
58    if (LIKELY(rRegUniverse_AMD64_initted))
59       return ru;
60 
61    RRegUniverse__init(ru);
62 
63    /* Add the registers.  The initial segment of this array must be
64       those available for allocation by reg-alloc, and those that
65       follow are not available for allocation. */
66    ru->allocable_start[HRcInt64] = ru->size;
67    ru->regs[ru->size++] = hregAMD64_R12();
68    ru->regs[ru->size++] = hregAMD64_R13();
69    ru->regs[ru->size++] = hregAMD64_R14();
70    ru->regs[ru->size++] = hregAMD64_R15();
71    ru->regs[ru->size++] = hregAMD64_RBX();
72    ru->regs[ru->size++] = hregAMD64_RSI();
73    ru->regs[ru->size++] = hregAMD64_RDI();
74    ru->regs[ru->size++] = hregAMD64_R8();
75    ru->regs[ru->size++] = hregAMD64_R9();
76    ru->regs[ru->size++] = hregAMD64_R10();
77    ru->allocable_end[HRcInt64] = ru->size - 1;
78 
79    ru->allocable_start[HRcVec128] = ru->size;
80    ru->regs[ru->size++] = hregAMD64_XMM3();
81    ru->regs[ru->size++] = hregAMD64_XMM4();
82    ru->regs[ru->size++] = hregAMD64_XMM5();
83    ru->regs[ru->size++] = hregAMD64_XMM6();
84    ru->regs[ru->size++] = hregAMD64_XMM7();
85    ru->regs[ru->size++] = hregAMD64_XMM8();
86    ru->regs[ru->size++] = hregAMD64_XMM9();
87    ru->regs[ru->size++] = hregAMD64_XMM10();
88    ru->regs[ru->size++] = hregAMD64_XMM11();
89    ru->regs[ru->size++] = hregAMD64_XMM12();
90    ru->allocable_end[HRcVec128] = ru->size - 1;
91    ru->allocable = ru->size;
92 
93    /* And other regs, not available to the allocator. */
94    ru->regs[ru->size++] = hregAMD64_RAX();
95    ru->regs[ru->size++] = hregAMD64_RCX();
96    ru->regs[ru->size++] = hregAMD64_RDX();
97    ru->regs[ru->size++] = hregAMD64_RSP();
98    ru->regs[ru->size++] = hregAMD64_RBP();
99    ru->regs[ru->size++] = hregAMD64_R11();
100    ru->regs[ru->size++] = hregAMD64_XMM0();
101    ru->regs[ru->size++] = hregAMD64_XMM1();
102 
103    rRegUniverse_AMD64_initted = True;
104 
105    RRegUniverse__check_is_sane(ru);
106    return ru;
107 }
108 
109 
ppHRegAMD64(HReg reg)110 UInt ppHRegAMD64 ( HReg reg )
111 {
112    Int r;
113    static const HChar* ireg64_names[16]
114      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
115          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
116    /* Be generic for all virtual regs. */
117    if (hregIsVirtual(reg)) {
118       return ppHReg(reg);
119    }
120    /* But specific for real regs. */
121    switch (hregClass(reg)) {
122       case HRcInt64:
123          r = hregEncoding(reg);
124          vassert(r >= 0 && r < 16);
125          return vex_printf("%s", ireg64_names[r]);
126       case HRcVec128:
127          r = hregEncoding(reg);
128          vassert(r >= 0 && r < 16);
129          return vex_printf("%%xmm%d", r);
130       default:
131          vpanic("ppHRegAMD64");
132    }
133 }
134 
ppHRegAMD64_lo32(HReg reg)135 static UInt ppHRegAMD64_lo32 ( HReg reg )
136 {
137    Int r;
138    static const HChar* ireg32_names[16]
139      = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
140          "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
141    /* Be generic for all virtual regs. */
142    if (hregIsVirtual(reg)) {
143       UInt written = ppHReg(reg);
144       written += vex_printf("d");
145       return written;
146    }
147    /* But specific for real regs. */
148    switch (hregClass(reg)) {
149       case HRcInt64:
150          r = hregEncoding(reg);
151          vassert(r >= 0 && r < 16);
152          return vex_printf("%s", ireg32_names[r]);
153       default:
154          vpanic("ppHRegAMD64_lo32: invalid regclass");
155    }
156 }
157 
158 
159 /* --------- Condition codes, Intel encoding. --------- */
160 
showAMD64CondCode(AMD64CondCode cond)161 const HChar* showAMD64CondCode ( AMD64CondCode cond )
162 {
163    switch (cond) {
164       case Acc_O:      return "o";
165       case Acc_NO:     return "no";
166       case Acc_B:      return "b";
167       case Acc_NB:     return "nb";
168       case Acc_Z:      return "z";
169       case Acc_NZ:     return "nz";
170       case Acc_BE:     return "be";
171       case Acc_NBE:    return "nbe";
172       case Acc_S:      return "s";
173       case Acc_NS:     return "ns";
174       case Acc_P:      return "p";
175       case Acc_NP:     return "np";
176       case Acc_L:      return "l";
177       case Acc_NL:     return "nl";
178       case Acc_LE:     return "le";
179       case Acc_NLE:    return "nle";
180       case Acc_ALWAYS: return "ALWAYS";
181       default: vpanic("ppAMD64CondCode");
182    }
183 }
184 
185 
186 /* --------- AMD64AMode: memory address expressions. --------- */
187 
AMD64AMode_IR(UInt imm32,HReg reg)188 AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
189    AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
190    am->tag        = Aam_IR;
191    am->Aam.IR.imm = imm32;
192    am->Aam.IR.reg = reg;
193    return am;
194 }
AMD64AMode_IRRS(UInt imm32,HReg base,HReg indEx,Int shift)195 AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
196    AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
197    am->tag = Aam_IRRS;
198    am->Aam.IRRS.imm   = imm32;
199    am->Aam.IRRS.base  = base;
200    am->Aam.IRRS.index = indEx;
201    am->Aam.IRRS.shift = shift;
202    vassert(shift >= 0 && shift <= 3);
203    return am;
204 }
205 
ppAMD64AMode(AMD64AMode * am)206 void ppAMD64AMode ( AMD64AMode* am ) {
207    switch (am->tag) {
208       case Aam_IR:
209          if (am->Aam.IR.imm == 0)
210             vex_printf("(");
211          else
212             vex_printf("0x%x(", am->Aam.IR.imm);
213          ppHRegAMD64(am->Aam.IR.reg);
214          vex_printf(")");
215          return;
216       case Aam_IRRS:
217          vex_printf("0x%x(", am->Aam.IRRS.imm);
218          ppHRegAMD64(am->Aam.IRRS.base);
219          vex_printf(",");
220          ppHRegAMD64(am->Aam.IRRS.index);
221          vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
222          return;
223       default:
224          vpanic("ppAMD64AMode");
225    }
226 }
227 
addRegUsage_AMD64AMode(HRegUsage * u,AMD64AMode * am)228 static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
229    switch (am->tag) {
230       case Aam_IR:
231          addHRegUse(u, HRmRead, am->Aam.IR.reg);
232          return;
233       case Aam_IRRS:
234          addHRegUse(u, HRmRead, am->Aam.IRRS.base);
235          addHRegUse(u, HRmRead, am->Aam.IRRS.index);
236          return;
237       default:
238          vpanic("addRegUsage_AMD64AMode");
239    }
240 }
241 
mapRegs_AMD64AMode(HRegRemap * m,AMD64AMode * am)242 static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
243    switch (am->tag) {
244       case Aam_IR:
245          am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
246          return;
247       case Aam_IRRS:
248          am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
249          am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
250          return;
251       default:
252          vpanic("mapRegs_AMD64AMode");
253    }
254 }
255 
256 /* --------- Operand, which can be reg, immediate or memory. --------- */
257 
AMD64RMI_Imm(UInt imm32)258 AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
259    AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
260    op->tag            = Armi_Imm;
261    op->Armi.Imm.imm32 = imm32;
262    return op;
263 }
AMD64RMI_Reg(HReg reg)264 AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
265    AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
266    op->tag          = Armi_Reg;
267    op->Armi.Reg.reg = reg;
268    return op;
269 }
AMD64RMI_Mem(AMD64AMode * am)270 AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
271    AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
272    op->tag         = Armi_Mem;
273    op->Armi.Mem.am = am;
274    return op;
275 }
276 
ppAMD64RMI_wrk(AMD64RMI * op,Bool lo32)277 static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
278    switch (op->tag) {
279       case Armi_Imm:
280          vex_printf("$0x%x", op->Armi.Imm.imm32);
281          return;
282       case Armi_Reg:
283          if (lo32)
284             ppHRegAMD64_lo32(op->Armi.Reg.reg);
285          else
286             ppHRegAMD64(op->Armi.Reg.reg);
287          return;
288       case Armi_Mem:
289          ppAMD64AMode(op->Armi.Mem.am);
290          return;
291      default:
292          vpanic("ppAMD64RMI");
293    }
294 }
ppAMD64RMI(AMD64RMI * op)295 void ppAMD64RMI ( AMD64RMI* op ) {
296    ppAMD64RMI_wrk(op, False/*!lo32*/);
297 }
ppAMD64RMI_lo32(AMD64RMI * op)298 void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
299    ppAMD64RMI_wrk(op, True/*lo32*/);
300 }
301 
302 /* An AMD64RMI can only be used in a "read" context (what would it mean
303    to write or modify a literal?) and so we enumerate its registers
304    accordingly. */
addRegUsage_AMD64RMI(HRegUsage * u,AMD64RMI * op)305 static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
306    switch (op->tag) {
307       case Armi_Imm:
308          return;
309       case Armi_Reg:
310          addHRegUse(u, HRmRead, op->Armi.Reg.reg);
311          return;
312       case Armi_Mem:
313          addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
314          return;
315       default:
316          vpanic("addRegUsage_AMD64RMI");
317    }
318 }
319 
mapRegs_AMD64RMI(HRegRemap * m,AMD64RMI * op)320 static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
321    switch (op->tag) {
322       case Armi_Imm:
323          return;
324       case Armi_Reg:
325          op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
326          return;
327       case Armi_Mem:
328          mapRegs_AMD64AMode(m, op->Armi.Mem.am);
329          return;
330       default:
331          vpanic("mapRegs_AMD64RMI");
332    }
333 }
334 
335 
336 /* --------- Operand, which can be reg or immediate only. --------- */
337 
AMD64RI_Imm(UInt imm32)338 AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
339    AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
340    op->tag           = Ari_Imm;
341    op->Ari.Imm.imm32 = imm32;
342    return op;
343 }
AMD64RI_Reg(HReg reg)344 AMD64RI* AMD64RI_Reg ( HReg reg ) {
345    AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
346    op->tag         = Ari_Reg;
347    op->Ari.Reg.reg = reg;
348    return op;
349 }
350 
ppAMD64RI(AMD64RI * op)351 void ppAMD64RI ( AMD64RI* op ) {
352    switch (op->tag) {
353       case Ari_Imm:
354          vex_printf("$0x%x", op->Ari.Imm.imm32);
355          return;
356       case Ari_Reg:
357          ppHRegAMD64(op->Ari.Reg.reg);
358          return;
359      default:
360          vpanic("ppAMD64RI");
361    }
362 }
363 
364 /* An AMD64RI can only be used in a "read" context (what would it mean
365    to write or modify a literal?) and so we enumerate its registers
366    accordingly. */
addRegUsage_AMD64RI(HRegUsage * u,AMD64RI * op)367 static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
368    switch (op->tag) {
369       case Ari_Imm:
370          return;
371       case Ari_Reg:
372          addHRegUse(u, HRmRead, op->Ari.Reg.reg);
373          return;
374       default:
375          vpanic("addRegUsage_AMD64RI");
376    }
377 }
378 
mapRegs_AMD64RI(HRegRemap * m,AMD64RI * op)379 static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
380    switch (op->tag) {
381       case Ari_Imm:
382          return;
383       case Ari_Reg:
384          op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
385          return;
386       default:
387          vpanic("mapRegs_AMD64RI");
388    }
389 }
390 
391 
392 /* --------- Operand, which can be reg or memory only. --------- */
393 
AMD64RM_Reg(HReg reg)394 AMD64RM* AMD64RM_Reg ( HReg reg ) {
395    AMD64RM* op       = LibVEX_Alloc_inline(sizeof(AMD64RM));
396    op->tag         = Arm_Reg;
397    op->Arm.Reg.reg = reg;
398    return op;
399 }
AMD64RM_Mem(AMD64AMode * am)400 AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
401    AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
402    op->tag        = Arm_Mem;
403    op->Arm.Mem.am = am;
404    return op;
405 }
406 
ppAMD64RM(AMD64RM * op)407 void ppAMD64RM ( AMD64RM* op ) {
408    switch (op->tag) {
409       case Arm_Mem:
410          ppAMD64AMode(op->Arm.Mem.am);
411          return;
412       case Arm_Reg:
413          ppHRegAMD64(op->Arm.Reg.reg);
414          return;
415      default:
416          vpanic("ppAMD64RM");
417    }
418 }
419 
420 /* Because an AMD64RM can be both a source or destination operand, we
421    have to supply a mode -- pertaining to the operand as a whole --
422    indicating how it's being used. */
addRegUsage_AMD64RM(HRegUsage * u,AMD64RM * op,HRegMode mode)423 static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
424    switch (op->tag) {
425       case Arm_Mem:
426          /* Memory is read, written or modified.  So we just want to
427             know the regs read by the amode. */
428          addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
429          return;
430       case Arm_Reg:
431          /* reg is read, written or modified.  Add it in the
432             appropriate way. */
433          addHRegUse(u, mode, op->Arm.Reg.reg);
434          return;
435      default:
436          vpanic("addRegUsage_AMD64RM");
437    }
438 }
439 
mapRegs_AMD64RM(HRegRemap * m,AMD64RM * op)440 static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
441 {
442    switch (op->tag) {
443       case Arm_Mem:
444          mapRegs_AMD64AMode(m, op->Arm.Mem.am);
445          return;
446       case Arm_Reg:
447          op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
448          return;
449      default:
450          vpanic("mapRegs_AMD64RM");
451    }
452 }
453 
454 
455 /* --------- Instructions. --------- */
456 
showAMD64ScalarSz(Int sz)457 static const HChar* showAMD64ScalarSz ( Int sz ) {
458    switch (sz) {
459       case 2: return "w";
460       case 4: return "l";
461       case 8: return "q";
462       default: vpanic("showAMD64ScalarSz");
463    }
464 }
465 
showAMD64UnaryOp(AMD64UnaryOp op)466 const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
467    switch (op) {
468       case Aun_NOT: return "not";
469       case Aun_NEG: return "neg";
470       default: vpanic("showAMD64UnaryOp");
471    }
472 }
473 
showAMD64AluOp(AMD64AluOp op)474 const HChar* showAMD64AluOp ( AMD64AluOp op ) {
475    switch (op) {
476       case Aalu_MOV:  return "mov";
477       case Aalu_CMP:  return "cmp";
478       case Aalu_ADD:  return "add";
479       case Aalu_SUB:  return "sub";
480       case Aalu_ADC:  return "adc";
481       case Aalu_SBB:  return "sbb";
482       case Aalu_AND:  return "and";
483       case Aalu_OR:   return "or";
484       case Aalu_XOR:  return "xor";
485       case Aalu_MUL:  return "imul";
486       default: vpanic("showAMD64AluOp");
487    }
488 }
489 
showAMD64ShiftOp(AMD64ShiftOp op)490 const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
491    switch (op) {
492       case Ash_SHL: return "shl";
493       case Ash_SHR: return "shr";
494       case Ash_SAR: return "sar";
495       default: vpanic("showAMD64ShiftOp");
496    }
497 }
498 
showA87FpOp(A87FpOp op)499 const HChar* showA87FpOp ( A87FpOp op ) {
500    switch (op) {
501       case Afp_SCALE:  return "scale";
502       case Afp_ATAN:   return "atan";
503       case Afp_YL2X:   return "yl2x";
504       case Afp_YL2XP1: return "yl2xp1";
505       case Afp_PREM:   return "prem";
506       case Afp_PREM1:  return "prem1";
507       case Afp_SQRT:   return "sqrt";
508       case Afp_SIN:    return "sin";
509       case Afp_COS:    return "cos";
510       case Afp_TAN:    return "tan";
511       case Afp_ROUND:  return "round";
512       case Afp_2XM1:   return "2xm1";
513       default: vpanic("showA87FpOp");
514    }
515 }
516 
showAMD64SseOp(AMD64SseOp op)517 const HChar* showAMD64SseOp ( AMD64SseOp op ) {
518    switch (op) {
519       case Asse_MOV:      return "movups";
520       case Asse_ADDF:     return "add";
521       case Asse_SUBF:     return "sub";
522       case Asse_MULF:     return "mul";
523       case Asse_DIVF:     return "div";
524       case Asse_MAXF:     return "max";
525       case Asse_MINF:     return "min";
526       case Asse_CMPEQF:   return "cmpFeq";
527       case Asse_CMPLTF:   return "cmpFlt";
528       case Asse_CMPLEF:   return "cmpFle";
529       case Asse_CMPUNF:   return "cmpFun";
530       case Asse_RCPF:     return "rcp";
531       case Asse_RSQRTF:   return "rsqrt";
532       case Asse_SQRTF:    return "sqrt";
533       case Asse_I2F:      return "cvtdq2ps.";
534       case Asse_F2I:      return "cvtps2dq.";
535       case Asse_AND:      return "and";
536       case Asse_OR:       return "or";
537       case Asse_XOR:      return "xor";
538       case Asse_ANDN:     return "andn";
539       case Asse_ADD8:     return "paddb";
540       case Asse_ADD16:    return "paddw";
541       case Asse_ADD32:    return "paddd";
542       case Asse_ADD64:    return "paddq";
543       case Asse_QADD8U:   return "paddusb";
544       case Asse_QADD16U:  return "paddusw";
545       case Asse_QADD8S:   return "paddsb";
546       case Asse_QADD16S:  return "paddsw";
547       case Asse_SUB8:     return "psubb";
548       case Asse_SUB16:    return "psubw";
549       case Asse_SUB32:    return "psubd";
550       case Asse_SUB64:    return "psubq";
551       case Asse_QSUB8U:   return "psubusb";
552       case Asse_QSUB16U:  return "psubusw";
553       case Asse_QSUB8S:   return "psubsb";
554       case Asse_QSUB16S:  return "psubsw";
555       case Asse_MUL16:    return "pmullw";
556       case Asse_MULHI16U: return "pmulhuw";
557       case Asse_MULHI16S: return "pmulhw";
558       case Asse_AVG8U:    return "pavgb";
559       case Asse_AVG16U:   return "pavgw";
560       case Asse_MAX16S:   return "pmaxw";
561       case Asse_MAX8U:    return "pmaxub";
562       case Asse_MIN16S:   return "pminw";
563       case Asse_MIN8U:    return "pminub";
564       case Asse_CMPEQ8:   return "pcmpeqb";
565       case Asse_CMPEQ16:  return "pcmpeqw";
566       case Asse_CMPEQ32:  return "pcmpeqd";
567       case Asse_CMPGT8S:  return "pcmpgtb";
568       case Asse_CMPGT16S: return "pcmpgtw";
569       case Asse_CMPGT32S: return "pcmpgtd";
570       case Asse_SHL16:    return "psllw";
571       case Asse_SHL32:    return "pslld";
572       case Asse_SHL64:    return "psllq";
573       case Asse_SHL128:   return "pslldq";
574       case Asse_SHR16:    return "psrlw";
575       case Asse_SHR32:    return "psrld";
576       case Asse_SHR64:    return "psrlq";
577       case Asse_SHR128:   return "psrldq";
578       case Asse_SAR16:    return "psraw";
579       case Asse_SAR32:    return "psrad";
580       case Asse_PACKSSD:  return "packssdw";
581       case Asse_PACKSSW:  return "packsswb";
582       case Asse_PACKUSW:  return "packuswb";
583       case Asse_UNPCKHB:  return "punpckhb";
584       case Asse_UNPCKHW:  return "punpckhw";
585       case Asse_UNPCKHD:  return "punpckhd";
586       case Asse_UNPCKHQ:  return "punpckhq";
587       case Asse_UNPCKLB:  return "punpcklb";
588       case Asse_UNPCKLW:  return "punpcklw";
589       case Asse_UNPCKLD:  return "punpckld";
590       case Asse_UNPCKLQ:  return "punpcklq";
591       case Asse_PSHUFB:   return "pshufb";
592       default: vpanic("showAMD64SseOp");
593    }
594 }
595 
AMD64Instr_Imm64(ULong imm64,HReg dst)596 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
597    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
598    i->tag             = Ain_Imm64;
599    i->Ain.Imm64.imm64 = imm64;
600    i->Ain.Imm64.dst   = dst;
601    return i;
602 }
AMD64Instr_Alu64R(AMD64AluOp op,AMD64RMI * src,HReg dst)603 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
604    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
605    i->tag            = Ain_Alu64R;
606    i->Ain.Alu64R.op  = op;
607    i->Ain.Alu64R.src = src;
608    i->Ain.Alu64R.dst = dst;
609    return i;
610 }
AMD64Instr_Alu64M(AMD64AluOp op,AMD64RI * src,AMD64AMode * dst)611 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
612    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
613    i->tag            = Ain_Alu64M;
614    i->Ain.Alu64M.op  = op;
615    i->Ain.Alu64M.src = src;
616    i->Ain.Alu64M.dst = dst;
617    vassert(op != Aalu_MUL);
618    return i;
619 }
AMD64Instr_Sh64(AMD64ShiftOp op,UInt src,HReg dst)620 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
621    AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
622    i->tag          = Ain_Sh64;
623    i->Ain.Sh64.op  = op;
624    i->Ain.Sh64.src = src;
625    i->Ain.Sh64.dst = dst;
626    return i;
627 }
AMD64Instr_Test64(UInt imm32,HReg dst)628 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
629    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
630    i->tag              = Ain_Test64;
631    i->Ain.Test64.imm32 = imm32;
632    i->Ain.Test64.dst   = dst;
633    return i;
634 }
AMD64Instr_Unary64(AMD64UnaryOp op,HReg dst)635 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
636    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
637    i->tag             = Ain_Unary64;
638    i->Ain.Unary64.op  = op;
639    i->Ain.Unary64.dst = dst;
640    return i;
641 }
AMD64Instr_Lea64(AMD64AMode * am,HReg dst)642 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
643    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
644    i->tag             = Ain_Lea64;
645    i->Ain.Lea64.am    = am;
646    i->Ain.Lea64.dst   = dst;
647    return i;
648 }
AMD64Instr_Alu32R(AMD64AluOp op,AMD64RMI * src,HReg dst)649 AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
650    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
651    i->tag            = Ain_Alu32R;
652    i->Ain.Alu32R.op  = op;
653    i->Ain.Alu32R.src = src;
654    i->Ain.Alu32R.dst = dst;
655    switch (op) {
656       case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
657       case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
658       default: vassert(0);
659    }
660    return i;
661 }
AMD64Instr_MulL(Bool syned,AMD64RM * src)662 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
663    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
664    i->tag            = Ain_MulL;
665    i->Ain.MulL.syned = syned;
666    i->Ain.MulL.src   = src;
667    return i;
668 }
AMD64Instr_Div(Bool syned,Int sz,AMD64RM * src)669 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
670    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
671    i->tag            = Ain_Div;
672    i->Ain.Div.syned  = syned;
673    i->Ain.Div.sz     = sz;
674    i->Ain.Div.src    = src;
675    vassert(sz == 4 || sz == 8);
676    return i;
677 }
AMD64Instr_Push(AMD64RMI * src)678 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
679    AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
680    i->tag          = Ain_Push;
681    i->Ain.Push.src = src;
682    return i;
683 }
AMD64Instr_Call(AMD64CondCode cond,Addr64 target,Int regparms,RetLoc rloc)684 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
685                               RetLoc rloc ) {
686    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
687    i->tag               = Ain_Call;
688    i->Ain.Call.cond     = cond;
689    i->Ain.Call.target   = target;
690    i->Ain.Call.regparms = regparms;
691    i->Ain.Call.rloc     = rloc;
692    vassert(regparms >= 0 && regparms <= 6);
693    vassert(is_sane_RetLoc(rloc));
694    return i;
695 }
696 
AMD64Instr_XDirect(Addr64 dstGA,AMD64AMode * amRIP,AMD64CondCode cond,Bool toFastEP)697 AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
698                                  AMD64CondCode cond, Bool toFastEP ) {
699    AMD64Instr* i           = LibVEX_Alloc_inline(sizeof(AMD64Instr));
700    i->tag                  = Ain_XDirect;
701    i->Ain.XDirect.dstGA    = dstGA;
702    i->Ain.XDirect.amRIP    = amRIP;
703    i->Ain.XDirect.cond     = cond;
704    i->Ain.XDirect.toFastEP = toFastEP;
705    return i;
706 }
AMD64Instr_XIndir(HReg dstGA,AMD64AMode * amRIP,AMD64CondCode cond)707 AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
708                                 AMD64CondCode cond ) {
709    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
710    i->tag              = Ain_XIndir;
711    i->Ain.XIndir.dstGA = dstGA;
712    i->Ain.XIndir.amRIP = amRIP;
713    i->Ain.XIndir.cond  = cond;
714    return i;
715 }
AMD64Instr_XAssisted(HReg dstGA,AMD64AMode * amRIP,AMD64CondCode cond,IRJumpKind jk)716 AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
717                                    AMD64CondCode cond, IRJumpKind jk ) {
718    AMD64Instr* i          = LibVEX_Alloc_inline(sizeof(AMD64Instr));
719    i->tag                 = Ain_XAssisted;
720    i->Ain.XAssisted.dstGA = dstGA;
721    i->Ain.XAssisted.amRIP = amRIP;
722    i->Ain.XAssisted.cond  = cond;
723    i->Ain.XAssisted.jk    = jk;
724    return i;
725 }
726 
AMD64Instr_CMov64(AMD64CondCode cond,HReg src,HReg dst)727 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
728    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
729    i->tag             = Ain_CMov64;
730    i->Ain.CMov64.cond = cond;
731    i->Ain.CMov64.src  = src;
732    i->Ain.CMov64.dst  = dst;
733    vassert(cond != Acc_ALWAYS);
734    return i;
735 }
AMD64Instr_CLoad(AMD64CondCode cond,UChar szB,AMD64AMode * addr,HReg dst)736 AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
737                                AMD64AMode* addr, HReg dst ) {
738    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
739    i->tag            = Ain_CLoad;
740    i->Ain.CLoad.cond = cond;
741    i->Ain.CLoad.szB  = szB;
742    i->Ain.CLoad.addr = addr;
743    i->Ain.CLoad.dst  = dst;
744    vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
745    return i;
746 }
AMD64Instr_CStore(AMD64CondCode cond,UChar szB,HReg src,AMD64AMode * addr)747 AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
748                                 HReg src, AMD64AMode* addr ) {
749    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
750    i->tag             = Ain_CStore;
751    i->Ain.CStore.cond = cond;
752    i->Ain.CStore.szB  = szB;
753    i->Ain.CStore.src  = src;
754    i->Ain.CStore.addr = addr;
755    vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
756    return i;
757 }
AMD64Instr_MovxLQ(Bool syned,HReg src,HReg dst)758 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
759    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
760    i->tag              = Ain_MovxLQ;
761    i->Ain.MovxLQ.syned = syned;
762    i->Ain.MovxLQ.src   = src;
763    i->Ain.MovxLQ.dst   = dst;
764    return i;
765 }
AMD64Instr_LoadEX(UChar szSmall,Bool syned,AMD64AMode * src,HReg dst)766 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
767                                 AMD64AMode* src, HReg dst ) {
768    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
769    i->tag                = Ain_LoadEX;
770    i->Ain.LoadEX.szSmall = szSmall;
771    i->Ain.LoadEX.syned   = syned;
772    i->Ain.LoadEX.src     = src;
773    i->Ain.LoadEX.dst     = dst;
774    vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
775    return i;
776 }
AMD64Instr_Store(UChar sz,HReg src,AMD64AMode * dst)777 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
778    AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
779    i->tag           = Ain_Store;
780    i->Ain.Store.sz  = sz;
781    i->Ain.Store.src = src;
782    i->Ain.Store.dst = dst;
783    vassert(sz == 1 || sz == 2 || sz == 4);
784    return i;
785 }
AMD64Instr_Set64(AMD64CondCode cond,HReg dst)786 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
787    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
788    i->tag            = Ain_Set64;
789    i->Ain.Set64.cond = cond;
790    i->Ain.Set64.dst  = dst;
791    return i;
792 }
AMD64Instr_Bsfr64(Bool isFwds,HReg src,HReg dst)793 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
794    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
795    i->tag               = Ain_Bsfr64;
796    i->Ain.Bsfr64.isFwds = isFwds;
797    i->Ain.Bsfr64.src    = src;
798    i->Ain.Bsfr64.dst    = dst;
799    return i;
800 }
AMD64Instr_MFence(void)801 AMD64Instr* AMD64Instr_MFence ( void ) {
802    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
803    i->tag        = Ain_MFence;
804    return i;
805 }
AMD64Instr_ACAS(AMD64AMode * addr,UChar sz)806 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
807    AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
808    i->tag           = Ain_ACAS;
809    i->Ain.ACAS.addr = addr;
810    i->Ain.ACAS.sz   = sz;
811    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
812    return i;
813 }
AMD64Instr_DACAS(AMD64AMode * addr,UChar sz)814 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
815    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
816    i->tag            = Ain_DACAS;
817    i->Ain.DACAS.addr = addr;
818    i->Ain.DACAS.sz   = sz;
819    vassert(sz == 8 || sz == 4);
820    return i;
821 }
822 
AMD64Instr_A87Free(Int nregs)823 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
824 {
825    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
826    i->tag               = Ain_A87Free;
827    i->Ain.A87Free.nregs = nregs;
828    vassert(nregs >= 1 && nregs <= 7);
829    return i;
830 }
AMD64Instr_A87PushPop(AMD64AMode * addr,Bool isPush,UChar szB)831 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
832 {
833    AMD64Instr* i            = LibVEX_Alloc_inline(sizeof(AMD64Instr));
834    i->tag                   = Ain_A87PushPop;
835    i->Ain.A87PushPop.addr   = addr;
836    i->Ain.A87PushPop.isPush = isPush;
837    i->Ain.A87PushPop.szB    = szB;
838    vassert(szB == 8 || szB == 4);
839    return i;
840 }
AMD64Instr_A87FpOp(A87FpOp op)841 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
842 {
843    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
844    i->tag            = Ain_A87FpOp;
845    i->Ain.A87FpOp.op = op;
846    return i;
847 }
AMD64Instr_A87LdCW(AMD64AMode * addr)848 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
849 {
850    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
851    i->tag              = Ain_A87LdCW;
852    i->Ain.A87LdCW.addr = addr;
853    return i;
854 }
AMD64Instr_A87StSW(AMD64AMode * addr)855 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
856 {
857    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
858    i->tag              = Ain_A87StSW;
859    i->Ain.A87StSW.addr = addr;
860    return i;
861 }
AMD64Instr_LdMXCSR(AMD64AMode * addr)862 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
863    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
864    i->tag                = Ain_LdMXCSR;
865    i->Ain.LdMXCSR.addr   = addr;
866    return i;
867 }
AMD64Instr_SseUComIS(Int sz,HReg srcL,HReg srcR,HReg dst)868 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
869    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
870    i->tag                = Ain_SseUComIS;
871    i->Ain.SseUComIS.sz   = toUChar(sz);
872    i->Ain.SseUComIS.srcL = srcL;
873    i->Ain.SseUComIS.srcR = srcR;
874    i->Ain.SseUComIS.dst  = dst;
875    vassert(sz == 4 || sz == 8);
876    return i;
877 }
AMD64Instr_SseSI2SF(Int szS,Int szD,HReg src,HReg dst)878 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
879    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
880    i->tag              = Ain_SseSI2SF;
881    i->Ain.SseSI2SF.szS = toUChar(szS);
882    i->Ain.SseSI2SF.szD = toUChar(szD);
883    i->Ain.SseSI2SF.src = src;
884    i->Ain.SseSI2SF.dst = dst;
885    vassert(szS == 4 || szS == 8);
886    vassert(szD == 4 || szD == 8);
887    return i;
888 }
AMD64Instr_SseSF2SI(Int szS,Int szD,HReg src,HReg dst)889 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
890    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
891    i->tag              = Ain_SseSF2SI;
892    i->Ain.SseSF2SI.szS = toUChar(szS);
893    i->Ain.SseSF2SI.szD = toUChar(szD);
894    i->Ain.SseSF2SI.src = src;
895    i->Ain.SseSF2SI.dst = dst;
896    vassert(szS == 4 || szS == 8);
897    vassert(szD == 4 || szD == 8);
898    return i;
899 }
AMD64Instr_SseSDSS(Bool from64,HReg src,HReg dst)900 AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
901 {
902    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
903    i->tag                = Ain_SseSDSS;
904    i->Ain.SseSDSS.from64 = from64;
905    i->Ain.SseSDSS.src    = src;
906    i->Ain.SseSDSS.dst    = dst;
907    return i;
908 }
AMD64Instr_SseLdSt(Bool isLoad,Int sz,HReg reg,AMD64AMode * addr)909 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
910                                  HReg reg, AMD64AMode* addr ) {
911    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
912    i->tag                = Ain_SseLdSt;
913    i->Ain.SseLdSt.isLoad = isLoad;
914    i->Ain.SseLdSt.sz     = toUChar(sz);
915    i->Ain.SseLdSt.reg    = reg;
916    i->Ain.SseLdSt.addr   = addr;
917    vassert(sz == 4 || sz == 8 || sz == 16);
918    return i;
919 }
AMD64Instr_SseCStore(AMD64CondCode cond,HReg src,AMD64AMode * addr)920 AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
921                                    HReg src, AMD64AMode* addr )
922 {
923    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
924    i->tag                = Ain_SseCStore;
925    i->Ain.SseCStore.cond = cond;
926    i->Ain.SseCStore.src  = src;
927    i->Ain.SseCStore.addr = addr;
928    vassert(cond != Acc_ALWAYS);
929    return i;
930 }
AMD64Instr_SseCLoad(AMD64CondCode cond,AMD64AMode * addr,HReg dst)931 AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
932                                   AMD64AMode* addr, HReg dst )
933 {
934    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
935    i->tag               = Ain_SseCLoad;
936    i->Ain.SseCLoad.cond = cond;
937    i->Ain.SseCLoad.addr = addr;
938    i->Ain.SseCLoad.dst  = dst;
939    vassert(cond != Acc_ALWAYS);
940    return i;
941 }
AMD64Instr_SseLdzLO(Int sz,HReg reg,AMD64AMode * addr)942 AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
943 {
944    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
945    i->tag                = Ain_SseLdzLO;
946    i->Ain.SseLdzLO.sz    = sz;
947    i->Ain.SseLdzLO.reg   = reg;
948    i->Ain.SseLdzLO.addr  = addr;
949    vassert(sz == 4 || sz == 8);
950    return i;
951 }
AMD64Instr_Sse32Fx4(AMD64SseOp op,HReg src,HReg dst)952 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
953    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
954    i->tag              = Ain_Sse32Fx4;
955    i->Ain.Sse32Fx4.op  = op;
956    i->Ain.Sse32Fx4.src = src;
957    i->Ain.Sse32Fx4.dst = dst;
958    vassert(op != Asse_MOV);
959    return i;
960 }
AMD64Instr_Sse32FLo(AMD64SseOp op,HReg src,HReg dst)961 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
962    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
963    i->tag              = Ain_Sse32FLo;
964    i->Ain.Sse32FLo.op  = op;
965    i->Ain.Sse32FLo.src = src;
966    i->Ain.Sse32FLo.dst = dst;
967    vassert(op != Asse_MOV);
968    return i;
969 }
AMD64Instr_Sse64Fx2(AMD64SseOp op,HReg src,HReg dst)970 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
971    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
972    i->tag              = Ain_Sse64Fx2;
973    i->Ain.Sse64Fx2.op  = op;
974    i->Ain.Sse64Fx2.src = src;
975    i->Ain.Sse64Fx2.dst = dst;
976    vassert(op != Asse_MOV);
977    return i;
978 }
AMD64Instr_Sse64FLo(AMD64SseOp op,HReg src,HReg dst)979 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
980    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
981    i->tag              = Ain_Sse64FLo;
982    i->Ain.Sse64FLo.op  = op;
983    i->Ain.Sse64FLo.src = src;
984    i->Ain.Sse64FLo.dst = dst;
985    vassert(op != Asse_MOV);
986    return i;
987 }
AMD64Instr_SseReRg(AMD64SseOp op,HReg re,HReg rg)988 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
989    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
990    i->tag             = Ain_SseReRg;
991    i->Ain.SseReRg.op  = op;
992    i->Ain.SseReRg.src = re;
993    i->Ain.SseReRg.dst = rg;
994    return i;
995 }
AMD64Instr_SseCMov(AMD64CondCode cond,HReg src,HReg dst)996 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
997    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
998    i->tag              = Ain_SseCMov;
999    i->Ain.SseCMov.cond = cond;
1000    i->Ain.SseCMov.src  = src;
1001    i->Ain.SseCMov.dst  = dst;
1002    vassert(cond != Acc_ALWAYS);
1003    return i;
1004 }
AMD64Instr_SseShuf(Int order,HReg src,HReg dst)1005 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
1006    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1007    i->tag               = Ain_SseShuf;
1008    i->Ain.SseShuf.order = order;
1009    i->Ain.SseShuf.src   = src;
1010    i->Ain.SseShuf.dst   = dst;
1011    vassert(order >= 0 && order <= 0xFF);
1012    return i;
1013 }
AMD64Instr_SseShiftN(AMD64SseOp op,UInt shiftBits,HReg dst)1014 AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
1015                                    UInt shiftBits, HReg dst ) {
1016    AMD64Instr* i              = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1017    i->tag                     = Ain_SseShiftN;
1018    i->Ain.SseShiftN.op        = op;
1019    i->Ain.SseShiftN.shiftBits = shiftBits;
1020    i->Ain.SseShiftN.dst       = dst;
1021    return i;
1022 }
AMD64Instr_SseMOVQ(HReg gpr,HReg xmm,Bool toXMM)1023 AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
1024    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1025    i->tag               = Ain_SseMOVQ;
1026    i->Ain.SseMOVQ.gpr   = gpr;
1027    i->Ain.SseMOVQ.xmm   = xmm;
1028    i->Ain.SseMOVQ.toXMM = toXMM;
1029    vassert(hregClass(gpr) == HRcInt64);
1030    vassert(hregClass(xmm) == HRcVec128);
1031    return i;
1032 }
1033 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
1034 //uu                                  HReg reg, AMD64AMode* addr ) {
1035 //uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1036 //uu    i->tag                = Ain_AvxLdSt;
1037 //uu    i->Ain.AvxLdSt.isLoad = isLoad;
1038 //uu    i->Ain.AvxLdSt.reg    = reg;
1039 //uu    i->Ain.AvxLdSt.addr   = addr;
1040 //uu    return i;
1041 //uu }
1042 //uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
1043 //uu    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1044 //uu    i->tag             = Ain_AvxReRg;
1045 //uu    i->Ain.AvxReRg.op  = op;
1046 //uu    i->Ain.AvxReRg.src = re;
1047 //uu    i->Ain.AvxReRg.dst = rg;
1048 //uu    return i;
1049 //uu }
AMD64Instr_EvCheck(AMD64AMode * amCounter,AMD64AMode * amFailAddr)1050 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
1051                                  AMD64AMode* amFailAddr ) {
1052    AMD64Instr* i             = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1053    i->tag                    = Ain_EvCheck;
1054    i->Ain.EvCheck.amCounter  = amCounter;
1055    i->Ain.EvCheck.amFailAddr = amFailAddr;
1056    return i;
1057 }
AMD64Instr_ProfInc(void)1058 AMD64Instr* AMD64Instr_ProfInc ( void ) {
1059    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1060    i->tag        = Ain_ProfInc;
1061    return i;
1062 }
1063 
ppAMD64Instr(const AMD64Instr * i,Bool mode64)1064 void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
1065 {
1066    vassert(mode64 == True);
1067    switch (i->tag) {
1068       case Ain_Imm64:
1069          vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
1070          ppHRegAMD64(i->Ain.Imm64.dst);
1071          return;
1072       case Ain_Alu64R:
1073          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
1074          ppAMD64RMI(i->Ain.Alu64R.src);
1075          vex_printf(",");
1076          ppHRegAMD64(i->Ain.Alu64R.dst);
1077          return;
1078       case Ain_Alu64M:
1079          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
1080          ppAMD64RI(i->Ain.Alu64M.src);
1081          vex_printf(",");
1082          ppAMD64AMode(i->Ain.Alu64M.dst);
1083          return;
1084       case Ain_Sh64:
1085          vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
1086          if (i->Ain.Sh64.src == 0)
1087             vex_printf("%%cl,");
1088          else
1089             vex_printf("$%d,", (Int)i->Ain.Sh64.src);
1090          ppHRegAMD64(i->Ain.Sh64.dst);
1091          return;
1092       case Ain_Test64:
1093          vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
1094          ppHRegAMD64(i->Ain.Test64.dst);
1095          return;
1096       case Ain_Unary64:
1097          vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
1098          ppHRegAMD64(i->Ain.Unary64.dst);
1099          return;
1100       case Ain_Lea64:
1101          vex_printf("leaq ");
1102          ppAMD64AMode(i->Ain.Lea64.am);
1103          vex_printf(",");
1104          ppHRegAMD64(i->Ain.Lea64.dst);
1105          return;
1106       case Ain_Alu32R:
1107          vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
1108          ppAMD64RMI_lo32(i->Ain.Alu32R.src);
1109          vex_printf(",");
1110          ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
1111          return;
1112       case Ain_MulL:
1113          vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
1114          ppAMD64RM(i->Ain.MulL.src);
1115          return;
1116       case Ain_Div:
1117          vex_printf("%cdiv%s ",
1118                     i->Ain.Div.syned ? 's' : 'u',
1119                     showAMD64ScalarSz(i->Ain.Div.sz));
1120          ppAMD64RM(i->Ain.Div.src);
1121          return;
1122       case Ain_Push:
1123          vex_printf("pushq ");
1124          ppAMD64RMI(i->Ain.Push.src);
1125          return;
1126       case Ain_Call:
1127          vex_printf("call%s[%d,",
1128                     i->Ain.Call.cond==Acc_ALWAYS
1129                        ? "" : showAMD64CondCode(i->Ain.Call.cond),
1130                     i->Ain.Call.regparms );
1131          ppRetLoc(i->Ain.Call.rloc);
1132          vex_printf("] 0x%llx", i->Ain.Call.target);
1133          break;
1134 
1135       case Ain_XDirect:
1136          vex_printf("(xDirect) ");
1137          vex_printf("if (%%rflags.%s) { ",
1138                     showAMD64CondCode(i->Ain.XDirect.cond));
1139          vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
1140          vex_printf("movq %%r11,");
1141          ppAMD64AMode(i->Ain.XDirect.amRIP);
1142          vex_printf("; ");
1143          vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1144                     i->Ain.XDirect.toFastEP ? "fast" : "slow");
1145          return;
1146       case Ain_XIndir:
1147          vex_printf("(xIndir) ");
1148          vex_printf("if (%%rflags.%s) { ",
1149                     showAMD64CondCode(i->Ain.XIndir.cond));
1150          vex_printf("movq ");
1151          ppHRegAMD64(i->Ain.XIndir.dstGA);
1152          vex_printf(",");
1153          ppAMD64AMode(i->Ain.XIndir.amRIP);
1154          vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1155          return;
1156       case Ain_XAssisted:
1157          vex_printf("(xAssisted) ");
1158          vex_printf("if (%%rflags.%s) { ",
1159                     showAMD64CondCode(i->Ain.XAssisted.cond));
1160          vex_printf("movq ");
1161          ppHRegAMD64(i->Ain.XAssisted.dstGA);
1162          vex_printf(",");
1163          ppAMD64AMode(i->Ain.XAssisted.amRIP);
1164          vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1165                     (Int)i->Ain.XAssisted.jk);
1166          vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1167          return;
1168 
1169       case Ain_CMov64:
1170          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
1171          ppHRegAMD64(i->Ain.CMov64.src);
1172          vex_printf(",");
1173          ppHRegAMD64(i->Ain.CMov64.dst);
1174          return;
1175       case Ain_CLoad:
1176          vex_printf("if (%%rflags.%s) { ",
1177                     showAMD64CondCode(i->Ain.CLoad.cond));
1178          vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
1179          ppAMD64AMode(i->Ain.CLoad.addr);
1180          vex_printf(", ");
1181          (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1182             (i->Ain.CLoad.dst);
1183          vex_printf(" }");
1184          return;
1185       case Ain_CStore:
1186          vex_printf("if (%%rflags.%s) { ",
1187                     showAMD64CondCode(i->Ain.CStore.cond));
1188          vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
1189          (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1190             (i->Ain.CStore.src);
1191          vex_printf(", ");
1192          ppAMD64AMode(i->Ain.CStore.addr);
1193          vex_printf(" }");
1194          return;
1195 
1196       case Ain_MovxLQ:
1197          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
1198          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
1199          vex_printf(",");
1200          ppHRegAMD64(i->Ain.MovxLQ.dst);
1201          return;
1202       case Ain_LoadEX:
1203          if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
1204             vex_printf("movl ");
1205             ppAMD64AMode(i->Ain.LoadEX.src);
1206             vex_printf(",");
1207             ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
1208          } else {
1209             vex_printf("mov%c%cq ",
1210                        i->Ain.LoadEX.syned ? 's' : 'z',
1211                        i->Ain.LoadEX.szSmall==1
1212                           ? 'b'
1213                           : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
1214             ppAMD64AMode(i->Ain.LoadEX.src);
1215             vex_printf(",");
1216             ppHRegAMD64(i->Ain.LoadEX.dst);
1217          }
1218          return;
1219       case Ain_Store:
1220          vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
1221                               : (i->Ain.Store.sz==2 ? 'w' : 'l'));
1222          ppHRegAMD64(i->Ain.Store.src);
1223          vex_printf(",");
1224          ppAMD64AMode(i->Ain.Store.dst);
1225          return;
1226       case Ain_Set64:
1227          vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
1228          ppHRegAMD64(i->Ain.Set64.dst);
1229          return;
1230       case Ain_Bsfr64:
1231          vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
1232          ppHRegAMD64(i->Ain.Bsfr64.src);
1233          vex_printf(",");
1234          ppHRegAMD64(i->Ain.Bsfr64.dst);
1235          return;
1236       case Ain_MFence:
1237          vex_printf("mfence" );
1238          return;
1239       case Ain_ACAS:
1240          vex_printf("lock cmpxchg%c ",
1241                      i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
1242                      : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
1243          vex_printf("{%%rax->%%rbx},");
1244          ppAMD64AMode(i->Ain.ACAS.addr);
1245          return;
1246       case Ain_DACAS:
1247          vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1248                     (Int)(2 * i->Ain.DACAS.sz));
1249          ppAMD64AMode(i->Ain.DACAS.addr);
1250          return;
1251       case Ain_A87Free:
1252          vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
1253          break;
1254       case Ain_A87PushPop:
1255          vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
1256                     i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
1257          ppAMD64AMode(i->Ain.A87PushPop.addr);
1258          break;
1259       case Ain_A87FpOp:
1260          vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
1261          break;
1262       case Ain_A87LdCW:
1263          vex_printf("fldcw ");
1264          ppAMD64AMode(i->Ain.A87LdCW.addr);
1265          break;
1266       case Ain_A87StSW:
1267          vex_printf("fstsw ");
1268          ppAMD64AMode(i->Ain.A87StSW.addr);
1269          break;
1270       case Ain_LdMXCSR:
1271          vex_printf("ldmxcsr ");
1272          ppAMD64AMode(i->Ain.LdMXCSR.addr);
1273          break;
1274       case Ain_SseUComIS:
1275          vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
1276          ppHRegAMD64(i->Ain.SseUComIS.srcL);
1277          vex_printf(",");
1278          ppHRegAMD64(i->Ain.SseUComIS.srcR);
1279          vex_printf(" ; pushfq ; popq ");
1280          ppHRegAMD64(i->Ain.SseUComIS.dst);
1281          break;
1282       case Ain_SseSI2SF:
1283          vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
1284          (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1285             (i->Ain.SseSI2SF.src);
1286          vex_printf(",");
1287          ppHRegAMD64(i->Ain.SseSI2SF.dst);
1288          break;
1289       case Ain_SseSF2SI:
1290          vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
1291          ppHRegAMD64(i->Ain.SseSF2SI.src);
1292          vex_printf(",");
1293          (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1294             (i->Ain.SseSF2SI.dst);
1295          break;
1296       case Ain_SseSDSS:
1297          vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
1298          ppHRegAMD64(i->Ain.SseSDSS.src);
1299          vex_printf(",");
1300          ppHRegAMD64(i->Ain.SseSDSS.dst);
1301          break;
1302       case Ain_SseLdSt:
1303          switch (i->Ain.SseLdSt.sz) {
1304             case 4:  vex_printf("movss "); break;
1305             case 8:  vex_printf("movsd "); break;
1306             case 16: vex_printf("movups "); break;
1307             default: vassert(0);
1308          }
1309          if (i->Ain.SseLdSt.isLoad) {
1310             ppAMD64AMode(i->Ain.SseLdSt.addr);
1311             vex_printf(",");
1312             ppHRegAMD64(i->Ain.SseLdSt.reg);
1313          } else {
1314             ppHRegAMD64(i->Ain.SseLdSt.reg);
1315             vex_printf(",");
1316             ppAMD64AMode(i->Ain.SseLdSt.addr);
1317          }
1318          return;
1319       case Ain_SseCStore:
1320          vex_printf("if (%%rflags.%s) { ",
1321                     showAMD64CondCode(i->Ain.SseCStore.cond));
1322          vex_printf("movups ");
1323          ppHRegAMD64(i->Ain.SseCStore.src);
1324          vex_printf(", ");
1325          ppAMD64AMode(i->Ain.SseCStore.addr);
1326          vex_printf(" }");
1327          return;
1328       case Ain_SseCLoad:
1329          vex_printf("if (%%rflags.%s) { ",
1330                     showAMD64CondCode(i->Ain.SseCLoad.cond));
1331          vex_printf("movups ");
1332          ppAMD64AMode(i->Ain.SseCLoad.addr);
1333          vex_printf(", ");
1334          ppHRegAMD64(i->Ain.SseCLoad.dst);
1335          vex_printf(" }");
1336          return;
1337       case Ain_SseLdzLO:
1338          vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
1339          ppAMD64AMode(i->Ain.SseLdzLO.addr);
1340          vex_printf(",");
1341          ppHRegAMD64(i->Ain.SseLdzLO.reg);
1342          return;
1343       case Ain_Sse32Fx4:
1344          vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
1345          ppHRegAMD64(i->Ain.Sse32Fx4.src);
1346          vex_printf(",");
1347          ppHRegAMD64(i->Ain.Sse32Fx4.dst);
1348          return;
1349       case Ain_Sse32FLo:
1350          vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
1351          ppHRegAMD64(i->Ain.Sse32FLo.src);
1352          vex_printf(",");
1353          ppHRegAMD64(i->Ain.Sse32FLo.dst);
1354          return;
1355       case Ain_Sse64Fx2:
1356          vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
1357          ppHRegAMD64(i->Ain.Sse64Fx2.src);
1358          vex_printf(",");
1359          ppHRegAMD64(i->Ain.Sse64Fx2.dst);
1360          return;
1361       case Ain_Sse64FLo:
1362          vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
1363          ppHRegAMD64(i->Ain.Sse64FLo.src);
1364          vex_printf(",");
1365          ppHRegAMD64(i->Ain.Sse64FLo.dst);
1366          return;
1367       case Ain_SseReRg:
1368          vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1369          ppHRegAMD64(i->Ain.SseReRg.src);
1370          vex_printf(",");
1371          ppHRegAMD64(i->Ain.SseReRg.dst);
1372          return;
1373       case Ain_SseCMov:
1374          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
1375          ppHRegAMD64(i->Ain.SseCMov.src);
1376          vex_printf(",");
1377          ppHRegAMD64(i->Ain.SseCMov.dst);
1378          return;
1379       case Ain_SseShuf:
1380          vex_printf("pshufd $0x%x,", (UInt)i->Ain.SseShuf.order);
1381          ppHRegAMD64(i->Ain.SseShuf.src);
1382          vex_printf(",");
1383          ppHRegAMD64(i->Ain.SseShuf.dst);
1384          return;
1385       case Ain_SseShiftN:
1386          vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
1387                                 i->Ain.SseShiftN.shiftBits);
1388          ppHRegAMD64(i->Ain.SseShiftN.dst);
1389          return;
1390       case Ain_SseMOVQ:
1391          vex_printf("movq ");
1392          if (i->Ain.SseMOVQ.toXMM) {
1393             ppHRegAMD64(i->Ain.SseMOVQ.gpr);
1394             vex_printf(",");
1395             ppHRegAMD64(i->Ain.SseMOVQ.xmm);
1396          } else {
1397             ppHRegAMD64(i->Ain.SseMOVQ.xmm);
1398             vex_printf(",");
1399             ppHRegAMD64(i->Ain.SseMOVQ.gpr);
1400          };
1401          return;
1402       //uu case Ain_AvxLdSt:
1403       //uu    vex_printf("vmovups ");
1404       //uu    if (i->Ain.AvxLdSt.isLoad) {
1405       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1406       //uu       vex_printf(",");
1407       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1408       //uu    } else {
1409       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1410       //uu       vex_printf(",");
1411       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1412       //uu    }
1413       //uu    return;
1414       //uu case Ain_AvxReRg:
1415       //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1416       //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
1417       //uu    vex_printf(",");
1418       //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
1419       //uu    return;
1420       case Ain_EvCheck:
1421          vex_printf("(evCheck) decl ");
1422          ppAMD64AMode(i->Ain.EvCheck.amCounter);
1423          vex_printf("; jns nofail; jmp *");
1424          ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
1425          vex_printf("; nofail:");
1426          return;
1427       case Ain_ProfInc:
1428          vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1429          return;
1430       default:
1431          vpanic("ppAMD64Instr");
1432    }
1433 }
1434 
1435 /* --------- Helpers for register allocation. --------- */
1436 
getRegUsage_AMD64Instr(HRegUsage * u,const AMD64Instr * i,Bool mode64)1437 void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
1438 {
1439    Bool unary;
1440    vassert(mode64 == True);
1441    initHRegUsage(u);
1442    switch (i->tag) {
1443       case Ain_Imm64:
1444          addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
1445          return;
1446       case Ain_Alu64R:
1447          addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
1448          if (i->Ain.Alu64R.op == Aalu_MOV) {
1449             addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
1450 
1451             if (i->Ain.Alu64R.src->tag == Armi_Reg) {
1452                u->isRegRegMove = True;
1453                u->regMoveSrc   = i->Ain.Alu64R.src->Armi.Reg.reg;
1454                u->regMoveDst   = i->Ain.Alu64R.dst;
1455             }
1456             return;
1457          }
1458          if (i->Ain.Alu64R.op == Aalu_CMP) {
1459             addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
1460             return;
1461          }
1462          addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
1463          return;
1464       case Ain_Alu64M:
1465          addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
1466          addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
1467          return;
1468       case Ain_Sh64:
1469          addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
1470          if (i->Ain.Sh64.src == 0)
1471             addHRegUse(u, HRmRead, hregAMD64_RCX());
1472          return;
1473       case Ain_Test64:
1474          addHRegUse(u, HRmRead, i->Ain.Test64.dst);
1475          return;
1476       case Ain_Unary64:
1477          addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
1478          return;
1479       case Ain_Lea64:
1480          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
1481          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
1482          return;
1483       case Ain_Alu32R:
1484          vassert(i->Ain.Alu32R.op != Aalu_MOV);
1485          addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
1486          if (i->Ain.Alu32R.op == Aalu_CMP) {
1487             addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
1488             return;
1489          }
1490          addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
1491          return;
1492       case Ain_MulL:
1493          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
1494          addHRegUse(u, HRmModify, hregAMD64_RAX());
1495          addHRegUse(u, HRmWrite, hregAMD64_RDX());
1496          return;
1497       case Ain_Div:
1498          addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
1499          addHRegUse(u, HRmModify, hregAMD64_RAX());
1500          addHRegUse(u, HRmModify, hregAMD64_RDX());
1501          return;
1502       case Ain_Push:
1503          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
1504          addHRegUse(u, HRmModify, hregAMD64_RSP());
1505          return;
1506       case Ain_Call:
1507          /* This is a bit subtle. */
1508          /* First off, claim it trashes all the caller-saved regs
1509             which fall within the register allocator's jurisdiction.
1510             These I believe to be: rax rcx rdx rdi rsi r8 r9 r10
1511             and all the xmm registers. */
1512          addHRegUse(u, HRmWrite, hregAMD64_RAX());
1513          addHRegUse(u, HRmWrite, hregAMD64_RCX());
1514          addHRegUse(u, HRmWrite, hregAMD64_RDX());
1515          addHRegUse(u, HRmWrite, hregAMD64_RDI());
1516          addHRegUse(u, HRmWrite, hregAMD64_RSI());
1517          addHRegUse(u, HRmWrite, hregAMD64_R8());
1518          addHRegUse(u, HRmWrite, hregAMD64_R9());
1519          addHRegUse(u, HRmWrite, hregAMD64_R10());
1520          addHRegUse(u, HRmWrite, hregAMD64_XMM0());
1521          addHRegUse(u, HRmWrite, hregAMD64_XMM1());
1522          addHRegUse(u, HRmWrite, hregAMD64_XMM3());
1523          addHRegUse(u, HRmWrite, hregAMD64_XMM4());
1524          addHRegUse(u, HRmWrite, hregAMD64_XMM5());
1525          addHRegUse(u, HRmWrite, hregAMD64_XMM6());
1526          addHRegUse(u, HRmWrite, hregAMD64_XMM7());
1527          addHRegUse(u, HRmWrite, hregAMD64_XMM8());
1528          addHRegUse(u, HRmWrite, hregAMD64_XMM9());
1529          addHRegUse(u, HRmWrite, hregAMD64_XMM10());
1530          addHRegUse(u, HRmWrite, hregAMD64_XMM11());
1531          addHRegUse(u, HRmWrite, hregAMD64_XMM12());
1532 
1533          /* Now we have to state any parameter-carrying registers
1534             which might be read.  This depends on the regparmness. */
1535          switch (i->Ain.Call.regparms) {
1536             case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
1537             case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
1538             case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
1539             case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
1540             case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
1541             case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
1542             case 0: break;
1543             default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1544          }
1545          /* Finally, there is the issue that the insn trashes a
1546             register because the literal target address has to be
1547             loaded into a register.  Fortunately, r11 is stated in the
1548             ABI as a scratch register, and so seems a suitable victim.  */
1549          addHRegUse(u, HRmWrite, hregAMD64_R11());
1550          /* Upshot of this is that the assembler really must use r11,
1551             and no other, as a destination temporary. */
1552          return;
1553       /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1554          conditionally exit the block.  Hence we only need to list (1)
1555          the registers that they read, and (2) the registers that they
1556          write in the case where the block is not exited.  (2) is
1557          empty, hence only (1) is relevant here. */
1558       case Ain_XDirect:
1559          /* Don't bother to mention the write to %r11, since it is not
1560             available to the allocator. */
1561          addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
1562          return;
1563       case Ain_XIndir:
1564          /* Ditto re %r11 */
1565          addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
1566          addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
1567          return;
1568       case Ain_XAssisted:
1569          /* Ditto re %r11 and %rbp (the baseblock ptr) */
1570          addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1571          addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1572          return;
1573       case Ain_CMov64:
1574          addHRegUse(u, HRmRead,   i->Ain.CMov64.src);
1575          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1576          return;
1577       case Ain_CLoad:
1578          addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
1579          addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
1580          return;
1581       case Ain_CStore:
1582          addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
1583          addHRegUse(u, HRmRead, i->Ain.CStore.src);
1584          return;
1585       case Ain_MovxLQ:
1586          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
1587          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1588          return;
1589       case Ain_LoadEX:
1590          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1591          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1592          return;
1593       case Ain_Store:
1594          addHRegUse(u, HRmRead, i->Ain.Store.src);
1595          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1596          return;
1597       case Ain_Set64:
1598          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1599          return;
1600       case Ain_Bsfr64:
1601          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1602          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1603          return;
1604       case Ain_MFence:
1605          return;
1606       case Ain_ACAS:
1607          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1608          addHRegUse(u, HRmRead, hregAMD64_RBX());
1609          addHRegUse(u, HRmModify, hregAMD64_RAX());
1610          return;
1611       case Ain_DACAS:
1612          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1613          addHRegUse(u, HRmRead, hregAMD64_RCX());
1614          addHRegUse(u, HRmRead, hregAMD64_RBX());
1615          addHRegUse(u, HRmModify, hregAMD64_RDX());
1616          addHRegUse(u, HRmModify, hregAMD64_RAX());
1617          return;
1618       case Ain_A87Free:
1619          return;
1620       case Ain_A87PushPop:
1621          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1622          return;
1623       case Ain_A87FpOp:
1624          return;
1625       case Ain_A87LdCW:
1626          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1627          return;
1628       case Ain_A87StSW:
1629          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1630          return;
1631       case Ain_LdMXCSR:
1632          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1633          return;
1634       case Ain_SseUComIS:
1635          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
1636          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
1637          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1638          return;
1639       case Ain_SseSI2SF:
1640          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
1641          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1642          return;
1643       case Ain_SseSF2SI:
1644          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
1645          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1646          return;
1647       case Ain_SseSDSS:
1648          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
1649          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1650          return;
1651       case Ain_SseLdSt:
1652          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1653          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1654                        i->Ain.SseLdSt.reg);
1655          return;
1656       case Ain_SseCStore:
1657          addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
1658          addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
1659          return;
1660       case Ain_SseCLoad:
1661          addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
1662          addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
1663          return;
1664       case Ain_SseLdzLO:
1665          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1666          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1667          return;
1668       case Ain_Sse32Fx4:
1669          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1670          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1671                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1672                          || i->Ain.Sse32Fx4.op == Asse_SQRTF
1673                          || i->Ain.Sse32Fx4.op == Asse_I2F
1674                          || i->Ain.Sse32Fx4.op == Asse_F2I );
1675          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1676          addHRegUse(u, unary ? HRmWrite : HRmModify,
1677                        i->Ain.Sse32Fx4.dst);
1678          return;
1679       case Ain_Sse32FLo:
1680          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1681          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1682                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
1683                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
1684          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1685          addHRegUse(u, unary ? HRmWrite : HRmModify,
1686                        i->Ain.Sse32FLo.dst);
1687          return;
1688       case Ain_Sse64Fx2:
1689          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1690          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1691                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1692                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1693          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1694          addHRegUse(u, unary ? HRmWrite : HRmModify,
1695                        i->Ain.Sse64Fx2.dst);
1696          return;
1697       case Ain_Sse64FLo:
1698          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1699          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1700                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
1701                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
1702          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1703          addHRegUse(u, unary ? HRmWrite : HRmModify,
1704                        i->Ain.Sse64FLo.dst);
1705          return;
1706       case Ain_SseReRg:
1707          if ( (i->Ain.SseReRg.op == Asse_XOR
1708                || i->Ain.SseReRg.op == Asse_CMPEQ32)
1709               && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1710             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1711                r,r' as a write of a value to r, and independent of any
1712                previous value in r */
1713             /* (as opposed to a rite of passage :-) */
1714             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1715          } else {
1716             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1717             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1718                              ? HRmWrite : HRmModify,
1719                           i->Ain.SseReRg.dst);
1720 
1721             if (i->Ain.SseReRg.op == Asse_MOV) {
1722                u->isRegRegMove = True;
1723                u->regMoveSrc   = i->Ain.SseReRg.src;
1724                u->regMoveDst   = i->Ain.SseReRg.dst;
1725             }
1726          }
1727          return;
1728       case Ain_SseCMov:
1729          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
1730          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1731          return;
1732       case Ain_SseShuf:
1733          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
1734          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1735          return;
1736       case Ain_SseShiftN:
1737          addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
1738          return;
1739       case Ain_SseMOVQ:
1740          addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite,
1741                     i->Ain.SseMOVQ.gpr);
1742          addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead,
1743                     i->Ain.SseMOVQ.xmm);
1744          return;
1745       //uu case Ain_AvxLdSt:
1746       //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1747       //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1748       //uu               i->Ain.AvxLdSt.reg);
1749       //uu return;
1750       //uu case Ain_AvxReRg:
1751       //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
1752       //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1753       //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1754       //uu       /* See comments on the case for Ain_SseReRg. */
1755       //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1756       //uu    } else {
1757       //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1758       //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1759       //uu                        ? HRmWrite : HRmModify,
1760       //uu                     i->Ain.AvxReRg.dst);
1761       //uu
1762       //uu       if (i->Ain.AvxReRg.op == Asse_MOV) {
1763       //uu          u->isRegRegMove = True;
1764       //uu          u->regMoveSrc   = i->Ain.AvxReRg.src;
1765       //uu          u->regMoveDst   = i->Ain.AvxReRg.dst;
1766       //uu       }
1767       //uu    }
1768       //uu    return;
1769       case Ain_EvCheck:
1770          /* We expect both amodes only to mention %rbp, so this is in
1771             fact pointless, since %rbp isn't allocatable, but anyway.. */
1772          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1773          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1774          return;
1775       case Ain_ProfInc:
1776          addHRegUse(u, HRmWrite, hregAMD64_R11());
1777          return;
1778       default:
1779          ppAMD64Instr(i, mode64);
1780          vpanic("getRegUsage_AMD64Instr");
1781    }
1782 }
1783 
1784 /* local helper */
mapReg(HRegRemap * m,HReg * r)1785 static inline void mapReg(HRegRemap* m, HReg* r)
1786 {
1787    *r = lookupHRegRemap(m, *r);
1788 }
1789 
mapRegs_AMD64Instr(HRegRemap * m,AMD64Instr * i,Bool mode64)1790 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1791 {
1792    vassert(mode64 == True);
1793    switch (i->tag) {
1794       case Ain_Imm64:
1795          mapReg(m, &i->Ain.Imm64.dst);
1796          return;
1797       case Ain_Alu64R:
1798          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1799          mapReg(m, &i->Ain.Alu64R.dst);
1800          return;
1801       case Ain_Alu64M:
1802          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1803          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1804          return;
1805       case Ain_Sh64:
1806          mapReg(m, &i->Ain.Sh64.dst);
1807          return;
1808       case Ain_Test64:
1809          mapReg(m, &i->Ain.Test64.dst);
1810          return;
1811       case Ain_Unary64:
1812          mapReg(m, &i->Ain.Unary64.dst);
1813          return;
1814       case Ain_Lea64:
1815          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1816          mapReg(m, &i->Ain.Lea64.dst);
1817          return;
1818       case Ain_Alu32R:
1819          mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1820          mapReg(m, &i->Ain.Alu32R.dst);
1821          return;
1822       case Ain_MulL:
1823          mapRegs_AMD64RM(m, i->Ain.MulL.src);
1824          return;
1825       case Ain_Div:
1826          mapRegs_AMD64RM(m, i->Ain.Div.src);
1827          return;
1828       case Ain_Push:
1829          mapRegs_AMD64RMI(m, i->Ain.Push.src);
1830          return;
1831       case Ain_Call:
1832          return;
1833       case Ain_XDirect:
1834          mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1835          return;
1836       case Ain_XIndir:
1837          mapReg(m, &i->Ain.XIndir.dstGA);
1838          mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1839          return;
1840       case Ain_XAssisted:
1841          mapReg(m, &i->Ain.XAssisted.dstGA);
1842          mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1843          return;
1844       case Ain_CMov64:
1845          mapReg(m, &i->Ain.CMov64.src);
1846          mapReg(m, &i->Ain.CMov64.dst);
1847          return;
1848       case Ain_CLoad:
1849          mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
1850          mapReg(m, &i->Ain.CLoad.dst);
1851          return;
1852       case Ain_CStore:
1853          mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
1854          mapReg(m, &i->Ain.CStore.src);
1855          return;
1856       case Ain_MovxLQ:
1857          mapReg(m, &i->Ain.MovxLQ.src);
1858          mapReg(m, &i->Ain.MovxLQ.dst);
1859          return;
1860       case Ain_LoadEX:
1861          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1862          mapReg(m, &i->Ain.LoadEX.dst);
1863          return;
1864       case Ain_Store:
1865          mapReg(m, &i->Ain.Store.src);
1866          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1867          return;
1868       case Ain_Set64:
1869          mapReg(m, &i->Ain.Set64.dst);
1870          return;
1871       case Ain_Bsfr64:
1872          mapReg(m, &i->Ain.Bsfr64.src);
1873          mapReg(m, &i->Ain.Bsfr64.dst);
1874          return;
1875       case Ain_MFence:
1876          return;
1877       case Ain_ACAS:
1878          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1879          return;
1880       case Ain_DACAS:
1881          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1882          return;
1883       case Ain_A87Free:
1884          return;
1885       case Ain_A87PushPop:
1886          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1887          return;
1888       case Ain_A87FpOp:
1889          return;
1890       case Ain_A87LdCW:
1891          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1892          return;
1893       case Ain_A87StSW:
1894          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1895          return;
1896       case Ain_LdMXCSR:
1897          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1898          return;
1899       case Ain_SseUComIS:
1900          mapReg(m, &i->Ain.SseUComIS.srcL);
1901          mapReg(m, &i->Ain.SseUComIS.srcR);
1902          mapReg(m, &i->Ain.SseUComIS.dst);
1903          return;
1904       case Ain_SseSI2SF:
1905          mapReg(m, &i->Ain.SseSI2SF.src);
1906          mapReg(m, &i->Ain.SseSI2SF.dst);
1907          return;
1908       case Ain_SseSF2SI:
1909          mapReg(m, &i->Ain.SseSF2SI.src);
1910          mapReg(m, &i->Ain.SseSF2SI.dst);
1911          return;
1912       case Ain_SseSDSS:
1913          mapReg(m, &i->Ain.SseSDSS.src);
1914          mapReg(m, &i->Ain.SseSDSS.dst);
1915          return;
1916       case Ain_SseLdSt:
1917          mapReg(m, &i->Ain.SseLdSt.reg);
1918          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1919          break;
1920       case Ain_SseCStore:
1921          mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
1922          mapReg(m, &i->Ain.SseCStore.src);
1923          return;
1924       case Ain_SseCLoad:
1925          mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
1926          mapReg(m, &i->Ain.SseCLoad.dst);
1927          return;
1928       case Ain_SseLdzLO:
1929          mapReg(m, &i->Ain.SseLdzLO.reg);
1930          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1931          break;
1932       case Ain_Sse32Fx4:
1933          mapReg(m, &i->Ain.Sse32Fx4.src);
1934          mapReg(m, &i->Ain.Sse32Fx4.dst);
1935          return;
1936       case Ain_Sse32FLo:
1937          mapReg(m, &i->Ain.Sse32FLo.src);
1938          mapReg(m, &i->Ain.Sse32FLo.dst);
1939          return;
1940       case Ain_Sse64Fx2:
1941          mapReg(m, &i->Ain.Sse64Fx2.src);
1942          mapReg(m, &i->Ain.Sse64Fx2.dst);
1943          return;
1944       case Ain_Sse64FLo:
1945          mapReg(m, &i->Ain.Sse64FLo.src);
1946          mapReg(m, &i->Ain.Sse64FLo.dst);
1947          return;
1948       case Ain_SseReRg:
1949          mapReg(m, &i->Ain.SseReRg.src);
1950          mapReg(m, &i->Ain.SseReRg.dst);
1951          return;
1952       case Ain_SseCMov:
1953          mapReg(m, &i->Ain.SseCMov.src);
1954          mapReg(m, &i->Ain.SseCMov.dst);
1955          return;
1956       case Ain_SseShuf:
1957          mapReg(m, &i->Ain.SseShuf.src);
1958          mapReg(m, &i->Ain.SseShuf.dst);
1959          return;
1960       case Ain_SseShiftN:
1961          mapReg(m, &i->Ain.SseShiftN.dst);
1962          return;
1963       case Ain_SseMOVQ:
1964          mapReg(m, &i->Ain.SseMOVQ.gpr);
1965          mapReg(m, &i->Ain.SseMOVQ.xmm);
1966          return;
1967       //uu case Ain_AvxLdSt:
1968       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
1969       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1970       //uu    break;
1971       //uu case Ain_AvxReRg:
1972       //uu    mapReg(m, &i->Ain.AvxReRg.src);
1973       //uu    mapReg(m, &i->Ain.AvxReRg.dst);
1974       //uu    return;
1975       case Ain_EvCheck:
1976          /* We expect both amodes only to mention %rbp, so this is in
1977             fact pointless, since %rbp isn't allocatable, but anyway.. */
1978          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
1979          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
1980          return;
1981       case Ain_ProfInc:
1982          /* hardwires r11 -- nothing to modify. */
1983          return;
1984       default:
1985          ppAMD64Instr(i, mode64);
1986          vpanic("mapRegs_AMD64Instr");
1987    }
1988 }
1989 
1990 /* Generate amd64 spill/reload instructions under the direction of the
1991    register allocator.  Note it's critical these don't write the
1992    condition codes. */
1993 
genSpill_AMD64(HInstr ** i1,HInstr ** i2,HReg rreg,Int offsetB,Bool mode64)1994 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1995                       HReg rreg, Int offsetB, Bool mode64 )
1996 {
1997    AMD64AMode* am;
1998    vassert(offsetB >= 0);
1999    vassert(!hregIsVirtual(rreg));
2000    vassert(mode64 == True);
2001    *i1 = *i2 = NULL;
2002    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
2003    switch (hregClass(rreg)) {
2004       case HRcInt64:
2005          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
2006          return;
2007       case HRcVec128:
2008          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
2009          return;
2010       default:
2011          ppHRegClass(hregClass(rreg));
2012          vpanic("genSpill_AMD64: unimplemented regclass");
2013    }
2014 }
2015 
genReload_AMD64(HInstr ** i1,HInstr ** i2,HReg rreg,Int offsetB,Bool mode64)2016 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
2017                        HReg rreg, Int offsetB, Bool mode64 )
2018 {
2019    AMD64AMode* am;
2020    vassert(offsetB >= 0);
2021    vassert(!hregIsVirtual(rreg));
2022    vassert(mode64 == True);
2023    *i1 = *i2 = NULL;
2024    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
2025    switch (hregClass(rreg)) {
2026       case HRcInt64:
2027          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
2028          return;
2029       case HRcVec128:
2030          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
2031          return;
2032       default:
2033          ppHRegClass(hregClass(rreg));
2034          vpanic("genReload_AMD64: unimplemented regclass");
2035    }
2036 }
2037 
genMove_AMD64(HReg from,HReg to,Bool mode64)2038 AMD64Instr* genMove_AMD64(HReg from, HReg to, Bool mode64)
2039 {
2040    switch (hregClass(from)) {
2041    case HRcInt64:
2042       return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(from), to);
2043    case HRcVec128:
2044       return AMD64Instr_SseReRg(Asse_MOV, from, to);
2045    default:
2046       ppHRegClass(hregClass(from));
2047       vpanic("genMove_AMD64: unimplemented regclass");
2048    }
2049 }
2050 
directReload_AMD64(AMD64Instr * i,HReg vreg,Short spill_off)2051 AMD64Instr* directReload_AMD64( AMD64Instr* i, HReg vreg, Short spill_off )
2052 {
2053    vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
2054 
2055    /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
2056       Convert to: src=RMI_Mem, dst=Reg
2057    */
2058    if (i->tag == Ain_Alu64R
2059        && (i->Ain.Alu64R.op == Aalu_MOV || i->Ain.Alu64R.op == Aalu_OR
2060            || i->Ain.Alu64R.op == Aalu_XOR)
2061        && i->Ain.Alu64R.src->tag == Armi_Reg
2062        && sameHReg(i->Ain.Alu64R.src->Armi.Reg.reg, vreg)) {
2063       vassert(! sameHReg(i->Ain.Alu64R.dst, vreg));
2064       return AMD64Instr_Alu64R(
2065                 i->Ain.Alu64R.op,
2066                 AMD64RMI_Mem( AMD64AMode_IR( spill_off, hregAMD64_RBP())),
2067                 i->Ain.Alu64R.dst
2068              );
2069    }
2070 
2071    /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
2072       Convert to: src=RI_Imm, dst=Mem
2073    */
2074    if (i->tag == Ain_Alu64R
2075        && (i->Ain.Alu64R.op == Aalu_CMP)
2076        && i->Ain.Alu64R.src->tag == Armi_Imm
2077        && sameHReg(i->Ain.Alu64R.dst, vreg)) {
2078       return AMD64Instr_Alu64M(
2079                 i->Ain.Alu64R.op,
2080                 AMD64RI_Imm( i->Ain.Alu64R.src->Armi.Imm.imm32 ),
2081                 AMD64AMode_IR( spill_off, hregAMD64_RBP())
2082              );
2083    }
2084 
2085    return NULL;
2086 }
2087 
2088 
2089 /* --------- The amd64 assembler (bleh.) --------- */
2090 
2091 /* Produce the low three bits of an integer register number. */
iregEnc210(HReg r)2092 inline static UInt iregEnc210 ( HReg r )
2093 {
2094    UInt n;
2095    vassert(hregClass(r) == HRcInt64);
2096    vassert(!hregIsVirtual(r));
2097    n = hregEncoding(r);
2098    vassert(n <= 15);
2099    return n & 7;
2100 }
2101 
2102 /* Produce bit 3 of an integer register number. */
iregEnc3(HReg r)2103 inline static UInt iregEnc3 ( HReg r )
2104 {
2105    UInt n;
2106    vassert(hregClass(r) == HRcInt64);
2107    vassert(!hregIsVirtual(r));
2108    n = hregEncoding(r);
2109    vassert(n <= 15);
2110    return (n >> 3) & 1;
2111 }
2112 
2113 /* Produce a complete 4-bit integer register number. */
iregEnc3210(HReg r)2114 inline static UInt iregEnc3210 ( HReg r )
2115 {
2116    UInt n;
2117    vassert(hregClass(r) == HRcInt64);
2118    vassert(!hregIsVirtual(r));
2119    n = hregEncoding(r);
2120    vassert(n <= 15);
2121    return n;
2122 }
2123 
2124 /* Produce a complete 4-bit integer register number. */
vregEnc3210(HReg r)2125 inline static UInt vregEnc3210 ( HReg r )
2126 {
2127    UInt n;
2128    vassert(hregClass(r) == HRcVec128);
2129    vassert(!hregIsVirtual(r));
2130    n = hregEncoding(r);
2131    vassert(n <= 15);
2132    return n;
2133 }
2134 
mkModRegRM(UInt mod,UInt reg,UInt regmem)2135 inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
2136 {
2137    vassert(mod < 4);
2138    vassert((reg|regmem) < 8);
2139    return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
2140 }
2141 
mkSIB(UInt shift,UInt regindex,UInt regbase)2142 inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
2143 {
2144    vassert(shift < 4);
2145    vassert((regindex|regbase) < 8);
2146    return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
2147 }
2148 
emit32(UChar * p,UInt w32)2149 static UChar* emit32 ( UChar* p, UInt w32 )
2150 {
2151    *p++ = toUChar((w32)       & 0x000000FF);
2152    *p++ = toUChar((w32 >>  8) & 0x000000FF);
2153    *p++ = toUChar((w32 >> 16) & 0x000000FF);
2154    *p++ = toUChar((w32 >> 24) & 0x000000FF);
2155    return p;
2156 }
2157 
emit64(UChar * p,ULong w64)2158 static UChar* emit64 ( UChar* p, ULong w64 )
2159 {
2160    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
2161    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
2162    return p;
2163 }
2164 
2165 /* Does a sign-extend of the lowest 8 bits give
2166    the original number? */
fits8bits(UInt w32)2167 static Bool fits8bits ( UInt w32 )
2168 {
2169    Int i32 = (Int)w32;
2170    return toBool(i32 == ((Int)(w32 << 24) >> 24));
2171 }
2172 /* Can the lower 32 bits be signedly widened to produce the whole
2173    64-bit value?  In other words, are the top 33 bits either all 0 or
2174    all 1 ? */
fitsIn32Bits(ULong x)2175 static Bool fitsIn32Bits ( ULong x )
2176 {
2177    Long y1;
2178    y1 = x << 32;
2179    y1 >>=/*s*/ 32;
2180    return toBool(x == y1);
2181 }
2182 
2183 
2184 /* Forming mod-reg-rm bytes and scale-index-base bytes.
2185 
2186      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
2187                        =  00 greg ereg
2188 
2189      greg,  d8(ereg)   |  ereg is neither of: RSP R12
2190                        =  01 greg ereg, d8
2191 
2192      greg,  d32(ereg)  |  ereg is neither of: RSP R12
2193                        =  10 greg ereg, d32
2194 
2195      greg,  d8(ereg)   |  ereg is either: RSP R12
2196                        =  01 greg 100, 0x24, d8
2197                        (lowest bit of rex distinguishes R12/RSP)
2198 
2199      greg,  d32(ereg)  |  ereg is either: RSP R12
2200                        =  10 greg 100, 0x24, d32
2201                        (lowest bit of rex distinguishes R12/RSP)
2202 
2203      -----------------------------------------------
2204 
2205      greg,  d8(base,index,scale)
2206                |  index != RSP
2207                =  01 greg 100, scale index base, d8
2208 
2209      greg,  d32(base,index,scale)
2210                |  index != RSP
2211                =  10 greg 100, scale index base, d32
2212 */
doAMode_M__wrk(UChar * p,UInt gregEnc3210,AMD64AMode * am)2213 static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2214 {
2215    UInt gregEnc210 = gregEnc3210 & 7;
2216    if (am->tag == Aam_IR) {
2217       if (am->Aam.IR.imm == 0
2218           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2219           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2220           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2221           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2222          ) {
2223          *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
2224          return p;
2225       }
2226       if (fits8bits(am->Aam.IR.imm)
2227           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2228           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2229          ) {
2230          *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
2231          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2232          return p;
2233       }
2234       if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2235           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2236          ) {
2237          *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
2238          p = emit32(p, am->Aam.IR.imm);
2239          return p;
2240       }
2241       if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2242            || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2243           && fits8bits(am->Aam.IR.imm)) {
2244  	 *p++ = mkModRegRM(1, gregEnc210, 4);
2245          *p++ = 0x24;
2246          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2247          return p;
2248       }
2249       if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2250 	      || wait for test case for RSP case */
2251           sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2252  	 *p++ = mkModRegRM(2, gregEnc210, 4);
2253          *p++ = 0x24;
2254          p = emit32(p, am->Aam.IR.imm);
2255          return p;
2256       }
2257       ppAMD64AMode(am);
2258       vpanic("doAMode_M: can't emit amode IR");
2259       /*NOTREACHED*/
2260    }
2261    if (am->tag == Aam_IRRS) {
2262       if (fits8bits(am->Aam.IRRS.imm)
2263           && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2264          *p++ = mkModRegRM(1, gregEnc210, 4);
2265          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2266                                           iregEnc210(am->Aam.IRRS.base));
2267          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2268          return p;
2269       }
2270       if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2271          *p++ = mkModRegRM(2, gregEnc210, 4);
2272          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2273                                           iregEnc210(am->Aam.IRRS.base));
2274          p = emit32(p, am->Aam.IRRS.imm);
2275          return p;
2276       }
2277       ppAMD64AMode(am);
2278       vpanic("doAMode_M: can't emit amode IRRS");
2279       /*NOTREACHED*/
2280    }
2281    vpanic("doAMode_M: unknown amode");
2282    /*NOTREACHED*/
2283 }
2284 
doAMode_M(UChar * p,HReg greg,AMD64AMode * am)2285 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2286 {
2287    return doAMode_M__wrk(p, iregEnc3210(greg), am);
2288 }
2289 
doAMode_M_enc(UChar * p,UInt gregEnc3210,AMD64AMode * am)2290 static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2291 {
2292    vassert(gregEnc3210 < 16);
2293    return doAMode_M__wrk(p, gregEnc3210, am);
2294 }
2295 
2296 
2297 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2298 inline
doAMode_R__wrk(UChar * p,UInt gregEnc3210,UInt eregEnc3210)2299 static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2300 {
2301    *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
2302    return p;
2303 }
2304 
doAMode_R(UChar * p,HReg greg,HReg ereg)2305 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2306 {
2307    return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
2308 }
2309 
doAMode_R_enc_reg(UChar * p,UInt gregEnc3210,HReg ereg)2310 static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
2311 {
2312    vassert(gregEnc3210 < 16);
2313    return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
2314 }
2315 
doAMode_R_reg_enc(UChar * p,HReg greg,UInt eregEnc3210)2316 static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
2317 {
2318    vassert(eregEnc3210 < 16);
2319    return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
2320 }
2321 
doAMode_R_enc_enc(UChar * p,UInt gregEnc3210,UInt eregEnc3210)2322 static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2323 {
2324    vassert( (gregEnc3210|eregEnc3210) < 16);
2325    return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
2326 }
2327 
2328 
2329 /* Clear the W bit on a REX byte, thereby changing the operand size
2330    back to whatever that instruction's default operand size is. */
clearWBit(UChar rex)2331 static inline UChar clearWBit ( UChar rex )
2332 {
2333    return rex & ~(1<<3);
2334 }
2335 
setWBit(UChar rex)2336 static inline UChar setWBit ( UChar rex )
2337 {
2338    return rex | (1<<3);
2339 }
2340 
2341 
2342 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
rexAMode_M__wrk(UInt gregEnc3210,AMD64AMode * am)2343 inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
2344 {
2345    if (am->tag == Aam_IR) {
2346       UChar W = 1;  /* we want 64-bit mode */
2347       UChar R = (gregEnc3210 >> 3) & 1;
2348       UChar X = 0; /* not relevant */
2349       UChar B = iregEnc3(am->Aam.IR.reg);
2350       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2351    }
2352    if (am->tag == Aam_IRRS) {
2353       UChar W = 1;  /* we want 64-bit mode */
2354       UChar R = (gregEnc3210 >> 3) & 1;
2355       UChar X = iregEnc3(am->Aam.IRRS.index);
2356       UChar B = iregEnc3(am->Aam.IRRS.base);
2357       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2358    }
2359    vassert(0);
2360    return 0; /*NOTREACHED*/
2361 }
2362 
rexAMode_M(HReg greg,AMD64AMode * am)2363 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2364 {
2365    return rexAMode_M__wrk(iregEnc3210(greg), am);
2366 }
2367 
rexAMode_M_enc(UInt gregEnc3210,AMD64AMode * am)2368 static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
2369 {
2370    vassert(gregEnc3210 < 16);
2371    return rexAMode_M__wrk(gregEnc3210, am);
2372 }
2373 
2374 
2375 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
rexAMode_R__wrk(UInt gregEnc3210,UInt eregEnc3210)2376 inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
2377 {
2378    UChar W = 1;  /* we want 64-bit mode */
2379    UChar R = (gregEnc3210 >> 3) & 1;
2380    UChar X = 0; /* not relevant */
2381    UChar B = (eregEnc3210 >> 3) & 1;
2382    return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2383 }
2384 
rexAMode_R(HReg greg,HReg ereg)2385 static UChar rexAMode_R ( HReg greg, HReg ereg )
2386 {
2387    return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
2388 }
2389 
rexAMode_R_enc_reg(UInt gregEnc3210,HReg ereg)2390 static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
2391 {
2392    vassert(gregEnc3210 < 16);
2393    return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
2394 }
2395 
rexAMode_R_reg_enc(HReg greg,UInt eregEnc3210)2396 static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
2397 {
2398    vassert(eregEnc3210 < 16);
2399    return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
2400 }
2401 
rexAMode_R_enc_enc(UInt gregEnc3210,UInt eregEnc3210)2402 static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
2403 {
2404    vassert((gregEnc3210|eregEnc3210) < 16);
2405    return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
2406 }
2407 
2408 
2409 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
2410 //uu    verified correct (I reckon).  Certainly it has been known to
2411 //uu    produce correct VEX prefixes during testing. */
2412 //uu
2413 //uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
2414 //uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
2415 //uu    in verbatim.  There's no range checking on the bits. */
2416 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2417 //uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
2418 //uu                             UInt L, UInt pp )
2419 //uu {
2420 //uu    UChar byte0 = 0;
2421 //uu    UChar byte1 = 0;
2422 //uu    UChar byte2 = 0;
2423 //uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2424 //uu       /* 2 byte encoding is possible. */
2425 //uu       byte0 = 0xC5;
2426 //uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2427 //uu               | (L << 2) | pp;
2428 //uu    } else {
2429 //uu       /* 3 byte encoding is needed. */
2430 //uu       byte0 = 0xC4;
2431 //uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2432 //uu               | ((rexB ^ 1) << 5) | mmmmm;
2433 //uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2434 //uu    }
2435 //uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2436 //uu }
2437 //uu
2438 //uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
2439 //uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2440 //uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
2441 //uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2442 //uu    vvvv=1111 (unused 3rd reg). */
2443 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2444 //uu {
2445 //uu    UChar L       = 1; /* size = 256 */
2446 //uu    UChar pp      = 0; /* no SIMD prefix */
2447 //uu    UChar mmmmm   = 1; /* 0F */
2448 //uu    UChar notVvvv = 0; /* unused */
2449 //uu    UChar rexW    = 0;
2450 //uu    UChar rexR    = 0;
2451 //uu    UChar rexX    = 0;
2452 //uu    UChar rexB    = 0;
2453 //uu    /* Same logic as in rexAMode_M. */
2454 //uu    if (am->tag == Aam_IR) {
2455 //uu       rexR = iregEnc3(greg);
2456 //uu       rexX = 0; /* not relevant */
2457 //uu       rexB = iregEnc3(am->Aam.IR.reg);
2458 //uu    }
2459 //uu    else if (am->tag == Aam_IRRS) {
2460 //uu       rexR = iregEnc3(greg);
2461 //uu       rexX = iregEnc3(am->Aam.IRRS.index);
2462 //uu       rexB = iregEnc3(am->Aam.IRRS.base);
2463 //uu    } else {
2464 //uu       vassert(0);
2465 //uu    }
2466 //uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2467 //uu }
2468 //uu
2469 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2470 //uu {
2471 //uu    switch (vex & 0xFF) {
2472 //uu       case 0xC5:
2473 //uu          *p++ = 0xC5;
2474 //uu          *p++ = (vex >> 8) & 0xFF;
2475 //uu          vassert(0 == (vex >> 16));
2476 //uu          break;
2477 //uu       case 0xC4:
2478 //uu          *p++ = 0xC4;
2479 //uu          *p++ = (vex >> 8) & 0xFF;
2480 //uu          *p++ = (vex >> 16) & 0xFF;
2481 //uu          vassert(0 == (vex >> 24));
2482 //uu          break;
2483 //uu       default:
2484 //uu          vassert(0);
2485 //uu    }
2486 //uu    return p;
2487 //uu }
2488 
2489 
2490 /* Emit ffree %st(N) */
do_ffree_st(UChar * p,Int n)2491 static UChar* do_ffree_st ( UChar* p, Int n )
2492 {
2493    vassert(n >= 0 && n <= 7);
2494    *p++ = 0xDD;
2495    *p++ = toUChar(0xC0 + n);
2496    return p;
2497 }
2498 
2499 /* Emit an instruction into buf and return the number of bytes used.
2500    Note that buf is not the insn's final place, and therefore it is
2501    imperative to emit position-independent code.  If the emitted
2502    instruction was a profiler inc, set *is_profInc to True, else
2503    leave it unchanged. */
2504 
emit_AMD64Instr(Bool * is_profInc,UChar * buf,Int nbuf,const AMD64Instr * i,Bool mode64,VexEndness endness_host,const void * disp_cp_chain_me_to_slowEP,const void * disp_cp_chain_me_to_fastEP,const void * disp_cp_xindir,const void * disp_cp_xassisted)2505 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2506                       UChar* buf, Int nbuf, const AMD64Instr* i,
2507                       Bool mode64, VexEndness endness_host,
2508                       const void* disp_cp_chain_me_to_slowEP,
2509                       const void* disp_cp_chain_me_to_fastEP,
2510                       const void* disp_cp_xindir,
2511                       const void* disp_cp_xassisted )
2512 {
2513    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2514    UInt   xtra;
2515    UInt   reg;
2516    UChar  rex;
2517    UChar* p = &buf[0];
2518    UChar* ptmp;
2519    Int    j;
2520    vassert(nbuf >= 64);
2521    vassert(mode64 == True);
2522 
2523    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2524 
2525    switch (i->tag) {
2526 
2527    case Ain_Imm64:
2528       if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2529          /* Use the short form (load into 32 bit reg, + default
2530             widening rule) for constants under 1 million.  We could
2531             use this form for the range 0 to 0x7FFFFFFF inclusive, but
2532             limit it to a smaller range for verifiability purposes. */
2533          if (1 & iregEnc3(i->Ain.Imm64.dst))
2534             *p++ = 0x41;
2535          *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
2536          p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2537       } else {
2538          *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
2539          *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
2540          p = emit64(p, i->Ain.Imm64.imm64);
2541       }
2542       goto done;
2543 
2544    case Ain_Alu64R:
2545       /* Deal specially with MOV */
2546       if (i->Ain.Alu64R.op == Aalu_MOV) {
2547          switch (i->Ain.Alu64R.src->tag) {
2548             case Armi_Imm:
2549                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2550                   /* Actually we could use this form for constants in
2551                      the range 0 through 0x7FFFFFFF inclusive, but
2552                      limit it to a small range for verifiability
2553                      purposes. */
2554                   /* Generate "movl $imm32, 32-bit-register" and let
2555                      the default zero-extend rule cause the upper half
2556                      of the dst to be zeroed out too.  This saves 1
2557                      and sometimes 2 bytes compared to the more
2558                      obvious encoding in the 'else' branch. */
2559                   if (1 & iregEnc3(i->Ain.Alu64R.dst))
2560                      *p++ = 0x41;
2561                   *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
2562                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2563                } else {
2564                   *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
2565                   *p++ = 0xC7;
2566                   *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
2567                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2568                }
2569                goto done;
2570             case Armi_Reg:
2571                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2572                                   i->Ain.Alu64R.dst );
2573                *p++ = 0x89;
2574                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2575                                 i->Ain.Alu64R.dst);
2576                goto done;
2577             case Armi_Mem:
2578                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2579                                  i->Ain.Alu64R.src->Armi.Mem.am);
2580                *p++ = 0x8B;
2581                p = doAMode_M(p, i->Ain.Alu64R.dst,
2582                                 i->Ain.Alu64R.src->Armi.Mem.am);
2583                goto done;
2584             default:
2585                goto bad;
2586          }
2587       }
2588       /* MUL */
2589       if (i->Ain.Alu64R.op == Aalu_MUL) {
2590          switch (i->Ain.Alu64R.src->tag) {
2591             case Armi_Reg:
2592                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2593                                   i->Ain.Alu64R.src->Armi.Reg.reg);
2594                *p++ = 0x0F;
2595                *p++ = 0xAF;
2596                p = doAMode_R(p, i->Ain.Alu64R.dst,
2597                                 i->Ain.Alu64R.src->Armi.Reg.reg);
2598                goto done;
2599             case Armi_Mem:
2600                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2601                                  i->Ain.Alu64R.src->Armi.Mem.am);
2602                *p++ = 0x0F;
2603                *p++ = 0xAF;
2604                p = doAMode_M(p, i->Ain.Alu64R.dst,
2605                                 i->Ain.Alu64R.src->Armi.Mem.am);
2606                goto done;
2607             case Armi_Imm:
2608                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2609                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2610                   *p++ = 0x6B;
2611                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2612                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2613                } else {
2614                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2615                   *p++ = 0x69;
2616                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2617                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2618                }
2619                goto done;
2620             default:
2621                goto bad;
2622          }
2623       }
2624       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2625       opc = opc_rr = subopc_imm = opc_imma = 0;
2626       switch (i->Ain.Alu64R.op) {
2627          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2628                         subopc_imm = 2; opc_imma = 0x15; break;
2629          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2630                         subopc_imm = 0; opc_imma = 0x05; break;
2631          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2632                         subopc_imm = 5; opc_imma = 0x2D; break;
2633          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2634                         subopc_imm = 3; opc_imma = 0x1D; break;
2635          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2636                         subopc_imm = 4; opc_imma = 0x25; break;
2637          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2638                         subopc_imm = 6; opc_imma = 0x35; break;
2639          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2640                         subopc_imm = 1; opc_imma = 0x0D; break;
2641          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2642                         subopc_imm = 7; opc_imma = 0x3D; break;
2643          default: goto bad;
2644       }
2645       switch (i->Ain.Alu64R.src->tag) {
2646          case Armi_Imm:
2647             if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2648                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2649                goto bad; /* FIXME: awaiting test case */
2650                *p++ = toUChar(opc_imma);
2651                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2652             } else
2653             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2654                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
2655                *p++ = 0x83;
2656                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2657                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2658             } else {
2659                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
2660                *p++ = 0x81;
2661                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2662                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2663             }
2664             goto done;
2665          case Armi_Reg:
2666             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2667                                i->Ain.Alu64R.dst);
2668             *p++ = toUChar(opc_rr);
2669             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2670                              i->Ain.Alu64R.dst);
2671             goto done;
2672          case Armi_Mem:
2673             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2674                                i->Ain.Alu64R.src->Armi.Mem.am);
2675             *p++ = toUChar(opc);
2676             p = doAMode_M(p, i->Ain.Alu64R.dst,
2677                              i->Ain.Alu64R.src->Armi.Mem.am);
2678             goto done;
2679          default:
2680             goto bad;
2681       }
2682       break;
2683 
2684    case Ain_Alu64M:
2685       /* Deal specially with MOV */
2686       if (i->Ain.Alu64M.op == Aalu_MOV) {
2687          switch (i->Ain.Alu64M.src->tag) {
2688             case Ari_Reg:
2689                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2690                                  i->Ain.Alu64M.dst);
2691                *p++ = 0x89;
2692                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2693                                 i->Ain.Alu64M.dst);
2694                goto done;
2695             case Ari_Imm:
2696                *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
2697                *p++ = 0xC7;
2698                p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
2699                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2700                goto done;
2701             default:
2702                goto bad;
2703          }
2704       }
2705       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
2706          allowed here. (This is derived from the x86 version of same). */
2707       opc = subopc_imm = opc_imma = 0;
2708       switch (i->Ain.Alu64M.op) {
2709          case Aalu_CMP: opc = 0x39; subopc_imm = 7; break;
2710          default: goto bad;
2711       }
2712       switch (i->Ain.Alu64M.src->tag) {
2713          /*
2714          case Xri_Reg:
2715             *p++ = toUChar(opc);
2716             p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2717                              i->Xin.Alu32M.dst);
2718             goto done;
2719          */
2720          case Ari_Imm:
2721             if (fits8bits(i->Ain.Alu64M.src->Ari.Imm.imm32)) {
2722                *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2723                *p++ = 0x83;
2724                p    = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2725                *p++ = toUChar(0xFF & i->Ain.Alu64M.src->Ari.Imm.imm32);
2726                goto done;
2727             } else {
2728                *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2729                *p++ = 0x81;
2730                p    = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2731                p    = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2732                goto done;
2733             }
2734          default:
2735             goto bad;
2736       }
2737 
2738       break;
2739 
2740    case Ain_Sh64:
2741       opc_cl = opc_imm = subopc = 0;
2742       switch (i->Ain.Sh64.op) {
2743          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2744          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2745          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2746          default: goto bad;
2747       }
2748       if (i->Ain.Sh64.src == 0) {
2749          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2750          *p++ = toUChar(opc_cl);
2751          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2752          goto done;
2753       } else {
2754          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2755          *p++ = toUChar(opc_imm);
2756          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2757          *p++ = (UChar)(i->Ain.Sh64.src);
2758          goto done;
2759       }
2760       break;
2761 
2762    case Ain_Test64:
2763       /* testq sign-extend($imm32), %reg */
2764       *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
2765       *p++ = 0xF7;
2766       p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
2767       p = emit32(p, i->Ain.Test64.imm32);
2768       goto done;
2769 
2770    case Ain_Unary64:
2771       if (i->Ain.Unary64.op == Aun_NOT) {
2772          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2773          *p++ = 0xF7;
2774          p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
2775          goto done;
2776       }
2777       if (i->Ain.Unary64.op == Aun_NEG) {
2778          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2779          *p++ = 0xF7;
2780          p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
2781          goto done;
2782       }
2783       break;
2784 
2785    case Ain_Lea64:
2786       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2787       *p++ = 0x8D;
2788       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2789       goto done;
2790 
2791    case Ain_Alu32R:
2792       /* ADD/SUB/AND/OR/XOR/CMP */
2793       opc = opc_rr = subopc_imm = opc_imma = 0;
2794       switch (i->Ain.Alu32R.op) {
2795          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2796                         subopc_imm = 0; opc_imma = 0x05; break;
2797          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2798                         subopc_imm = 5; opc_imma = 0x2D; break;
2799          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2800                         subopc_imm = 4; opc_imma = 0x25; break;
2801          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2802                         subopc_imm = 6; opc_imma = 0x35; break;
2803          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2804                         subopc_imm = 1; opc_imma = 0x0D; break;
2805          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2806                         subopc_imm = 7; opc_imma = 0x3D; break;
2807          default: goto bad;
2808       }
2809       switch (i->Ain.Alu32R.src->tag) {
2810          case Armi_Imm:
2811             if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2812                 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2813                goto bad; /* FIXME: awaiting test case */
2814                *p++ = toUChar(opc_imma);
2815                p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2816             } else
2817             if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2818                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
2819                if (rex != 0x40) *p++ = rex;
2820                *p++ = 0x83;
2821                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2822                *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2823             } else {
2824                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
2825                if (rex != 0x40) *p++ = rex;
2826                *p++ = 0x81;
2827                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2828                p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2829             }
2830             goto done;
2831          case Armi_Reg:
2832             rex  = clearWBit(
2833                    rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2834                                i->Ain.Alu32R.dst) );
2835             if (rex != 0x40) *p++ = rex;
2836             *p++ = toUChar(opc_rr);
2837             p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2838                              i->Ain.Alu32R.dst);
2839             goto done;
2840          case Armi_Mem:
2841             rex  = clearWBit(
2842                    rexAMode_M( i->Ain.Alu32R.dst,
2843                                i->Ain.Alu32R.src->Armi.Mem.am) );
2844             if (rex != 0x40) *p++ = rex;
2845             *p++ = toUChar(opc);
2846             p = doAMode_M(p, i->Ain.Alu32R.dst,
2847                              i->Ain.Alu32R.src->Armi.Mem.am);
2848             goto done;
2849          default:
2850             goto bad;
2851       }
2852       break;
2853 
2854    case Ain_MulL:
2855       subopc = i->Ain.MulL.syned ? 5 : 4;
2856       switch (i->Ain.MulL.src->tag)  {
2857          case Arm_Mem:
2858             *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
2859             *p++ = 0xF7;
2860             p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
2861             goto done;
2862          case Arm_Reg:
2863             *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
2864             *p++ = 0xF7;
2865             p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
2866             goto done;
2867          default:
2868             goto bad;
2869       }
2870       break;
2871 
2872    case Ain_Div:
2873       subopc = i->Ain.Div.syned ? 7 : 6;
2874       if (i->Ain.Div.sz == 4) {
2875          switch (i->Ain.Div.src->tag)  {
2876             case Arm_Mem:
2877                goto bad;
2878                /*FIXME*/
2879                *p++ = 0xF7;
2880                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2881                goto done;
2882             case Arm_Reg:
2883                *p++ = clearWBit(
2884                       rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
2885                *p++ = 0xF7;
2886                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2887                goto done;
2888             default:
2889                goto bad;
2890          }
2891       }
2892       if (i->Ain.Div.sz == 8) {
2893          switch (i->Ain.Div.src->tag)  {
2894             case Arm_Mem:
2895                *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
2896                *p++ = 0xF7;
2897                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2898                goto done;
2899             case Arm_Reg:
2900                *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
2901                *p++ = 0xF7;
2902                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2903                goto done;
2904             default:
2905                goto bad;
2906          }
2907       }
2908       break;
2909 
2910    case Ain_Push:
2911       switch (i->Ain.Push.src->tag) {
2912          case Armi_Mem:
2913             *p++ = clearWBit(
2914                    rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
2915             *p++ = 0xFF;
2916             p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
2917             goto done;
2918          case Armi_Imm:
2919             *p++ = 0x68;
2920             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2921             goto done;
2922          case Armi_Reg:
2923             *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
2924             *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
2925             goto done;
2926         default:
2927             goto bad;
2928       }
2929 
2930    case Ain_Call: {
2931       /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
2932          above, %r11 is used as an address temporary. */
2933       /* If we don't need to do any fixup actions in the case that the
2934          call doesn't happen, just do the simple thing and emit
2935          straight-line code.  This is usually the case. */
2936       if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
2937           || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
2938          /* jump over the following two insns if the condition does
2939             not hold */
2940          Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
2941          if (i->Ain.Call.cond != Acc_ALWAYS) {
2942             *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2943             *p++ = shortImm ? 10 : 13;
2944             /* 10 or 13 bytes in the next two insns */
2945          }
2946          if (shortImm) {
2947             /* 7 bytes: movl sign-extend(imm32), %r11 */
2948             *p++ = 0x49;
2949             *p++ = 0xC7;
2950             *p++ = 0xC3;
2951             p = emit32(p, (UInt)i->Ain.Call.target);
2952          } else {
2953             /* 10 bytes: movabsq $target, %r11 */
2954             *p++ = 0x49;
2955             *p++ = 0xBB;
2956             p = emit64(p, i->Ain.Call.target);
2957          }
2958          /* 3 bytes: call *%r11 */
2959          *p++ = 0x41;
2960          *p++ = 0xFF;
2961          *p++ = 0xD3;
2962       } else {
2963          Int delta;
2964          /* Complex case.  We have to generate an if-then-else diamond. */
2965          // before:
2966          //   j{!cond} else:
2967          //   movabsq $target, %r11
2968          //   call* %r11
2969          // preElse:
2970          //   jmp after:
2971          // else:
2972          //   movabsq $0x5555555555555555, %rax  // possibly
2973          //   movq %rax, %rdx                    // possibly
2974          // after:
2975 
2976          // before:
2977          UChar* pBefore = p;
2978 
2979          //   j{!cond} else:
2980          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2981          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2982 
2983          //   movabsq $target, %r11
2984          *p++ = 0x49;
2985          *p++ = 0xBB;
2986          p = emit64(p, i->Ain.Call.target);
2987 
2988          //   call* %r11
2989          *p++ = 0x41;
2990          *p++ = 0xFF;
2991          *p++ = 0xD3;
2992 
2993          // preElse:
2994          UChar* pPreElse = p;
2995 
2996          //   jmp after:
2997          *p++ = 0xEB;
2998          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2999 
3000          // else:
3001          UChar* pElse = p;
3002 
3003          /* Do the 'else' actions */
3004          switch (i->Ain.Call.rloc.pri) {
3005             case RLPri_Int:
3006                // movabsq $0x5555555555555555, %rax
3007                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
3008                break;
3009             case RLPri_2Int:
3010                goto bad; //ATC
3011                // movabsq $0x5555555555555555, %rax
3012                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
3013                // movq %rax, %rdx
3014                *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
3015                break;
3016             case RLPri_V128SpRel:
3017                if (i->Ain.Call.rloc.spOff == 0) {
3018                   // We could accept any |spOff| here, but that's more
3019                   // hassle and the only value we're ever going to get
3020                   // is zero (I believe.)  Hence take the easy path :)
3021                   // We need a scag register -- r11 can be it.
3022                   // movabsq $0x5555555555555555, %r11
3023                   *p++ = 0x49; *p++ = 0xBB;
3024                   p = emit64(p, 0x5555555555555555ULL);
3025                   // movq %r11, 0(%rsp)
3026                   *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
3027                   // movq %r11, 8(%rsp)
3028                   *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
3029                   *p++ = 0x08;
3030                   break;
3031                }
3032                goto bad; //ATC for all other spOff values
3033             case RLPri_V256SpRel:
3034                goto bad; //ATC
3035             case RLPri_None: case RLPri_INVALID: default:
3036                vassert(0); // should never get here
3037          }
3038 
3039          // after:
3040          UChar* pAfter = p;
3041 
3042          // Fix up the branch offsets.  The +2s in the offset
3043          // calculations are there because x86 requires conditional
3044          // branches to have their offset stated relative to the
3045          // instruction immediately following the branch insn.  And in
3046          // both cases the branch insns are 2 bytes long.
3047 
3048          // First, the "j{!cond} else:" at pBefore.
3049          delta = (Int)(Long)(pElse - (pBefore + 2));
3050          vassert(delta >= 0 && delta < 100/*arbitrary*/);
3051          *(pBefore+1) = (UChar)delta;
3052 
3053          // And secondly, the "jmp after:" at pPreElse.
3054          delta = (Int)(Long)(pAfter - (pPreElse + 2));
3055          vassert(delta >= 0 && delta < 100/*arbitrary*/);
3056          *(pPreElse+1) = (UChar)delta;
3057       }
3058       goto done;
3059    }
3060 
3061    case Ain_XDirect: {
3062       /* NB: what goes on here has to be very closely coordinated with the
3063          chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
3064       /* We're generating chain-me requests here, so we need to be
3065          sure this is actually allowed -- no-redir translations can't
3066          use chain-me's.  Hence: */
3067       vassert(disp_cp_chain_me_to_slowEP != NULL);
3068       vassert(disp_cp_chain_me_to_fastEP != NULL);
3069 
3070       HReg r11 = hregAMD64_R11();
3071 
3072       /* Use ptmp for backpatching conditional jumps. */
3073       ptmp = NULL;
3074 
3075       /* First off, if this is conditional, create a conditional
3076          jump over the rest of it. */
3077       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3078          /* jmp fwds if !condition */
3079          *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
3080          ptmp = p; /* fill in this bit later */
3081          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3082       }
3083 
3084       /* Update the guest RIP. */
3085       if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
3086          /* use a shorter encoding */
3087          /* movl sign-extend(dstGA), %r11 */
3088          *p++ = 0x49;
3089          *p++ = 0xC7;
3090          *p++ = 0xC3;
3091          p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
3092       } else {
3093          /* movabsq $dstGA, %r11 */
3094          *p++ = 0x49;
3095          *p++ = 0xBB;
3096          p = emit64(p, i->Ain.XDirect.dstGA);
3097       }
3098 
3099       /* movq %r11, amRIP */
3100       *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
3101       *p++ = 0x89;
3102       p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
3103 
3104       /* --- FIRST PATCHABLE BYTE follows --- */
3105       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
3106          to) backs up the return address, so as to find the address of
3107          the first patchable byte.  So: don't change the length of the
3108          two instructions below. */
3109       /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
3110       *p++ = 0x49;
3111       *p++ = 0xBB;
3112       const void* disp_cp_chain_me
3113                = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
3114                                          : disp_cp_chain_me_to_slowEP;
3115       p = emit64(p, (Addr)disp_cp_chain_me);
3116       /* call *%r11 */
3117       *p++ = 0x41;
3118       *p++ = 0xFF;
3119       *p++ = 0xD3;
3120       /* --- END of PATCHABLE BYTES --- */
3121 
3122       /* Fix up the conditional jump, if there was one. */
3123       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3124          Int delta = p - ptmp;
3125          vassert(delta > 0 && delta < 40);
3126          *ptmp = toUChar(delta-1);
3127       }
3128       goto done;
3129    }
3130 
3131    case Ain_XIndir: {
3132       /* We're generating transfers that could lead indirectly to a
3133          chain-me, so we need to be sure this is actually allowed --
3134          no-redir translations are not allowed to reach normal
3135          translations without going through the scheduler.  That means
3136          no XDirects or XIndirs out from no-redir translations.
3137          Hence: */
3138       vassert(disp_cp_xindir != NULL);
3139 
3140       /* Use ptmp for backpatching conditional jumps. */
3141       ptmp = NULL;
3142 
3143       /* First off, if this is conditional, create a conditional
3144          jump over the rest of it. */
3145       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3146          /* jmp fwds if !condition */
3147          *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
3148          ptmp = p; /* fill in this bit later */
3149          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3150       }
3151 
3152       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3153       *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3154       *p++ = 0x89;
3155       p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3156 
3157       /* get $disp_cp_xindir into %r11 */
3158       if (fitsIn32Bits((Addr)disp_cp_xindir)) {
3159          /* use a shorter encoding */
3160          /* movl sign-extend(disp_cp_xindir), %r11 */
3161          *p++ = 0x49;
3162          *p++ = 0xC7;
3163          *p++ = 0xC3;
3164          p = emit32(p, (UInt)(Addr)disp_cp_xindir);
3165       } else {
3166          /* movabsq $disp_cp_xindir, %r11 */
3167          *p++ = 0x49;
3168          *p++ = 0xBB;
3169          p = emit64(p, (Addr)disp_cp_xindir);
3170       }
3171 
3172       /* jmp *%r11 */
3173       *p++ = 0x41;
3174       *p++ = 0xFF;
3175       *p++ = 0xE3;
3176 
3177       /* Fix up the conditional jump, if there was one. */
3178       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3179          Int delta = p - ptmp;
3180          vassert(delta > 0 && delta < 40);
3181          *ptmp = toUChar(delta-1);
3182       }
3183       goto done;
3184    }
3185 
3186    case Ain_XAssisted: {
3187       /* Use ptmp for backpatching conditional jumps. */
3188       ptmp = NULL;
3189 
3190       /* First off, if this is conditional, create a conditional
3191          jump over the rest of it. */
3192       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3193          /* jmp fwds if !condition */
3194          *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
3195          ptmp = p; /* fill in this bit later */
3196          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3197       }
3198 
3199       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3200       *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3201       *p++ = 0x89;
3202       p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3203       /* movl $magic_number, %ebp.  Since these numbers are all small positive
3204          integers, we can get away with "movl $N, %ebp" rather than
3205          the longer "movq $N, %rbp". */
3206       UInt trcval = 0;
3207       switch (i->Ain.XAssisted.jk) {
3208          case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
3209          case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
3210          case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
3211          case Ijk_Sys_int210:  trcval = VEX_TRC_JMP_SYS_INT210;  break;
3212          case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
3213          case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
3214          case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
3215          case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
3216          case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
3217          case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
3218          case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
3219          case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
3220          case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
3221          /* We don't expect to see the following being assisted. */
3222          case Ijk_Ret:
3223          case Ijk_Call:
3224          /* fallthrough */
3225          default:
3226             ppIRJumpKind(i->Ain.XAssisted.jk);
3227             vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
3228       }
3229       vassert(trcval != 0);
3230       *p++ = 0xBD;
3231       p = emit32(p, trcval);
3232       /* movabsq $disp_assisted, %r11 */
3233       *p++ = 0x49;
3234       *p++ = 0xBB;
3235       p = emit64(p, (Addr)disp_cp_xassisted);
3236       /* jmp *%r11 */
3237       *p++ = 0x41;
3238       *p++ = 0xFF;
3239       *p++ = 0xE3;
3240 
3241       /* Fix up the conditional jump, if there was one. */
3242       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3243          Int delta = p - ptmp;
3244          vassert(delta > 0 && delta < 40);
3245          *ptmp = toUChar(delta-1);
3246       }
3247       goto done;
3248    }
3249 
3250    case Ain_CMov64:
3251       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
3252       *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
3253       *p++ = 0x0F;
3254       *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
3255       p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
3256       goto done;
3257 
3258    case Ain_CLoad: {
3259       vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
3260 
3261       /* Only 32- or 64-bit variants are allowed. */
3262       vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
3263 
3264       /* Use ptmp for backpatching conditional jumps. */
3265       ptmp = NULL;
3266 
3267       /* jmp fwds if !condition */
3268       *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
3269       ptmp = p; /* fill in this bit later */
3270       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3271 
3272       /* Now the load.  Either a normal 64 bit load or a normal 32 bit
3273          load, which, by the default zero-extension rule, zeroes out
3274          the upper half of the destination, as required. */
3275       rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3276       *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
3277       *p++ = 0x8B;
3278       p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3279 
3280       /* Fix up the conditional branch */
3281       Int delta = p - ptmp;
3282       vassert(delta > 0 && delta < 40);
3283       *ptmp = toUChar(delta-1);
3284       goto done;
3285    }
3286 
3287    case Ain_CStore: {
3288       /* AFAICS this is identical to Ain_CLoad except that the opcode
3289          is 0x89 instead of 0x8B. */
3290       vassert(i->Ain.CStore.cond != Acc_ALWAYS);
3291 
3292       /* Only 32- or 64-bit variants are allowed. */
3293       vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
3294 
3295       /* Use ptmp for backpatching conditional jumps. */
3296       ptmp = NULL;
3297 
3298       /* jmp fwds if !condition */
3299       *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
3300       ptmp = p; /* fill in this bit later */
3301       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3302 
3303       /* Now the store. */
3304       rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
3305       *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
3306       *p++ = 0x89;
3307       p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
3308 
3309       /* Fix up the conditional branch */
3310       Int delta = p - ptmp;
3311       vassert(delta > 0 && delta < 40);
3312       *ptmp = toUChar(delta-1);
3313       goto done;
3314    }
3315 
3316    case Ain_MovxLQ:
3317       /* No, _don't_ ask me why the sense of the args has to be
3318          different in the S vs Z case.  I don't know. */
3319       if (i->Ain.MovxLQ.syned) {
3320          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
3321          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3322          *p++ = 0x63;
3323          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3324       } else {
3325          /* Produce a 32-bit reg-reg move, since the implicit
3326             zero-extend does what we want. */
3327          *p++ = clearWBit (
3328                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
3329          *p++ = 0x89;
3330          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
3331       }
3332       goto done;
3333 
3334    case Ain_LoadEX:
3335       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
3336          /* movzbq */
3337          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3338          *p++ = 0x0F;
3339          *p++ = 0xB6;
3340          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3341          goto done;
3342       }
3343       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
3344          /* movzwq */
3345          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3346          *p++ = 0x0F;
3347          *p++ = 0xB7;
3348          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3349          goto done;
3350       }
3351       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
3352          /* movzlq */
3353          /* This isn't really an existing AMD64 instruction per se.
3354             Rather, we have to do a 32-bit load.  Because a 32-bit
3355             write implicitly clears the upper 32 bits of the target
3356             register, we get what we want. */
3357          *p++ = clearWBit(
3358                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
3359          *p++ = 0x8B;
3360          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3361          goto done;
3362       }
3363       break;
3364 
3365    case Ain_Set64:
3366       /* Make the destination register be 1 or 0, depending on whether
3367          the relevant condition holds.  Complication: the top 56 bits
3368          of the destination should be forced to zero, but doing 'xorq
3369          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
3370          start off my moving $0 into the dest. */
3371       reg = iregEnc3210(i->Ain.Set64.dst);
3372       vassert(reg < 16);
3373 
3374       /* movq $0, %dst */
3375       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
3376       *p++ = 0xC7;
3377       *p++ = toUChar(0xC0 + (reg & 7));
3378       p = emit32(p, 0);
3379 
3380       /* setb lo8(%dst) */
3381       /* note, 8-bit register rex trickyness.  Be careful here. */
3382       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
3383       *p++ = 0x0F;
3384       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
3385       *p++ = toUChar(0xC0 + (reg & 7));
3386       goto done;
3387 
3388    case Ain_Bsfr64:
3389       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3390       *p++ = 0x0F;
3391       if (i->Ain.Bsfr64.isFwds) {
3392          *p++ = 0xBC;
3393       } else {
3394          *p++ = 0xBD;
3395       }
3396       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3397       goto done;
3398 
3399    case Ain_MFence:
3400       /* mfence */
3401       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3402       goto done;
3403 
3404    case Ain_ACAS:
3405       /* lock */
3406       *p++ = 0xF0;
3407       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3408       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
3409          in %rbx.  The new-value register is hardwired to be %rbx
3410          since dealing with byte integer registers is too much hassle,
3411          so we force the register operand to %rbx (could equally be
3412          %rcx or %rdx). */
3413       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3414       if (i->Ain.ACAS.sz != 8)
3415          rex = clearWBit(rex);
3416 
3417       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3418       *p++ = 0x0F;
3419       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3420       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3421       goto done;
3422 
3423    case Ain_DACAS:
3424       /* lock */
3425       *p++ = 0xF0;
3426       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
3427          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
3428          aren't encoded in the insn. */
3429       rex = rexAMode_M_enc(1, i->Ain.ACAS.addr );
3430       if (i->Ain.ACAS.sz != 8)
3431          rex = clearWBit(rex);
3432       *p++ = rex;
3433       *p++ = 0x0F;
3434       *p++ = 0xC7;
3435       p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
3436       goto done;
3437 
3438    case Ain_A87Free:
3439       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3440       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3441          p = do_ffree_st(p, 7-j);
3442       }
3443       goto done;
3444 
3445    case Ain_A87PushPop:
3446       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3447       if (i->Ain.A87PushPop.isPush) {
3448          /* Load from memory into %st(0): flds/fldl amode */
3449          *p++ = clearWBit(
3450                    rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
3451          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3452 	 p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
3453       } else {
3454          /* Dump %st(0) to memory: fstps/fstpl amode */
3455          *p++ = clearWBit(
3456                    rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
3457          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3458          p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
3459          goto done;
3460       }
3461       goto done;
3462 
3463    case Ain_A87FpOp:
3464       switch (i->Ain.A87FpOp.op) {
3465          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
3466          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
3467          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
3468          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
3469          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
3470          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
3471          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
3472          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
3473          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3474          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
3475          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
3476          case Afp_TAN:
3477             /* fptan pushes 1.0 on the FP stack, except when the
3478                argument is out of range.  Hence we have to do the
3479                instruction, then inspect C2 to see if there is an out
3480                of range condition.  If there is, we skip the fincstp
3481                that is used by the in-range case to get rid of this
3482                extra 1.0 value. */
3483             *p++ = 0xD9; *p++ = 0xF2; // fptan
3484             *p++ = 0x50;              // pushq %rax
3485             *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3486             *p++ = 0x66; *p++ = 0xA9;
3487             *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3488             *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3489             *p++ = 0xD9; *p++ = 0xF7; // fincstp
3490             *p++ = 0x58;              // after_fincstp: popq %rax
3491             break;
3492          default:
3493             goto bad;
3494       }
3495       goto done;
3496 
3497    case Ain_A87LdCW:
3498       *p++ = clearWBit(
3499                 rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
3500       *p++ = 0xD9;
3501       p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
3502       goto done;
3503 
3504    case Ain_A87StSW:
3505       *p++ = clearWBit(
3506                 rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
3507       *p++ = 0xDD;
3508       p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
3509       goto done;
3510 
3511    case Ain_Store:
3512       if (i->Ain.Store.sz == 2) {
3513          /* This just goes to show the crazyness of the instruction
3514             set encoding.  We have to insert two prefix bytes, but be
3515             careful to avoid a conflict in what the size should be, by
3516             ensuring that REX.W = 0. */
3517          *p++ = 0x66; /* override to 16-bits */
3518 	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3519          *p++ = 0x89;
3520          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3521          goto done;
3522       }
3523       if (i->Ain.Store.sz == 4) {
3524 	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3525          *p++ = 0x89;
3526          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3527          goto done;
3528       }
3529       if (i->Ain.Store.sz == 1) {
3530          /* This is one place where it would be wrong to skip emitting
3531             a rex byte of 0x40, since the mere presence of rex changes
3532             the meaning of the byte register access.  Be careful. */
3533 	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3534          *p++ = 0x88;
3535          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3536          goto done;
3537       }
3538       break;
3539 
3540    case Ain_LdMXCSR:
3541       *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
3542       *p++ = 0x0F;
3543       *p++ = 0xAE;
3544       p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
3545       goto done;
3546 
3547    case Ain_SseUComIS:
3548       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
3549       /* ucomi[sd] %srcL, %srcR */
3550       if (i->Ain.SseUComIS.sz == 8) {
3551          *p++ = 0x66;
3552       } else {
3553          goto bad;
3554          vassert(i->Ain.SseUComIS.sz == 4);
3555       }
3556       *p++ = clearWBit (
3557              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
3558                                  vregEnc3210(i->Ain.SseUComIS.srcR) ));
3559       *p++ = 0x0F;
3560       *p++ = 0x2E;
3561       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
3562                                vregEnc3210(i->Ain.SseUComIS.srcR) );
3563       /* pushfq */
3564       *p++ = 0x9C;
3565       /* popq %dst */
3566       *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
3567       *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
3568       goto done;
3569 
3570    case Ain_SseSI2SF:
3571       /* cvssi2s[sd] %src, %dst */
3572       rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
3573                                 i->Ain.SseSI2SF.src );
3574       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3575       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3576       *p++ = 0x0F;
3577       *p++ = 0x2A;
3578       p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
3579                                 i->Ain.SseSI2SF.src );
3580       goto done;
3581 
3582    case Ain_SseSF2SI:
3583       /* cvss[sd]2si %src, %dst */
3584       rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
3585                                 vregEnc3210(i->Ain.SseSF2SI.src) );
3586       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3587       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3588       *p++ = 0x0F;
3589       *p++ = 0x2D;
3590       p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
3591                                 vregEnc3210(i->Ain.SseSF2SI.src) );
3592       goto done;
3593 
3594    case Ain_SseSDSS:
3595       /* cvtsd2ss/cvtss2sd %src, %dst */
3596       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3597       *p++ = clearWBit(
3598               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
3599                                   vregEnc3210(i->Ain.SseSDSS.src) ));
3600       *p++ = 0x0F;
3601       *p++ = 0x5A;
3602       p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
3603                                 vregEnc3210(i->Ain.SseSDSS.src) );
3604       goto done;
3605 
3606    case Ain_SseLdSt:
3607       if (i->Ain.SseLdSt.sz == 8) {
3608          *p++ = 0xF2;
3609       } else
3610       if (i->Ain.SseLdSt.sz == 4) {
3611          *p++ = 0xF3;
3612       } else
3613       if (i->Ain.SseLdSt.sz != 16) {
3614          vassert(0);
3615       }
3616       *p++ = clearWBit(
3617              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
3618                             i->Ain.SseLdSt.addr));
3619       *p++ = 0x0F;
3620       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3621       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
3622                            i->Ain.SseLdSt.addr);
3623       goto done;
3624 
3625    case Ain_SseCStore: {
3626       vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
3627 
3628       /* Use ptmp for backpatching conditional jumps. */
3629       ptmp = NULL;
3630 
3631       /* jmp fwds if !condition */
3632       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
3633       ptmp = p; /* fill in this bit later */
3634       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3635 
3636       /* Now the store. */
3637       *p++ = clearWBit(
3638              rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
3639                             i->Ain.SseCStore.addr));
3640       *p++ = 0x0F;
3641       *p++ = toUChar(0x11);
3642       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
3643                            i->Ain.SseCStore.addr);
3644 
3645       /* Fix up the conditional branch */
3646       Int delta = p - ptmp;
3647       vassert(delta > 0 && delta < 40);
3648       *ptmp = toUChar(delta-1);
3649       goto done;
3650    }
3651 
3652    case Ain_SseCLoad: {
3653       vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
3654 
3655       /* Use ptmp for backpatching conditional jumps. */
3656       ptmp = NULL;
3657 
3658       /* jmp fwds if !condition */
3659       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
3660       ptmp = p; /* fill in this bit later */
3661       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3662 
3663       /* Now the load. */
3664       *p++ = clearWBit(
3665              rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
3666                             i->Ain.SseCLoad.addr));
3667       *p++ = 0x0F;
3668       *p++ = toUChar(0x10);
3669       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
3670                            i->Ain.SseCLoad.addr);
3671 
3672       /* Fix up the conditional branch */
3673       Int delta = p - ptmp;
3674       vassert(delta > 0 && delta < 40);
3675       *ptmp = toUChar(delta-1);
3676       goto done;
3677    }
3678 
3679    case Ain_SseLdzLO:
3680       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3681       /* movs[sd] amode, %xmm-dst */
3682       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3683       *p++ = clearWBit(
3684              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
3685                             i->Ain.SseLdzLO.addr));
3686       *p++ = 0x0F;
3687       *p++ = 0x10;
3688       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
3689                            i->Ain.SseLdzLO.addr);
3690       goto done;
3691 
3692    case Ain_Sse32Fx4:
3693       xtra = 0;
3694       switch (i->Ain.Sse32Fx4.op) {
3695          case Asse_F2I: *p++ = 0x66; break;
3696          default: break;
3697       }
3698       *p++ = clearWBit(
3699              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
3700                                  vregEnc3210(i->Ain.Sse32Fx4.src) ));
3701       *p++ = 0x0F;
3702       switch (i->Ain.Sse32Fx4.op) {
3703          case Asse_ADDF:   *p++ = 0x58; break;
3704          case Asse_DIVF:   *p++ = 0x5E; break;
3705          case Asse_MAXF:   *p++ = 0x5F; break;
3706          case Asse_MINF:   *p++ = 0x5D; break;
3707          case Asse_MULF:   *p++ = 0x59; break;
3708          case Asse_RCPF:   *p++ = 0x53; break;
3709          case Asse_RSQRTF: *p++ = 0x52; break;
3710          case Asse_SQRTF:  *p++ = 0x51; break;
3711          case Asse_I2F:    *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx
3712          case Asse_F2I:    *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx
3713          case Asse_SUBF:   *p++ = 0x5C; break;
3714          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3715          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3716          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3717          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3718          default: goto bad;
3719       }
3720       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst),
3721                                vregEnc3210(i->Ain.Sse32Fx4.src) );
3722       if (xtra & 0x100)
3723          *p++ = toUChar(xtra & 0xFF);
3724       goto done;
3725 
3726    case Ain_Sse64Fx2:
3727       xtra = 0;
3728       *p++ = 0x66;
3729       *p++ = clearWBit(
3730              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
3731                                  vregEnc3210(i->Ain.Sse64Fx2.src) ));
3732       *p++ = 0x0F;
3733       switch (i->Ain.Sse64Fx2.op) {
3734          case Asse_ADDF:   *p++ = 0x58; break;
3735          case Asse_DIVF:   *p++ = 0x5E; break;
3736          case Asse_MAXF:   *p++ = 0x5F; break;
3737          case Asse_MINF:   *p++ = 0x5D; break;
3738          case Asse_MULF:   *p++ = 0x59; break;
3739          case Asse_SQRTF:  *p++ = 0x51; break;
3740          case Asse_SUBF:   *p++ = 0x5C; break;
3741          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3742          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3743          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3744          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3745          default: goto bad;
3746       }
3747       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
3748                                vregEnc3210(i->Ain.Sse64Fx2.src) );
3749       if (xtra & 0x100)
3750          *p++ = toUChar(xtra & 0xFF);
3751       goto done;
3752 
3753    case Ain_Sse32FLo:
3754       xtra = 0;
3755       *p++ = 0xF3;
3756       *p++ = clearWBit(
3757              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
3758                                  vregEnc3210(i->Ain.Sse32FLo.src) ));
3759       *p++ = 0x0F;
3760       switch (i->Ain.Sse32FLo.op) {
3761          case Asse_ADDF:   *p++ = 0x58; break;
3762          case Asse_DIVF:   *p++ = 0x5E; break;
3763          case Asse_MAXF:   *p++ = 0x5F; break;
3764          case Asse_MINF:   *p++ = 0x5D; break;
3765          case Asse_MULF:   *p++ = 0x59; break;
3766          case Asse_RCPF:   *p++ = 0x53; break;
3767          case Asse_RSQRTF: *p++ = 0x52; break;
3768          case Asse_SQRTF:  *p++ = 0x51; break;
3769          case Asse_SUBF:   *p++ = 0x5C; break;
3770          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3771          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3772          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3773          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3774          default: goto bad;
3775       }
3776       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
3777                                vregEnc3210(i->Ain.Sse32FLo.src) );
3778       if (xtra & 0x100)
3779          *p++ = toUChar(xtra & 0xFF);
3780       goto done;
3781 
3782    case Ain_Sse64FLo:
3783       xtra = 0;
3784       *p++ = 0xF2;
3785       *p++ = clearWBit(
3786              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
3787                                  vregEnc3210(i->Ain.Sse64FLo.src) ));
3788       *p++ = 0x0F;
3789       switch (i->Ain.Sse64FLo.op) {
3790          case Asse_ADDF:   *p++ = 0x58; break;
3791          case Asse_DIVF:   *p++ = 0x5E; break;
3792          case Asse_MAXF:   *p++ = 0x5F; break;
3793          case Asse_MINF:   *p++ = 0x5D; break;
3794          case Asse_MULF:   *p++ = 0x59; break;
3795          case Asse_SQRTF:  *p++ = 0x51; break;
3796          case Asse_SUBF:   *p++ = 0x5C; break;
3797          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3798          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3799          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3800          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3801          default: goto bad;
3802       }
3803       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
3804                                vregEnc3210(i->Ain.Sse64FLo.src) );
3805       if (xtra & 0x100)
3806          *p++ = toUChar(xtra & 0xFF);
3807       goto done;
3808 
3809    case Ain_SseReRg:
3810 #     define XX(_n) *p++ = (_n)
3811 
3812       rex = clearWBit(
3813             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
3814                                 vregEnc3210(i->Ain.SseReRg.src) ));
3815 
3816       switch (i->Ain.SseReRg.op) {
3817          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3818          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
3819          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
3820          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
3821          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
3822          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3823          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3824          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3825          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3826          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3827          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3828          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3829          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3830          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3831          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3832          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3833          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3834          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3835          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3836          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3837          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3838          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3839          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3840          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3841          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3842          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3843          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3844          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3845          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3846          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3847          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3848          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3849          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3850          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3851          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3852          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3853          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3854          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3855          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3856          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3857          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3858          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3859          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3860          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3861          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3862          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3863          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3864          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3865          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3866          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3867          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3868          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3869          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3870          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3871          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3872          case Asse_PSHUFB:   XX(0x66); XX(rex);
3873                              XX(0x0F); XX(0x38); XX(0x00); break;
3874          default: goto bad;
3875       }
3876       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
3877                                vregEnc3210(i->Ain.SseReRg.src) );
3878 #     undef XX
3879       goto done;
3880 
3881    case Ain_SseCMov:
3882       /* jmp fwds if !condition */
3883       *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3884       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3885       ptmp = p;
3886 
3887       /* movaps %src, %dst */
3888       *p++ = clearWBit(
3889              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
3890                                  vregEnc3210(i->Ain.SseCMov.src) ));
3891       *p++ = 0x0F;
3892       *p++ = 0x28;
3893       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
3894                                vregEnc3210(i->Ain.SseCMov.src) );
3895 
3896       /* Fill in the jump offset. */
3897       *(ptmp-1) = toUChar(p - ptmp);
3898       goto done;
3899 
3900    case Ain_SseShuf:
3901       *p++ = 0x66;
3902       *p++ = clearWBit(
3903              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
3904                                  vregEnc3210(i->Ain.SseShuf.src) ));
3905       *p++ = 0x0F;
3906       *p++ = 0x70;
3907       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
3908                                vregEnc3210(i->Ain.SseShuf.src) );
3909       *p++ = (UChar)(i->Ain.SseShuf.order);
3910       goto done;
3911 
3912    case Ain_SseShiftN: {
3913       opc         = 0; // invalid
3914       subopc_imm  = 0; // invalid
3915       UInt limit  = 0;
3916       UInt shiftImm = i->Ain.SseShiftN.shiftBits;
3917       switch (i->Ain.SseShiftN.op) {
3918          case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
3919          case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
3920          case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
3921          case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
3922          case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
3923          case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
3924          case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
3925          case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
3926          case Asse_SHL128:
3927             if ((shiftImm & 7) != 0) goto bad;
3928             shiftImm >>= 3;
3929             limit = 15; opc = 0x73; subopc_imm = 7;
3930             break;
3931          case Asse_SHR128:
3932             if ((shiftImm & 7) != 0) goto bad;
3933             shiftImm >>= 3;
3934             limit = 15; opc = 0x73; subopc_imm = 3;
3935             break;
3936          default:
3937             // This should never happen .. SSE2 only offers the above 10 insns
3938             // for the "shift with immediate" case
3939             goto bad;
3940       }
3941       vassert(limit > 0 && opc > 0 && subopc_imm > 0);
3942       if (shiftImm > limit) goto bad;
3943       *p++ = 0x66;
3944       *p++ = clearWBit(
3945              rexAMode_R_enc_enc( subopc_imm,
3946                                  vregEnc3210(i->Ain.SseShiftN.dst) ));
3947       *p++ = 0x0F;
3948       *p++ = opc;
3949       p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
3950       *p++ = shiftImm;
3951       goto done;
3952    }
3953 
3954    case Ain_SseMOVQ: {
3955       Bool toXMM = i->Ain.SseMOVQ.toXMM;
3956       HReg gpr = i->Ain.SseMOVQ.gpr;
3957       HReg xmm = i->Ain.SseMOVQ.xmm;
3958       *p++ = 0x66;
3959       *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) );
3960       *p++ = 0x0F;
3961       *p++ = toXMM ? 0x6E : 0x7E;
3962       p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) );
3963       goto done;
3964    }
3965 
3966    //uu case Ain_AvxLdSt: {
3967    //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
3968    //uu                           i->Ain.AvxLdSt.addr );
3969    //uu    p = emitVexPrefix(p, vex);
3970    //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
3971    //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
3972    //uu      goto done;
3973    //uu }
3974 
3975    case Ain_EvCheck: {
3976       /* We generate:
3977             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
3978             (2 bytes)  jns  nofail     expected taken
3979             (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
3980             nofail:
3981       */
3982       /* This is heavily asserted re instruction lengths.  It needs to
3983          be.  If we get given unexpected forms of .amCounter or
3984          .amFailAddr -- basically, anything that's not of the form
3985          uimm7(%rbp) -- they are likely to fail. */
3986       /* Note also that after the decl we must be very careful not to
3987          read the carry flag, else we get a partial flags stall.
3988          js/jns avoids that, though. */
3989       UChar* p0 = p;
3990       /* ---  decl 8(%rbp) --- */
3991       /* Need to compute the REX byte for the decl in order to prove
3992          that we don't need it, since this is a 32-bit inc and all
3993          registers involved in the amode are < r8.  "1" because
3994          there's no register in this encoding; instead the register
3995          field is used as a sub opcode.  The encoding for "decl r/m32"
3996          is FF /1, hence the "1". */
3997       rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
3998       if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
3999       *p++ = 0xFF;
4000       p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
4001       vassert(p - p0 == 3);
4002       /* --- jns nofail --- */
4003       *p++ = 0x79;
4004       *p++ = 0x03; /* need to check this 0x03 after the next insn */
4005       vassert(p - p0 == 5);
4006       /* --- jmp* 0(%rbp) --- */
4007       /* Once again, verify we don't need REX.  The encoding is FF /4.
4008          We don't need REX.W since by default FF /4 in 64-bit mode
4009          implies a 64 bit load. */
4010       rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
4011       if (rex != 0x40) goto bad;
4012       *p++ = 0xFF;
4013       p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
4014       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
4015       /* And crosscheck .. */
4016       vassert(evCheckSzB_AMD64() == 8);
4017       goto done;
4018    }
4019 
4020    case Ain_ProfInc: {
4021       /* We generate   movabsq $0, %r11
4022                        incq (%r11)
4023          in the expectation that a later call to LibVEX_patchProfCtr
4024          will be used to fill in the immediate field once the right
4025          value is known.
4026          49 BB 00 00 00 00 00 00 00 00
4027          49 FF 03
4028       */
4029       *p++ = 0x49; *p++ = 0xBB;
4030       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
4031       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
4032       *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
4033       /* Tell the caller .. */
4034       vassert(!(*is_profInc));
4035       *is_profInc = True;
4036       goto done;
4037    }
4038 
4039    default:
4040       goto bad;
4041    }
4042 
4043   bad:
4044    ppAMD64Instr(i, mode64);
4045    vpanic("emit_AMD64Instr");
4046    /*NOTREACHED*/
4047 
4048   done:
4049    vassert(p - &buf[0] <= 64);
4050    return p - &buf[0];
4051 }
4052 
4053 
4054 /* How big is an event check?  See case for Ain_EvCheck in
4055    emit_AMD64Instr just above.  That crosschecks what this returns, so
4056    we can tell if we're inconsistent. */
evCheckSzB_AMD64(void)4057 Int evCheckSzB_AMD64 (void)
4058 {
4059    return 8;
4060 }
4061 
4062 
4063 /* NB: what goes on here has to be very closely coordinated with the
4064    emitInstr case for XDirect, above. */
chainXDirect_AMD64(VexEndness endness_host,void * place_to_chain,const void * disp_cp_chain_me_EXPECTED,const void * place_to_jump_to)4065 VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
4066                                    void* place_to_chain,
4067                                    const void* disp_cp_chain_me_EXPECTED,
4068                                    const void* place_to_jump_to )
4069 {
4070    vassert(endness_host == VexEndnessLE);
4071 
4072    /* What we're expecting to see is:
4073         movabsq $disp_cp_chain_me_EXPECTED, %r11
4074         call *%r11
4075       viz
4076         49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
4077         41 FF D3
4078    */
4079    UChar* p = (UChar*)place_to_chain;
4080    vassert(p[0] == 0x49);
4081    vassert(p[1] == 0xBB);
4082    vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
4083    vassert(p[10] == 0x41);
4084    vassert(p[11] == 0xFF);
4085    vassert(p[12] == 0xD3);
4086    /* And what we want to change it to is either:
4087         (general case):
4088           movabsq $place_to_jump_to, %r11
4089           jmpq *%r11
4090         viz
4091           49 BB <8 bytes value == place_to_jump_to>
4092           41 FF E3
4093         So it's the same length (convenient, huh) and we don't
4094         need to change all the bits.
4095       ---OR---
4096         in the case where the displacement falls within 32 bits
4097           jmpq disp32   where disp32 is relative to the next insn
4098           ud2; ud2; ud2; ud2
4099         viz
4100           E9 <4 bytes == disp32>
4101           0F 0B 0F 0B 0F 0B 0F 0B
4102 
4103       In both cases the replacement has the same length as the original.
4104       To remain sane & verifiable,
4105       (1) limit the displacement for the short form to
4106           (say) +/- one billion, so as to avoid wraparound
4107           off-by-ones
4108       (2) even if the short form is applicable, once every (say)
4109           1024 times use the long form anyway, so as to maintain
4110           verifiability
4111    */
4112    /* This is the delta we need to put into a JMP d32 insn.  It's
4113       relative to the start of the next insn, hence the -5.  */
4114    Long delta   = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
4115    Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
4116 
4117    static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
4118    if (shortOK) {
4119       shortCTR++; // thread safety bleh
4120       if (0 == (shortCTR & 0x3FF)) {
4121          shortOK = False;
4122          if (0)
4123             vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
4124                        "using long jmp\n", shortCTR);
4125       }
4126    }
4127 
4128    /* And make the modifications. */
4129    if (shortOK) {
4130       p[0]  = 0xE9;
4131       write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
4132       p[5]  = 0x0F; p[6]  = 0x0B;
4133       p[7]  = 0x0F; p[8]  = 0x0B;
4134       p[9]  = 0x0F; p[10] = 0x0B;
4135       p[11] = 0x0F; p[12] = 0x0B;
4136       /* sanity check on the delta -- top 32 are all 0 or all 1 */
4137       delta >>= 32;
4138       vassert(delta == 0LL || delta == -1LL);
4139    } else {
4140       /* Minimal modifications from the starting sequence. */
4141       write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
4142       p[12] = 0xE3;
4143    }
4144    VexInvalRange vir = { (HWord)place_to_chain, 13 };
4145    return vir;
4146 }
4147 
4148 
4149 /* NB: what goes on here has to be very closely coordinated with the
4150    emitInstr case for XDirect, above. */
unchainXDirect_AMD64(VexEndness endness_host,void * place_to_unchain,const void * place_to_jump_to_EXPECTED,const void * disp_cp_chain_me)4151 VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
4152                                      void* place_to_unchain,
4153                                      const void* place_to_jump_to_EXPECTED,
4154                                      const void* disp_cp_chain_me )
4155 {
4156    vassert(endness_host == VexEndnessLE);
4157 
4158    /* What we're expecting to see is either:
4159         (general case)
4160           movabsq $place_to_jump_to_EXPECTED, %r11
4161           jmpq *%r11
4162         viz
4163           49 BB <8 bytes value == place_to_jump_to_EXPECTED>
4164           41 FF E3
4165       ---OR---
4166         in the case where the displacement falls within 32 bits
4167           jmpq d32
4168           ud2; ud2; ud2; ud2
4169         viz
4170           E9 <4 bytes == disp32>
4171           0F 0B 0F 0B 0F 0B 0F 0B
4172    */
4173    UChar* p     = (UChar*)place_to_unchain;
4174    Bool   valid = False;
4175    if (p[0] == 0x49 && p[1] == 0xBB
4176        && read_misaligned_ULong_LE(&p[2])
4177           == (ULong)(Addr)place_to_jump_to_EXPECTED
4178        && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
4179       /* it's the long form */
4180       valid = True;
4181    }
4182    else
4183    if (p[0] == 0xE9
4184        && p[5]  == 0x0F && p[6]  == 0x0B
4185        && p[7]  == 0x0F && p[8]  == 0x0B
4186        && p[9]  == 0x0F && p[10] == 0x0B
4187        && p[11] == 0x0F && p[12] == 0x0B) {
4188       /* It's the short form.  Check the offset is right. */
4189       Int  s32 = (Int)read_misaligned_UInt_LE(&p[1]);
4190       Long s64 = (Long)s32;
4191       if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
4192          valid = True;
4193          if (0)
4194             vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
4195       }
4196    }
4197    vassert(valid);
4198    /* And what we want to change it to is:
4199         movabsq $disp_cp_chain_me, %r11
4200         call *%r11
4201       viz
4202         49 BB <8 bytes value == disp_cp_chain_me>
4203         41 FF D3
4204       So it's the same length (convenient, huh).
4205    */
4206    p[0] = 0x49;
4207    p[1] = 0xBB;
4208    write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
4209    p[10] = 0x41;
4210    p[11] = 0xFF;
4211    p[12] = 0xD3;
4212    VexInvalRange vir = { (HWord)place_to_unchain, 13 };
4213    return vir;
4214 }
4215 
4216 
4217 /* Patch the counter address into a profile inc point, as previously
4218    created by the Ain_ProfInc case for emit_AMD64Instr. */
patchProfInc_AMD64(VexEndness endness_host,void * place_to_patch,const ULong * location_of_counter)4219 VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
4220                                    void*  place_to_patch,
4221                                    const ULong* location_of_counter )
4222 {
4223    vassert(endness_host == VexEndnessLE);
4224    vassert(sizeof(ULong*) == 8);
4225    UChar* p = (UChar*)place_to_patch;
4226    vassert(p[0] == 0x49);
4227    vassert(p[1] == 0xBB);
4228    vassert(p[2] == 0x00);
4229    vassert(p[3] == 0x00);
4230    vassert(p[4] == 0x00);
4231    vassert(p[5] == 0x00);
4232    vassert(p[6] == 0x00);
4233    vassert(p[7] == 0x00);
4234    vassert(p[8] == 0x00);
4235    vassert(p[9] == 0x00);
4236    vassert(p[10] == 0x49);
4237    vassert(p[11] == 0xFF);
4238    vassert(p[12] == 0x03);
4239    ULong imm64 = (ULong)(Addr)location_of_counter;
4240    p[2] = imm64 & 0xFF; imm64 >>= 8;
4241    p[3] = imm64 & 0xFF; imm64 >>= 8;
4242    p[4] = imm64 & 0xFF; imm64 >>= 8;
4243    p[5] = imm64 & 0xFF; imm64 >>= 8;
4244    p[6] = imm64 & 0xFF; imm64 >>= 8;
4245    p[7] = imm64 & 0xFF; imm64 >>= 8;
4246    p[8] = imm64 & 0xFF; imm64 >>= 8;
4247    p[9] = imm64 & 0xFF; imm64 >>= 8;
4248    VexInvalRange vir = { (HWord)place_to_patch, 13 };
4249    return vir;
4250 }
4251 
4252 
4253 /*---------------------------------------------------------------*/
4254 /*--- end                                   host_amd64_defs.c ---*/
4255 /*---------------------------------------------------------------*/
4256