1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                             guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2017 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 #include "libvex_basictypes.h"
37 #include "libvex_emnote.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
41 
42 #include "main_util.h"
43 #include "main_globals.h"
44 #include "guest_generic_bb_to_IR.h"
45 #include "guest_amd64_defs.h"
46 #include "guest_generic_x87.h"
47 
48 
49 /* This file contains helper functions for amd64 guest code.
50    Calls to these functions are generated by the back end.
51    These calls are of course in the host machine code and
52    this file will be compiled to host machine code, so that
53    all makes sense.
54 
55    Only change the signatures of these helper functions very
56    carefully.  If you change the signature here, you'll have to change
57    the parameters passed to it in the IR calls constructed by
58    guest-amd64/toIR.c.
59 
60    The convention used is that all functions called from generated
61    code are named amd64g_<something>, and any function whose name lacks
62    that prefix is not called from generated code.  Note that some
63    LibVEX_* functions can however be called by VEX's client, but that
64    is not the same as calling them from VEX-generated code.
65 */
66 
67 
68 /* Set to 1 to get detailed profiling info about use of the flag
69    machinery. */
70 #define PROFILE_RFLAGS 0
71 
72 
73 /*---------------------------------------------------------------*/
74 /*--- %rflags run-time helpers.                               ---*/
75 /*---------------------------------------------------------------*/
76 
77 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
78    after imulq/mulq. */
79 
80 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
81 {
82    const Long halfMask = 0xFFFFFFFFLL;
83    ULong u0, v0, w0;
84     Long u1, v1, w1, w2, t;
85    u0   = u & halfMask;
86    u1   = u >> 32;
87    v0   = v & halfMask;
88    v1   = v >> 32;
89    w0   = u0 * v0;
90    t    = u1 * v0 + (w0 >> 32);
91    w1   = t & halfMask;
92    w2   = t >> 32;
93    w1   = u0 * v1 + w1;
94    *rHi = u1 * v1 + w2 + (w1 >> 32);
95    *rLo = (Long)((ULong)u * (ULong)v);
96 }
97 
98 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
99 {
100    const ULong halfMask = 0xFFFFFFFFULL;
101    ULong u0, v0, w0;
102    ULong u1, v1, w1, w2, t;
103    u0   = u & halfMask;
104    u1   = u >> 32;
105    v0   = v & halfMask;
106    v1   = v >> 32;
107    w0   = u0 * v0;
108    t    = u1 * v0 + (w0 >> 32);
109    w1   = t & halfMask;
110    w2   = t >> 32;
111    w1   = u0 * v1 + w1;
112    *rHi = u1 * v1 + w2 + (w1 >> 32);
113    *rLo = u * v;
114 }
115 
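/* Illustrative cross-check, not part of the helper set: assuming the
   compiler provides the non-standard __uint128_t type, the 32-bit-halves
   decomposition above can be verified against a native 128-bit multiply:

      ULong hi, lo;
      mullU64(0xDEADBEEFCAFEBABEULL, 0x0123456789ABCDEFULL, &hi, &lo);
      __uint128_t ref = (__uint128_t)0xDEADBEEFCAFEBABEULL
                        * (__uint128_t)0x0123456789ABCDEFULL;
      // expect: hi == (ULong)(ref >> 64) && lo == (ULong)ref
*/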
116 
117 static const UChar parity_table[256] = {
118     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
124     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
126     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
140     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
142     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
148     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
149     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
150 };
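
/* The x86 parity flag (PF) is set when the least significant byte of a
   result contains an even number of 1 bits, so the table above maps a
   byte value to AMD64G_CC_MASK_P or 0 accordingly.  For example,
   parity_table[0x03] == AMD64G_CC_MASK_P (two bits set), whereas
   parity_table[0x07] == 0 (three bits set). */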
151 
152 /* generalised left-shifter */
153 static inline Long lshift ( Long x, Int n )
154 {
155    if (n >= 0)
156       return (ULong)x << n;
157    else
158       return x >> (-n);
159 }
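
/* For example, lshift(x, 3) is x << 3, while lshift(x, -3) is x >> 3 on
   the signed Long argument (in practice an arithmetic shift).  The flag
   thunks below use this as lshift(res, 8 - DATA_BITS) & 0x80 to move the
   sign bit of an N-bit result into bit 7, the SF position; for
   DATA_BITS == 64 that amounts to a right shift by 56. */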
160 
161 /* identity on ULong */
162 static inline ULong idULong ( ULong x )
163 {
164    return x;
165 }
166 
167 
168 #define PREAMBLE(__data_bits)					\
169    /* const */ ULong DATA_MASK 					\
170       = __data_bits==8                                          \
171            ? 0xFFULL 					        \
172            : (__data_bits==16                                   \
173                 ? 0xFFFFULL 		                        \
174                 : (__data_bits==32                              \
175                      ? 0xFFFFFFFFULL                            \
176                      : 0xFFFFFFFFFFFFFFFFULL));                 \
177    /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
178    /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
179    /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
180    /* const */ ULong CC_NDEP = cc_ndep_formal;			\
181    /* Four bogus assignments, which hopefully gcc can     */	\
182    /* optimise away, and which stop it complaining about  */	\
183    /* unused variables.                                   */	\
184    SIGN_MASK = SIGN_MASK;					\
185    DATA_MASK = DATA_MASK;					\
186    CC_DEP2 = CC_DEP2;						\
187    CC_NDEP = CC_NDEP;
188 
189 
190 /*-------------------------------------------------------------*/
191 
192 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
193 {								\
194    PREAMBLE(DATA_BITS);						\
195    { ULong cf, pf, af, zf, sf, of;				\
196      ULong argL, argR, res;					\
197      argL = CC_DEP1;						\
198      argR = CC_DEP2;						\
199      res  = argL + argR;					\
200      cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
201      pf = parity_table[(UChar)res];				\
202      af = (res ^ argL ^ argR) & 0x10;				\
203      zf = ((DATA_UTYPE)res == 0) << 6;				\
204      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
205      of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
206                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
207      return cf | pf | af | zf | sf | of;			\
208    }								\
209 }
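
/* Worked example for the 8-bit case (DATA_UTYPE == UChar): argL = 0x7F,
   argR = 0x01 gives res = 0x80.  Then cf = 0 (no unsigned wraparound),
   pf = 0 (0x80 has an odd number of set bits), af = 0x10 (carry out of
   bit 3), zf = 0, sf = 0x80 (bit 7 of the result), and of = 0x800
   (positive + positive produced a negative result).  The helper hence
   returns AMD64G_CC_MASK_A | AMD64G_CC_MASK_S | AMD64G_CC_MASK_O, which
   is what adding 1 to an 8-bit register holding 0x7F produces on real
   hardware. */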
210 
211 /*-------------------------------------------------------------*/
212 
213 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
214 {								\
215    PREAMBLE(DATA_BITS);						\
216    { ULong cf, pf, af, zf, sf, of;				\
217      ULong argL, argR, res;					\
218      argL = CC_DEP1;						\
219      argR = CC_DEP2;						\
220      res  = argL - argR;					\
221      cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
222      pf = parity_table[(UChar)res];				\
223      af = (res ^ argL ^ argR) & 0x10;				\
224      zf = ((DATA_UTYPE)res == 0) << 6;				\
225      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
226      of = lshift((argL ^ argR) & (argL ^ res),	 		\
227                  12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
228      return cf | pf | af | zf | sf | of;			\
229    }								\
230 }
231 
232 /*-------------------------------------------------------------*/
233 
234 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
235 {								\
236    PREAMBLE(DATA_BITS);						\
237    { ULong cf, pf, af, zf, sf, of;				\
238      ULong argL, argR, oldC, res;		 		\
239      oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
240      argL = CC_DEP1;						\
241      argR = CC_DEP2 ^ oldC;	       				\
242      res  = (argL + argR) + oldC;				\
243      if (oldC)							\
244         cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
245      else							\
246         cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
247      pf = parity_table[(UChar)res];				\
248      af = (res ^ argL ^ argR) & 0x10;				\
249      zf = ((DATA_UTYPE)res == 0) << 6;				\
250      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
251      of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
252                   12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
253      return cf | pf | af | zf | sf | of;			\
254    }								\
255 }
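
/* Note that in ACTIONS_ADC above and ACTIONS_SBB below, CC_DEP2 is not
   the second operand itself: the IR that sets up these thunks stores
   argR ^ oldC in DEP2 and parks the old carry in NDEP, so the helper
   XORs oldC back out to recover argR before recomputing the sum or
   difference. */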
256 
257 /*-------------------------------------------------------------*/
258 
259 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
260 {								\
261    PREAMBLE(DATA_BITS);						\
262    { ULong cf, pf, af, zf, sf, of;				\
263      ULong argL, argR, oldC, res;	       			\
264      oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
265      argL = CC_DEP1;						\
266      argR = CC_DEP2 ^ oldC;	       				\
267      res  = (argL - argR) - oldC;				\
268      if (oldC)							\
269         cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
270      else							\
271         cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
272      pf = parity_table[(UChar)res];				\
273      af = (res ^ argL ^ argR) & 0x10;				\
274      zf = ((DATA_UTYPE)res == 0) << 6;				\
275      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
276      of = lshift((argL ^ argR) & (argL ^ res), 			\
277                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
278      return cf | pf | af | zf | sf | of;			\
279    }								\
280 }
281 
282 /*-------------------------------------------------------------*/
283 
284 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
285 {								\
286    PREAMBLE(DATA_BITS);						\
287    { ULong cf, pf, af, zf, sf, of;				\
288      cf = 0;							\
289      pf = parity_table[(UChar)CC_DEP1];				\
290      af = 0;							\
291      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
292      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
293      of = 0;							\
294      return cf | pf | af | zf | sf | of;			\
295    }								\
296 }
297 
298 /*-------------------------------------------------------------*/
299 
300 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
301 {								\
302    PREAMBLE(DATA_BITS);						\
303    { ULong cf, pf, af, zf, sf, of;				\
304      ULong argL, argR, res;					\
305      res  = CC_DEP1;						\
306      argL = res - 1;						\
307      argR = 1;							\
308      cf = CC_NDEP & AMD64G_CC_MASK_C;				\
309      pf = parity_table[(UChar)res];				\
310      af = (res ^ argL ^ argR) & 0x10;				\
311      zf = ((DATA_UTYPE)res == 0) << 6;				\
312      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
313      of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
314      return cf | pf | af | zf | sf | of;			\
315    }								\
316 }
317 
318 /*-------------------------------------------------------------*/
319 
320 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
321 {								\
322    PREAMBLE(DATA_BITS);						\
323    { ULong cf, pf, af, zf, sf, of;				\
324      ULong argL, argR, res;					\
325      res  = CC_DEP1;						\
326      argL = res + 1;						\
327      argR = 1;							\
328      cf = CC_NDEP & AMD64G_CC_MASK_C;				\
329      pf = parity_table[(UChar)res];				\
330      af = (res ^ argL ^ argR) & 0x10;				\
331      zf = ((DATA_UTYPE)res == 0) << 6;				\
332      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
333      of = ((res & DATA_MASK) 					\
334           == ((ULong)SIGN_MASK - 1)) << 11;			\
335      return cf | pf | af | zf | sf | of;			\
336    }								\
337 }
338 
339 /*-------------------------------------------------------------*/
340 
341 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
342 {								\
343    PREAMBLE(DATA_BITS);						\
344    { ULong cf, pf, af, zf, sf, of;				\
345      cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
346      pf = parity_table[(UChar)CC_DEP1];				\
347      af = 0; /* undefined */					\
348      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
349      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
350      /* of is defined if shift count == 1 */			\
351      of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
352           & AMD64G_CC_MASK_O;					\
353      return cf | pf | af | zf | sf | of;			\
354    }								\
355 }
356 
357 /*-------------------------------------------------------------*/
358 
359 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
360 {								\
361    PREAMBLE(DATA_BITS);  					\
362    { ULong cf, pf, af, zf, sf, of;				\
363      cf = CC_DEP2 & 1;						\
364      pf = parity_table[(UChar)CC_DEP1];				\
365      af = 0; /* undefined */					\
366      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
367      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
368      /* of is defined if shift count == 1 */			\
369      of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
370           & AMD64G_CC_MASK_O;					\
371      return cf | pf | af | zf | sf | of;			\
372    }								\
373 }
374 
375 /*-------------------------------------------------------------*/
376 
377 /* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
378 /* DEP1 = result, NDEP = old flags */
379 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
380 {								\
381    PREAMBLE(DATA_BITS);						\
382    { ULong fl 							\
383         = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
384           | (AMD64G_CC_MASK_C & CC_DEP1)			\
385           | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
386                                       11-(DATA_BITS-1)) 	\
387                      ^ lshift(CC_DEP1, 11)));			\
388      return fl;							\
389    }								\
390 }
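
/* For example, an 8-bit ROL of 0x80 by 1 yields the result 0x01, so
   CC_DEP1 = 0x01: the expression above sets CF = lsb(0x01) = 1 and
   OF = msb(0x01) ^ lsb(0x01) = 0 ^ 1 = 1, while every other flag bit is
   taken unchanged from the old flags held in CC_NDEP. */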
391 
392 /*-------------------------------------------------------------*/
393 
394 /* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
395 /* DEP1 = result, NDEP = old flags */
396 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
397 {								\
398    PREAMBLE(DATA_BITS);						\
399    { ULong fl 							\
400         = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
401           | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
402           | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
403                                       11-(DATA_BITS-1)) 	\
404                      ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
405      return fl;							\
406    }								\
407 }
408 
409 /*-------------------------------------------------------------*/
410 
411 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
412                                 DATA_U2TYPE, NARROWto2U)        \
413 {                                                               \
414    PREAMBLE(DATA_BITS);                                         \
415    { ULong cf, pf, af, zf, sf, of;                              \
416      DATA_UTYPE  hi;                                            \
417      DATA_UTYPE  lo                                             \
418         = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
419                      * ((DATA_UTYPE)CC_DEP2) );                 \
420      DATA_U2TYPE rr                                             \
421         = NARROWto2U(                                           \
422              ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
423              * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
424      hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
425      cf = (hi != 0);                                            \
426      pf = parity_table[(UChar)lo];                              \
427      af = 0; /* undefined */                                    \
428      zf = (lo == 0) << 6;                                       \
429      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
430      of = cf << 11;                                             \
431      return cf | pf | af | zf | sf | of;                        \
432    }								\
433 }
434 
435 /*-------------------------------------------------------------*/
436 
437 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
438                                 DATA_S2TYPE, NARROWto2S)        \
439 {                                                               \
440    PREAMBLE(DATA_BITS);                                         \
441    { ULong cf, pf, af, zf, sf, of;                              \
442      DATA_STYPE  hi;                                            \
443      DATA_STYPE  lo                                             \
444         = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
445                      * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
446      DATA_S2TYPE rr                                             \
447         = NARROWto2S(                                           \
448              ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
449              * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
450      hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
451      cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
452      pf = parity_table[(UChar)lo];                              \
453      af = 0; /* undefined */                                    \
454      zf = (lo == 0) << 6;                                       \
455      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
456      of = cf << 11;                                             \
457      return cf | pf | af | zf | sf | of;                        \
458    }								\
459 }
460 
461 /*-------------------------------------------------------------*/
462 
463 #define ACTIONS_UMULQ                                           \
464 {                                                               \
465    PREAMBLE(64);                                                \
466    { ULong cf, pf, af, zf, sf, of;                              \
467      ULong lo, hi;                                              \
468      mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
469      cf = (hi != 0);                                            \
470      pf = parity_table[(UChar)lo];                              \
471      af = 0; /* undefined */                                    \
472      zf = (lo == 0) << 6;                                       \
473      sf = lshift(lo, 8 - 64) & 0x80;                            \
474      of = cf << 11;                                             \
475      return cf | pf | af | zf | sf | of;                        \
476    }								\
477 }
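
/* In the 64-bit unsigned case CF and OF are set exactly when the high
   half of the 128-bit product is nonzero, i.e. when the full product no
   longer fits in 64 bits: for instance 0x100000000 * 0x100000000 gives
   lo == 0 and hi == 1, hence CF == OF == 1. */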
478 
479 /*-------------------------------------------------------------*/
480 
481 #define ACTIONS_SMULQ                                           \
482 {                                                               \
483    PREAMBLE(64);                                                \
484    { ULong cf, pf, af, zf, sf, of;                              \
485      Long lo, hi;                                               \
486      mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
487      cf = (hi != (lo >>/*s*/ (64-1)));                          \
488      pf = parity_table[(UChar)lo];                              \
489      af = 0; /* undefined */                                    \
490      zf = (lo == 0) << 6;                                       \
491      sf = lshift(lo, 8 - 64) & 0x80;                            \
492      of = cf << 11;                                             \
493      return cf | pf | af | zf | sf | of;                        \
494    }								\
495 }
496 
497 /*-------------------------------------------------------------*/
498 
499 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
500 {								\
501    PREAMBLE(DATA_BITS);						\
502    { ULong cf, pf, af, zf, sf, of;				\
503      cf = 0;							\
504      pf = 0;							\
505      af = 0;							\
506      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
507      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
508      of = 0;							\
509      return cf | pf | af | zf | sf | of;			\
510    }								\
511 }
512 
513 /*-------------------------------------------------------------*/
514 
515 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
516 {								\
517    PREAMBLE(DATA_BITS);						\
518    { ULong cf, pf, af, zf, sf, of;				\
519      cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
520      pf = 0;							\
521      af = 0;							\
522      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
523      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
524      of = 0;							\
525      return cf | pf | af | zf | sf | of;			\
526    }								\
527 }
528 
529 /*-------------------------------------------------------------*/
530 
531 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
532 {								\
533    PREAMBLE(DATA_BITS);						\
534    { Long cf, pf, af, zf, sf, of;				\
535      cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
536      pf = 0;							\
537      af = 0;							\
538      zf = 0;							\
539      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
540      of = 0;							\
541      return cf | pf | af | zf | sf | of;			\
542    }								\
543 }
544 
545 /*-------------------------------------------------------------*/
546 
547 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
548 {								\
549    PREAMBLE(DATA_BITS);						\
550    { ULong cf, pf, af, zf, sf, of;				\
551      cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
552      pf = 0;							\
553      af = 0;							\
554      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
555      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
556      of = 0;							\
557      return cf | pf | af | zf | sf | of;			\
558    }								\
559 }
560 
561 /*-------------------------------------------------------------*/
562 
563 #define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME)		\
564 {								\
565    PREAMBLE(DATA_BITS);						\
566    { ULong ocf;	/* o or c */					\
567      ULong argL, argR, oldOC, res;				\
568      oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1;	\
569      argL  = CC_DEP1;						\
570      argR  = CC_DEP2 ^ oldOC;					\
571      res   = (argL + argR) + oldOC;				\
572      if (oldOC)							\
573         ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
574      else							\
575         ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
576      return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME)		\
577             | (ocf << AMD64G_CC_SHIFT_##FLAGNAME);		\
578    }								\
579 }
580 
581 /*-------------------------------------------------------------*/
582 
583 
584 #if PROFILE_RFLAGS
585 
586 static Bool initted     = False;
587 
588 /* C flag, fast route */
589 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
590 /* C flag, slow route */
591 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
592 /* table for calculate_cond */
593 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
594 /* total entry counts for calc_all, calc_c, calc_cond. */
595 static UInt n_calc_all  = 0;
596 static UInt n_calc_c    = 0;
597 static UInt n_calc_cond = 0;
598 
599 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
600 
601 
602 static void showCounts ( void )
603 {
604    Int op, co;
605    HChar ch;
606    vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
607               n_calc_all, n_calc_cond, n_calc_c);
608 
609    vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
610               "    S   NS    P   NP    L   NL   LE  NLE\n");
611    vex_printf("     -----------------------------------------------------"
612               "----------------------------------------\n");
613    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
614 
615       ch = ' ';
616       if (op > 0 && (op-1) % 4 == 0)
617          ch = 'B';
618       if (op > 0 && (op-1) % 4 == 1)
619          ch = 'W';
620       if (op > 0 && (op-1) % 4 == 2)
621          ch = 'L';
622       if (op > 0 && (op-1) % 4 == 3)
623          ch = 'Q';
624 
625       vex_printf("%2d%c: ", op, ch);
626       vex_printf("%6u ", tabc_slow[op]);
627       vex_printf("%6u ", tabc_fast[op]);
628       for (co = 0; co < 16; co++) {
629          Int n = tab_cond[op][co];
630          if (n >= 1000) {
631             vex_printf(" %3dK", n / 1000);
632          } else
633          if (n >= 0) {
634             vex_printf(" %3d ", n );
635          } else {
636             vex_printf("     ");
637          }
638       }
639       vex_printf("\n");
640    }
641    vex_printf("\n");
642 }
643 
644 static void initCounts ( void )
645 {
646    Int op, co;
647    initted = True;
648    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
649       tabc_fast[op] = tabc_slow[op] = 0;
650       for (co = 0; co < 16; co++)
651          tab_cond[op][co] = 0;
652    }
653 }
654 
655 #endif /* PROFILE_RFLAGS */
656 
657 
658 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
659 /* Calculate all the 6 flags from the supplied thunk parameters.
660    Worker function, not directly called from generated code. */
661 static
662 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
663                                         ULong cc_dep1_formal,
664                                         ULong cc_dep2_formal,
665                                         ULong cc_ndep_formal )
666 {
667    switch (cc_op) {
668       case AMD64G_CC_OP_COPY:
669          return cc_dep1_formal
670                 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
671                    | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
672 
673       case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
674       case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
675       case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
676       case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );
677 
678       case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
679       case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
680       case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
681       case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );
682 
683       case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
684       case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
685       case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
686       case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );
687 
688       case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
689       case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
690       case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
691       case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );
692 
693       case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
694       case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
695       case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
696       case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );
697 
698       case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
699       case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
700       case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
701       case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );
702 
703       case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
704       case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
705       case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
706       case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );
707 
708       case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
709       case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
710       case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
711       case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );
712 
713       case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
714       case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
715       case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
716       case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );
717 
718       case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
719       case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
720       case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
721       case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );
722 
723       case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
724       case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
725       case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
726       case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );
727 
728       case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
729                                                   UShort, toUShort );
730       case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
731                                                   UInt,   toUInt );
732       case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
733                                                   ULong,  idULong );
734 
735       case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;
736 
737       case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
738                                                   Short,  toUShort );
739       case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
740                                                   Int,    toUInt   );
741       case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
742                                                   Long,   idULong );
743 
744       case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;
745 
746       case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
747       case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );
748 
749       case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
750       case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );
751 
752       case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
753       case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );
754 
755       case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
756       case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );
757 
758       case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt,  C );
759       case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
760 
761       case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt,  O );
762       case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
763 
764       default:
765          /* shouldn't really make these calls from generated code */
766          vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
767                     "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
768                     cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
769          vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
770    }
771 }
772 
773 
774 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
775 /* Calculate all the 6 flags from the supplied thunk parameters. */
776 ULong amd64g_calculate_rflags_all ( ULong cc_op,
777                                     ULong cc_dep1,
778                                     ULong cc_dep2,
779                                     ULong cc_ndep )
780 {
781 #  if PROFILE_RFLAGS
782    if (!initted) initCounts();
783    n_calc_all++;
784    if (SHOW_COUNTS_NOW) showCounts();
785 #  endif
786    return
787       amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
788 }
789 
790 
791 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
792 /* Calculate just the carry flag from the supplied thunk parameters. */
793 ULong amd64g_calculate_rflags_c ( ULong cc_op,
794                                   ULong cc_dep1,
795                                   ULong cc_dep2,
796                                   ULong cc_ndep )
797 {
798 #  if PROFILE_RFLAGS
799    if (!initted) initCounts();
800    n_calc_c++;
801    tabc_fast[cc_op]++;
802    if (SHOW_COUNTS_NOW) showCounts();
803 #  endif
804 
805    /* Fast-case some common ones. */
806    switch (cc_op) {
807       case AMD64G_CC_OP_COPY:
808          return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
809       case AMD64G_CC_OP_LOGICQ:
810       case AMD64G_CC_OP_LOGICL:
811       case AMD64G_CC_OP_LOGICW:
812       case AMD64G_CC_OP_LOGICB:
813          return 0;
814 	 //      case AMD64G_CC_OP_SUBL:
815 	 //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
816 	 //                   ? AMD64G_CC_MASK_C : 0;
817 	 //      case AMD64G_CC_OP_SUBW:
818 	 //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
819 	 //                   ? AMD64G_CC_MASK_C : 0;
820 	 //      case AMD64G_CC_OP_SUBB:
821 	 //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
822 	 //                   ? AMD64G_CC_MASK_C : 0;
823 	 //      case AMD64G_CC_OP_INCL:
824 	 //      case AMD64G_CC_OP_DECL:
825 	 //         return cc_ndep & AMD64G_CC_MASK_C;
826       default:
827          break;
828    }
829 
830 #  if PROFILE_RFLAGS
831    tabc_fast[cc_op]--;
832    tabc_slow[cc_op]++;
833 #  endif
834 
835    return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
836           & AMD64G_CC_MASK_C;
837 }
838 
839 
840 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
841 /* returns 1 or 0 */
842 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
843                                    ULong cc_op,
844                                    ULong cc_dep1,
845                                    ULong cc_dep2,
846                                    ULong cc_ndep )
847 {
848    ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
849                                                   cc_dep2, cc_ndep);
850    ULong of,sf,zf,cf,pf;
851    ULong inv = cond & 1;
852 
853 #  if PROFILE_RFLAGS
854    if (!initted) initCounts();
855    tab_cond[cc_op][cond]++;
856    n_calc_cond++;
857    if (SHOW_COUNTS_NOW) showCounts();
858 #  endif
859 
860    switch (cond) {
861       case AMD64CondNO:
862       case AMD64CondO: /* OF == 1 */
863          of = rflags >> AMD64G_CC_SHIFT_O;
864          return 1 & (inv ^ of);
865 
866       case AMD64CondNZ:
867       case AMD64CondZ: /* ZF == 1 */
868          zf = rflags >> AMD64G_CC_SHIFT_Z;
869          return 1 & (inv ^ zf);
870 
871       case AMD64CondNB:
872       case AMD64CondB: /* CF == 1 */
873          cf = rflags >> AMD64G_CC_SHIFT_C;
874          return 1 & (inv ^ cf);
875          break;
876 
877       case AMD64CondNBE:
878       case AMD64CondBE: /* (CF or ZF) == 1 */
879          cf = rflags >> AMD64G_CC_SHIFT_C;
880          zf = rflags >> AMD64G_CC_SHIFT_Z;
881          return 1 & (inv ^ (cf | zf));
882          break;
883 
884       case AMD64CondNS:
885       case AMD64CondS: /* SF == 1 */
886          sf = rflags >> AMD64G_CC_SHIFT_S;
887          return 1 & (inv ^ sf);
888 
889       case AMD64CondNP:
890       case AMD64CondP: /* PF == 1 */
891          pf = rflags >> AMD64G_CC_SHIFT_P;
892          return 1 & (inv ^ pf);
893 
894       case AMD64CondNL:
895       case AMD64CondL: /* (SF xor OF) == 1 */
896          sf = rflags >> AMD64G_CC_SHIFT_S;
897          of = rflags >> AMD64G_CC_SHIFT_O;
898          return 1 & (inv ^ (sf ^ of));
899          break;
900 
901       case AMD64CondNLE:
902       case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
903          sf = rflags >> AMD64G_CC_SHIFT_S;
904          of = rflags >> AMD64G_CC_SHIFT_O;
905          zf = rflags >> AMD64G_CC_SHIFT_Z;
906          return 1 & (inv ^ ((sf ^ of) | zf));
907          break;
908 
909       default:
910          /* shouldn't really make these calls from generated code */
911          vex_printf("amd64g_calculate_condition"
912                     "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
913                     cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
914          vpanic("amd64g_calculate_condition");
915    }
916 }
917 
918 
919 /* VISIBLE TO LIBVEX CLIENT */
920 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
921 {
922    ULong rflags = amd64g_calculate_rflags_all_WRK(
923                      vex_state->guest_CC_OP,
924                      vex_state->guest_CC_DEP1,
925                      vex_state->guest_CC_DEP2,
926                      vex_state->guest_CC_NDEP
927                   );
928    Long dflag = vex_state->guest_DFLAG;
929    vassert(dflag == 1 || dflag == -1);
930    if (dflag == -1)
931       rflags |= (1<<10);
932    if (vex_state->guest_IDFLAG == 1)
933       rflags |= (1<<21);
934    if (vex_state->guest_ACFLAG == 1)
935       rflags |= (1<<18);
936 
937    return rflags;
938 }
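
/* A minimal usage sketch from the client side (hypothetical driver code,
   not part of this file): to flip the guest carry flag via the full
   %rflags image, assuming a VexGuestAMD64State 'gst' already set up with
   LibVEX_GuestAMD64_initialise():

      ULong fl = LibVEX_GuestAMD64_get_rflags(&gst);
      LibVEX_GuestAMD64_put_rflags(fl ^ AMD64G_CC_MASK_C, &gst);

   For just the carry bit, LibVEX_GuestAMD64_put_rflag_c below achieves
   the same effect without touching the D, ID and AC state. */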
939 
940 /* VISIBLE TO LIBVEX CLIENT */
941 void
942 LibVEX_GuestAMD64_put_rflags ( ULong rflags,
943                                /*MOD*/VexGuestAMD64State* vex_state )
944 {
945    /* D flag */
946    if (rflags & AMD64G_CC_MASK_D) {
947       vex_state->guest_DFLAG = -1;
948       rflags &= ~AMD64G_CC_MASK_D;
949    }
950    else
951       vex_state->guest_DFLAG = 1;
952 
953    /* ID flag */
954    if (rflags & AMD64G_CC_MASK_ID) {
955       vex_state->guest_IDFLAG = 1;
956       rflags &= ~AMD64G_CC_MASK_ID;
957    }
958    else
959       vex_state->guest_IDFLAG = 0;
960 
961    /* AC flag */
962    if (rflags & AMD64G_CC_MASK_AC) {
963       vex_state->guest_ACFLAG = 1;
964       rflags &= ~AMD64G_CC_MASK_AC;
965    }
966    else
967       vex_state->guest_ACFLAG = 0;
968 
969    UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
970                   AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
971    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
972    vex_state->guest_CC_DEP1 = rflags & cc_mask;
973    vex_state->guest_CC_DEP2 = 0;
974    vex_state->guest_CC_NDEP = 0;
975 }
976 
977 /* VISIBLE TO LIBVEX CLIENT */
978 void
979 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
980                                /*MOD*/VexGuestAMD64State* vex_state )
981 {
982    ULong oszacp = amd64g_calculate_rflags_all_WRK(
983                      vex_state->guest_CC_OP,
984                      vex_state->guest_CC_DEP1,
985                      vex_state->guest_CC_DEP2,
986                      vex_state->guest_CC_NDEP
987                   );
988    if (new_carry_flag & 1) {
989       oszacp |= AMD64G_CC_MASK_C;
990    } else {
991       oszacp &= ~AMD64G_CC_MASK_C;
992    }
993    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
994    vex_state->guest_CC_DEP1 = oszacp;
995    vex_state->guest_CC_DEP2 = 0;
996    vex_state->guest_CC_NDEP = 0;
997 }
998 
999 
1000 /*---------------------------------------------------------------*/
1001 /*--- %rflags translation-time function specialisers.         ---*/
1002 /*--- These help iropt specialise calls the above run-time    ---*/
1003 /*--- %rflags functions.                                      ---*/
1004 /*---------------------------------------------------------------*/
1005 
1006 /* Used by the optimiser to try specialisations.  Returns an
1007    equivalent expression, or NULL if none. */
1008 
1009 static inline Bool isU64 ( IRExpr* e, ULong n )
1010 {
1011    return e->tag == Iex_Const
1012           && e->Iex.Const.con->tag == Ico_U64
1013           && e->Iex.Const.con->Ico.U64 == n;
1014 }
1015 
1016 /* Returns N if W64 is a value of the form 1 << N for N in 1 to 31,
1017    and zero in any other case. */
1018 static Int isU64_1_shl_N_literal ( ULong w64 )
1019 {
1020    if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
1021       return 0;
1022    if ((w64 & (w64 - 1)) != 0)
1023       return 0;
1024    /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
1025       and we only need to find out which one it is. */
1026    for (Int n = 1; n <= 31; n++) {
1027       if (w64 == (1ULL << n))
1028          return n;
1029    }
1030    /* Consequently we should never get here. */
1031    /*UNREACHED*/
1032    vassert(0);
1033    return 0;
1034 }
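
/* For example, isU64_1_shl_N_literal(0x100) == 8 and
   isU64_1_shl_N_literal(0x80000000) == 31, whereas 0, 1, any value with
   more than one bit set, and any single bit above bit 31 all yield 0. */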
1035 
1036 /* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
1037    and zero in any other case. */
1038 static Int isU64_1_shl_N ( IRExpr* e )
1039 {
1040    if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1041       return 0;
1042    ULong w64 = e->Iex.Const.con->Ico.U64;
1043    return isU64_1_shl_N_literal(w64);
1044 }
1045 
1046 /* Returns N if E is an immediate of the form (1 << N) - 1 for N in 1 to 31,
1047    and zero in any other case. */
1048 static Int isU64_1_shl_N_minus_1 ( IRExpr* e )
1049 {
1050   if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1051     return 0;
1052   ULong w64 = e->Iex.Const.con->Ico.U64;
1053   // This isn't actually necessary since isU64_1_shl_N_literal will return
1054   // zero given a zero argument, but still ..
1055   if (w64 == 0xFFFFFFFFFFFFFFFFULL)
1056      return 0;
1057   return isU64_1_shl_N_literal(w64 + 1);
1058 }
1059 
1060 IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1061                                  IRExpr** args,
1062                                  IRStmt** precedingStmts,
1063                                  Int      n_precedingStmts )
1064 {
1065 #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1066 #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1067 #  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1068 #  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1069 #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
1070 
1071    Int i, arity = 0;
1072    for (i = 0; args[i]; i++)
1073       arity++;
1074 #  if 0
1075    vex_printf("spec request:\n");
1076    vex_printf("   %s  ", function_name);
1077    for (i = 0; i < arity; i++) {
1078       vex_printf("  ");
1079       ppIRExpr(args[i]);
1080    }
1081    vex_printf("\n");
1082 #  endif
1083 
1084    /* --------- specialising "amd64g_calculate_condition" --------- */
1085 
1086    if (vex_streq(function_name, "amd64g_calculate_condition")) {
1087       /* specialise calls to above "calculate condition" function */
1088       IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1089       vassert(arity == 5);
1090       cond    = args[0];
1091       cc_op   = args[1];
1092       cc_dep1 = args[2];
1093       cc_dep2 = args[3];
1094 
1095       /*---------------- ADDQ ----------------*/
1096 
1097       if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1098          /* long long add, then Z --> test (dst+src == 0) */
1099          return unop(Iop_1Uto64,
1100                      binop(Iop_CmpEQ64,
1101                            binop(Iop_Add64, cc_dep1, cc_dep2),
1102                            mkU64(0)));
1103       }
1104 
1105       /*---------------- ADDL ----------------*/
1106 
1107       if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1108          /* This is very commonly generated by Javascript JITs, for
1109             the idiom "do a 32-bit add and jump to out-of-line code if
1110             an overflow occurs". */
1111          /* long add, then O (overflow)
1112             --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1113             --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1114             --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1115          */
1116          vassert(isIRAtom(cc_dep1));
1117          vassert(isIRAtom(cc_dep2));
1118          return
1119             binop(Iop_And64,
1120                   binop(Iop_Shr64,
1121                         binop(Iop_And64,
1122                               unop(Iop_Not64,
1123                                    binop(Iop_Xor64, cc_dep1, cc_dep2)),
1124                               binop(Iop_Xor64,
1125                                     cc_dep1,
1126                                     binop(Iop_Add64, cc_dep1, cc_dep2))),
1127                         mkU8(31)),
1128                   mkU64(1));
1129 
1130       }
1131 
1132       /*---------------- SUBQ ----------------*/
1133 
1134       /* 0, */
1135       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1136          /* long long sub/cmp, then O (overflow)
1137             --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1138             --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1139          */
1140          vassert(isIRAtom(cc_dep1));
1141          vassert(isIRAtom(cc_dep2));
1142          return binop(Iop_Shr64,
1143                       binop(Iop_And64,
1144                             binop(Iop_Xor64, cc_dep1, cc_dep2),
1145                             binop(Iop_Xor64,
1146                                   cc_dep1,
1147                                   binop(Iop_Sub64, cc_dep1, cc_dep2))),
1148                       mkU8(63));
1149       }
1150       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1151          /* No action.  Never yet found a test case. */
1152       }
1153 
1154       /* 2, 3 */
1155       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1156          /* long long sub/cmp, then B (unsigned less than)
1157             --> test dst <u src */
1158          return unop(Iop_1Uto64,
1159                      binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1160       }
1161       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1162          /* long long sub/cmp, then NB (unsigned greater than or equal)
1163             --> test src <=u dst */
1164          /* Note, args are opposite way round from the usual */
1165          return unop(Iop_1Uto64,
1166                      binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1167       }
1168 
1169       /* 4, 5 */
1170       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1171          /* long long sub/cmp, then Z --> test dst==src */
1172          return unop(Iop_1Uto64,
1173                      binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1174       }
1175       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1176          /* long long sub/cmp, then NZ --> test dst!=src */
1177          return unop(Iop_1Uto64,
1178                      binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1179       }
1180 
1181       /* 6, 7 */
1182       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1183          /* long long sub/cmp, then BE (unsigned less than or equal)
1184             --> test dst <=u src */
1185          return unop(Iop_1Uto64,
1186                      binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1187       }
1188       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1189          /* long long sub/cmp, then NBE (unsigned greater than)
1190             --> test !(dst <=u src) */
1191          return binop(Iop_Xor64,
1192                       unop(Iop_1Uto64,
1193                            binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1194                       mkU64(1));
1195       }
1196 
1197       /* 8, 9 */
1198       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1199          /* long long sub/cmp, then S (negative)
1200             --> (dst-src)[63]
1201             --> (dst-src) >>u 63 */
1202          return binop(Iop_Shr64,
1203                       binop(Iop_Sub64, cc_dep1, cc_dep2),
1204                       mkU8(63));
1205       }
1206       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1207          /* long long sub/cmp, then NS (not negative)
1208             --> (dst-src)[63] ^ 1
1209             --> ((dst-src) >>u 63) ^ 1 */
1210          return binop(Iop_Xor64,
1211                       binop(Iop_Shr64,
1212                             binop(Iop_Sub64, cc_dep1, cc_dep2),
1213                             mkU8(63)),
1214                       mkU64(1));
1215       }
1216 
1217       /* 12, 13 */
1218       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1219          /* long long sub/cmp, then L (signed less than)
1220             --> test dst <s src */
1221          return unop(Iop_1Uto64,
1222                      binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1223       }
1224       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1225          /* long long sub/cmp, then NL (signed greater than or equal)
1226             --> test dst >=s src
1227             --> test src <=s dst */
1228          return unop(Iop_1Uto64,
1229                      binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1230       }
1231 
1232       /* 14, 15 */
1233       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1234          /* long long sub/cmp, then LE (signed less than or equal)
1235             --> test dst <=s src */
1236          return unop(Iop_1Uto64,
1237                      binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1238       }
1239       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1240          /* long sub/cmp, then NLE (signed greater than)
1241             --> test !(dst <=s src)
1242             --> test (dst >s src)
1243             --> test (src <s dst) */
1244          return unop(Iop_1Uto64,
1245                      binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1246 
1247       }
1248 
1249       /*---------------- SUBL ----------------*/
1250 
1251       /* 0, */
1252       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1253          /* This is very commonly generated by Javascript JITs, for
1254             the idiom "do a 32-bit subtract and jump to out-of-line
1255             code if an overflow occurs". */
1256          /* long sub/cmp, then O (overflow)
1257             --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1258             --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1259          */
1260          vassert(isIRAtom(cc_dep1));
1261          vassert(isIRAtom(cc_dep2));
1262          return
1263             binop(Iop_And64,
1264                   binop(Iop_Shr64,
1265                         binop(Iop_And64,
1266                               binop(Iop_Xor64, cc_dep1, cc_dep2),
1267                               binop(Iop_Xor64,
1268                                     cc_dep1,
1269                                     binop(Iop_Sub64, cc_dep1, cc_dep2))),
1270                         mkU8(31)),
1271                   mkU64(1));
1272       }
1273       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1274          /* No action.  Never yet found a test case. */
1275       }
1276 
1277       /* 2, 3 */
1278       {
1279         /* It appears that LLVM 5.0 and later have a new way to find out
1280            whether the top N bits of a word W are all zero, by computing
1281 
1282              W  <u   0---(N-1)---0 1 0---0  or
1283              W  <=u  0---(N-1)---0 0 1---1
1284 
1285            In particular, the result will be defined if the top N bits of W
1286            are defined, even if the trailing bits -- those corresponding to
1287            the rightmost 0---0 / 1---1 section -- are undefined.  Rather than
1288            make Memcheck more complex, we detect this case where we can and
1289            shift out the irrelevant and potentially undefined bits. */
1290         Int n = 0;
1291         Bool is_NB_or_NBE = False;
1292         if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1293            if (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB)) {
1294               /* long sub/cmp, then B (unsigned less than),
1295                  where dep2 is a power of 2:
1296                    -> CmpLT32U(dep1, 1 << N)
1297                    -> CmpEQ32(dep1 >>u N, 0)
1298                  and
1299                  long sub/cmp, then NB (unsigned greater than or equal),
1300                  where dep2 is a power of 2:
1301                    -> CmpGE32U(dep1, 1 << N)
1302                    -> CmpNE32(dep1 >>u N, 0)
1303                  This avoids CmpLT32U/CmpGE32U being applied to potentially
1304                  uninitialised bits in the area being shifted out. */
1305               n = isU64_1_shl_N(cc_dep2);
1306               is_NB_or_NBE = isU64(cond, AMD64CondNB);
1307            } else if (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE)) {
1308               /* long sub/cmp, then BE (unsigned less than or equal),
1309                  where dep2 is a power of 2 minus 1:
1310                   -> CmpLE32U(dep1, (1 << N) - 1)
1311                   -> CmpEQ32(dep1 >>u N, 0)
1312                  and
1313                  long sub/cmp, then NBE (unsigned greater than),
1314                  where dep2 is a power of 2 minus 1:
1315                    -> CmpGT32U(dep1, (1 << N) - 1)
1316                    -> CmpNE32(dep1 >>u N, 0)
1317                  This avoids CmpLE32U/CmpGT32U being applied to potentially
1318                  uninitialised bits in the area being shifted out. */
1319               n = isU64_1_shl_N_minus_1(cc_dep2);
1320               is_NB_or_NBE = isU64(cond, AMD64CondNBE);
1321            }
1322         }
1323         if (n > 0) {
1324            vassert(n >= 1 && n <= 31);
1325            return unop(Iop_1Uto64,
1326                        binop(is_NB_or_NBE ? Iop_CmpNE32 : Iop_CmpEQ32,
1327                              binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
1328                                               mkU8(n)),
1329                              mkU32(0)));
1330         }
1331       }
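
      /* Concrete instance of the idiom above: for a 32-bit compare such as
         "cmpl $0x10000, %eax ; jb ..." the thunk has cc_dep2 == 0x10000
         == 1 << 16, so n == 16 and the condition is rewritten as
         CmpEQ32(eax >>u 16, 0), i.e. "are the top 16 bits of %eax all
         zero", which is defined whenever those top 16 bits are defined,
         regardless of the low 16. */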
1332       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1333          /* long sub/cmp, then B (unsigned less than)
1334             --> test dst <u src */
1335          return unop(Iop_1Uto64,
1336                      binop(Iop_CmpLT32U,
1337                            unop(Iop_64to32, cc_dep1),
1338                            unop(Iop_64to32, cc_dep2)));
1339       }
1340       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1341          /* long sub/cmp, then NB (unsigned greater than or equal)
1342             --> test src <=u dst */
1343          /* Note, args are opposite way round from the usual */
1344          return unop(Iop_1Uto64,
1345                      binop(Iop_CmpLE32U,
1346                            unop(Iop_64to32, cc_dep2),
1347                            unop(Iop_64to32, cc_dep1)));
1348       }
1349 
1350       /* 4, 5 */
1351       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1352          /* long sub/cmp, then Z --> test dst==src */
1353          return unop(Iop_1Uto64,
1354                      binop(Iop_CmpEQ32,
1355                            unop(Iop_64to32, cc_dep1),
1356                            unop(Iop_64to32, cc_dep2)));
1357       }
1358       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1359          /* long sub/cmp, then NZ --> test dst!=src */
1360          return unop(Iop_1Uto64,
1361                      binop(Iop_CmpNE32,
1362                            unop(Iop_64to32, cc_dep1),
1363                            unop(Iop_64to32, cc_dep2)));
1364       }
1365 
1366       /* 6, 7 */
1367       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1368          /* long sub/cmp, then BE (unsigned less than or equal)
1369             --> test dst <=u src */
1370          return unop(Iop_1Uto64,
1371                      binop(Iop_CmpLE32U,
1372                            unop(Iop_64to32, cc_dep1),
1373                            unop(Iop_64to32, cc_dep2)));
1374       }
1375       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1376          /* long sub/cmp, then NBE (unsigned greater than)
1377             --> test src <u dst */
1378          /* Note, args are opposite way round from the usual */
1379          return unop(Iop_1Uto64,
1380                      binop(Iop_CmpLT32U,
1381                            unop(Iop_64to32, cc_dep2),
1382                            unop(Iop_64to32, cc_dep1)));
1383       }
1384 
1385       /* 8, 9 */
1386       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1387          /* long sub/cmp, then S (negative)
1388             --> (dst-src)[31]
1389             --> ((dst -64 src) >>u 31) & 1
1390             Pointless to narrow the args to 32 bit before the subtract. */
1391          return binop(Iop_And64,
1392                       binop(Iop_Shr64,
1393                             binop(Iop_Sub64, cc_dep1, cc_dep2),
1394                             mkU8(31)),
1395                       mkU64(1));
1396       }
1397       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1398          /* long sub/cmp, then NS (not negative)
1399             --> (dst-src)[31] ^ 1
1400             --> (((dst -64 src) >>u 31) & 1) ^ 1
1401             Pointless to narrow the args to 32 bit before the subtract. */
1402          return binop(Iop_Xor64,
1403                       binop(Iop_And64,
1404                             binop(Iop_Shr64,
1405                                   binop(Iop_Sub64, cc_dep1, cc_dep2),
1406                                   mkU8(31)),
1407                             mkU64(1)),
1408                       mkU64(1));
1409       }
1410 
1411       /* 12, 13 */
1412       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1413          /* long sub/cmp, then L (signed less than)
1414             --> test dst <s src */
1415          return unop(Iop_1Uto64,
1416                      binop(Iop_CmpLT32S,
1417                            unop(Iop_64to32, cc_dep1),
1418                            unop(Iop_64to32, cc_dep2)));
1419       }
1420       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1421          /* long sub/cmp, then NL (signed greater than or equal)
1422             --> test dst >=s src
1423             --> test src <=s dst */
1424          return unop(Iop_1Uto64,
1425                      binop(Iop_CmpLE32S,
1426                            unop(Iop_64to32, cc_dep2),
1427                            unop(Iop_64to32, cc_dep1)));
1428       }
1429 
1430       /* 14, 15 */
1431       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1432          /* long sub/cmp, then LE (signed less than or equal)
1433             --> test dst <=s src */
1434          return unop(Iop_1Uto64,
1435                      binop(Iop_CmpLE32S,
1436                            unop(Iop_64to32, cc_dep1),
1437                            unop(Iop_64to32, cc_dep2)));
1438 
1439       }
1440       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1441          /* long sub/cmp, then NLE (signed greater than)
1442             --> test !(dst <=s src)
1443             --> test (dst >s src)
1444             --> test (src <s dst) */
1445          return unop(Iop_1Uto64,
1446                      binop(Iop_CmpLT32S,
1447                            unop(Iop_64to32, cc_dep2),
1448                            unop(Iop_64to32, cc_dep1)));
1449 
1450       }
1451 
1452       /*---------------- SUBW ----------------*/
1453 
1454       /* 4, 5 */
1455       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1456          /* word sub/cmp, then Z --> test dst==src */
1457          return unop(Iop_1Uto64,
1458                      binop(Iop_CmpEQ16,
1459                            unop(Iop_64to16,cc_dep1),
1460                            unop(Iop_64to16,cc_dep2)));
1461       }
1462       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1463          /* word sub/cmp, then NZ --> test dst!=src */
1464          return unop(Iop_1Uto64,
1465                      binop(Iop_CmpNE16,
1466                            unop(Iop_64to16,cc_dep1),
1467                            unop(Iop_64to16,cc_dep2)));
1468       }
1469 
1470       /* 6, */
1471       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1472          /* word sub/cmp, then BE (unsigned less than or equal)
1473             --> test dst <=u src */
1474          return unop(Iop_1Uto64,
1475                      binop(Iop_CmpLE64U,
1476                            binop(Iop_Shl64, cc_dep1, mkU8(48)),
1477                            binop(Iop_Shl64, cc_dep2, mkU8(48))));
1478       }
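      /* The Shl64-by-48 trick used here moves the 16-bit operands into
         bits 63..48, so an unsigned 64-bit comparison orders them exactly
         as a 16-bit unsigned comparison would, while whatever sits in the
         (possibly undefined) upper bits of cc_dep1/cc_dep2 is shifted out
         entirely. */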
1479 
1480       /* 8, 9 */
1481       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1482                                           && isU64(cc_dep2, 0)) {
1483          /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1484                                          --> test dst <s 0
1485                                          --> (ULong)dst[15]
1486             This is yet another scheme by which clang figures out if the
1487             top bit of a word is 1 or 0.  See also LOGICB/CondS below. */
1488          /* Note: isU64(cc_dep2, 0) is correct, even though this is
1489            for a 16-bit comparison, since the args to the helper
1490             function are always U64s. */
1491          return binop(Iop_And64,
1492                       binop(Iop_Shr64,cc_dep1,mkU8(15)),
1493                       mkU64(1));
1494       }
1495       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1496                                           && isU64(cc_dep2, 0)) {
1497          /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1498                                           --> test !(dst <s 0)
1499                                           --> (ULong) !dst[15]
1500          */
1501          return binop(Iop_Xor64,
1502                       binop(Iop_And64,
1503                             binop(Iop_Shr64,cc_dep1,mkU8(15)),
1504                             mkU64(1)),
1505                       mkU64(1));
1506       }
1507 
1508       /* 14, */
1509       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1510          /* word sub/cmp, then LE (signed less than or equal)
1511             --> test dst <=s src */
1512          return unop(Iop_1Uto64,
1513                      binop(Iop_CmpLE64S,
1514                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
1515                            binop(Iop_Shl64,cc_dep2,mkU8(48))));
1516 
1517       }
1518 
1519       /*---------------- SUBB ----------------*/
1520 
1521       /* 2, 3 */
1522       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1523          /* byte sub/cmp, then B (unsigned less than)
1524             --> test dst <u src */
1525          return unop(Iop_1Uto64,
1526                      binop(Iop_CmpLT64U,
1527                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1528                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1529       }
1530       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1531          /* byte sub/cmp, then NB (unsigned greater than or equal)
1532             --> test src <=u dst */
1533          /* Note, args are opposite way round from the usual */
1534          return unop(Iop_1Uto64,
1535                      binop(Iop_CmpLE64U,
1536                            binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1537                            binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1538       }
1539 
1540       /* 4, 5 */
1541       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1542          /* byte sub/cmp, then Z --> test dst==src */
1543          return unop(Iop_1Uto64,
1544                      binop(Iop_CmpEQ8,
1545                            unop(Iop_64to8,cc_dep1),
1546                            unop(Iop_64to8,cc_dep2)));
1547       }
1548       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1549          /* byte sub/cmp, then NZ --> test dst!=src */
1550          return unop(Iop_1Uto64,
1551                      binop(Iop_CmpNE8,
1552                            unop(Iop_64to8,cc_dep1),
1553                            unop(Iop_64to8,cc_dep2)));
1554       }
1555 
1556       /* 6, */
1557       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1558          /* byte sub/cmp, then BE (unsigned less than or equal)
1559             --> test dst <=u src */
1560          return unop(Iop_1Uto64,
1561                      binop(Iop_CmpLE64U,
1562                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1563                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1564       }
1565 
1566       /* 8, 9 */
1567       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1568                                           && isU64(cc_dep2, 0)) {
1569          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1570                                          --> test dst <s 0
1571                                          --> (ULong)dst[7]
1572             This is yet another scheme by which gcc figures out if the
1573             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
1574          /* Note: isU64(cc_dep2, 0) is correct, even though this is
1575             for an 8-bit comparison, since the args to the helper
1576             function are always U64s. */
1577          return binop(Iop_And64,
1578                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
1579                       mkU64(1));
1580       }
1581       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1582                                           && isU64(cc_dep2, 0)) {
1583          /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1584                                           --> test !(dst <s 0)
1585                                           --> (ULong) !dst[7]
1586          */
1587          return binop(Iop_Xor64,
1588                       binop(Iop_And64,
1589                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
1590                             mkU64(1)),
1591                       mkU64(1));
1592       }
1593 
1594       /*---------------- LOGICQ ----------------*/
1595 
1596       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1597          /* long long and/or/xor, then Z --> test dst==0 */
1598          return unop(Iop_1Uto64,
1599                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1600       }
1601       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1602          /* long long and/or/xor, then NZ --> test dst!=0 */
1603          return unop(Iop_1Uto64,
1604                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1605       }
1606 
1607       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1608          /* long long and/or/xor, then L
1609             LOGIC sets SF and ZF according to the
1610             result and makes OF be zero.  L computes SF ^ OF, but
1611             OF is zero, so this reduces to SF -- which will be 1 iff
1612             the result is < signed 0.  Hence ...
1613          */
1614          return unop(Iop_1Uto64,
1615                      binop(Iop_CmpLT64S,
1616                            cc_dep1,
1617                            mkU64(0)));
1618       }
1619 
1620       /*---------------- LOGICL ----------------*/
1621 
1622       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1623          /* long and/or/xor, then Z --> test dst==0 */
1624          return unop(Iop_1Uto64,
1625                      binop(Iop_CmpEQ32,
1626                            unop(Iop_64to32, cc_dep1),
1627                            mkU32(0)));
1628       }
1629       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1630          /* long and/or/xor, then NZ --> test dst!=0 */
1631          return unop(Iop_1Uto64,
1632                      binop(Iop_CmpNE32,
1633                            unop(Iop_64to32, cc_dep1),
1634                            mkU32(0)));
1635       }
1636 
1637       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1638          /* long and/or/xor, then LE
1639             This is pretty subtle.  LOGIC sets SF and ZF according to the
1640             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
1641             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1642             the result is <=signed 0.  Hence ...
1643          */
1644          return unop(Iop_1Uto64,
1645                      binop(Iop_CmpLE32S,
1646                            unop(Iop_64to32, cc_dep1),
1647                            mkU32(0)));
1648       }
1649 
1650       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1651          /* long and/or/xor, then S --> (ULong)result[31] */
1652          return binop(Iop_And64,
1653                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
1654                       mkU64(1));
1655       }
1656       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1657          /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1658          return binop(Iop_Xor64,
1659                 binop(Iop_And64,
1660                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
1661                       mkU64(1)),
1662                 mkU64(1));
1663       }
1664 
1665       /*---------------- LOGICW ----------------*/
1666 
1667       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1668          /* word and/or/xor, then Z --> test dst==0 */
1669          // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1670          // it exactly at EdcAUTO.
1671          return unop(Iop_1Uto64,
1672                      binop(Iop_CmpEQ32,
1673                            unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1674                            mkU32(0)));
1675       }
1676       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1677          /* word and/or/xor, then NZ --> test dst!=0 */
1678          // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1679          // it exactly at EdcAUTO.
1680          return unop(Iop_1Uto64,
1681                      binop(Iop_CmpNE32,
1682                            unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1683                            mkU32(0)));
1684       }
1685 
1686       /*---------------- LOGICB ----------------*/
1687 
1688       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1689          /* byte and/or/xor, then Z --> test dst==0 */
1690          // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1691          // it exactly at EdcAUTO.
1692          return unop(Iop_1Uto64,
1693                      binop(Iop_CmpEQ32,
1694                            unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1695                            mkU32(0)));
1696       }
1697       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1698          /* byte and/or/xor, then NZ --> test dst!=0 */
1699          // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1700          // it exactly at EdcAUTO.
1701          return unop(Iop_1Uto64,
1702                      binop(Iop_CmpNE32,
1703                            unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1704                            mkU32(0)));
1705       }
1706 
1707       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1708          /* this is an idiom gcc sometimes uses to find out if the top
1709             bit of a byte register is set: eg testb %al,%al; js ..
1710             Since it just depends on the top bit of the byte, extract
1711             that bit and explicitly get rid of all the rest.  This
1712             helps memcheck avoid false positives in the case where any
1713             of the other bits in the byte are undefined. */
1714          /* byte and/or/xor, then S --> (UInt)result[7] */
1715          return binop(Iop_And64,
1716                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
1717                       mkU64(1));
1718       }
1719       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1720          /* byte and/or/xor, then NS --> (UInt)!result[7] */
1721          return binop(Iop_Xor64,
1722                       binop(Iop_And64,
1723                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
1724                             mkU64(1)),
1725                       mkU64(1));
1726       }
1727 
1728       /*---------------- INCB ----------------*/
1729 
1730       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1731          /* 8-bit inc, then LE --> sign bit of the arg */
1732          return binop(Iop_And64,
1733                       binop(Iop_Shr64,
1734                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
1735                             mkU8(7)),
1736                       mkU64(1));
1737       }
1738 
1739       /*---------------- INCW ----------------*/
1740 
1741       if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1742          /* 16-bit inc, then Z --> test dst == 0 */
1743          return unop(Iop_1Uto64,
1744                      binop(Iop_CmpEQ64,
1745                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
1746                            mkU64(0)));
1747       }
1748 
1749       /*---------------- DECL ----------------*/
1750 
1751       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1752          /* dec L, then Z --> test dst == 0 */
1753          return unop(Iop_1Uto64,
1754                      binop(Iop_CmpEQ32,
1755                            unop(Iop_64to32, cc_dep1),
1756                            mkU32(0)));
1757       }
1758 
1759       /*---------------- DECW ----------------*/
1760 
1761       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1762          /* 16-bit dec, then NZ --> test dst != 0 */
1763          return unop(Iop_1Uto64,
1764                      binop(Iop_CmpNE64,
1765                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
1766                            mkU64(0)));
1767       }
1768 
1769       /*---------------- SHRQ ----------------*/
1770 
1771       if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1772          /* SHRQ, then Z --> test dep1 == 0 */
1773          return unop(Iop_1Uto64,
1774                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1775       }
1776       if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1777          /* SHRQ, then NZ --> test dep1 != 0 */
1778          return unop(Iop_1Uto64,
1779                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1780       }
1781 
1782       /*---------------- SHRL ----------------*/
1783 
1784       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1785          /* SHRL, then Z --> test dep1 == 0 */
1786          return unop(Iop_1Uto64,
1787                      binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1788                            mkU32(0)));
1789       }
1790       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1791          /* SHRL, then NZ --> test dep1 != 0 */
1792          return unop(Iop_1Uto64,
1793                      binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1794                            mkU32(0)));
1795       }
1796 
1797       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1798          /* SHRL/SARL, then S --> (ULong)result[31] */
1799          return binop(Iop_And64,
1800                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
1801                       mkU64(1));
1802       }
1803       // The following looks correct to me, but never seems to happen because
1804       // the front end converts jns to js by switching the fallthrough vs
1805       // taken addresses.  See jcc_01().  But then why do other conditions
1806       // considered by this function show up in both variants (xx and Nxx) ?
1807       //if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1808       //   /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1809       //   vassert(0);
1810       //   return binop(Iop_Xor64,
1811       //                binop(Iop_And64,
1812       //                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
1813       //                      mkU64(1)),
1814       //                mkU64(1));
1815       //}
1816 
1817       /*---------------- COPY ----------------*/
1818       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1819          jbe" for example. */
1820 
1821       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1822           && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1823          /* COPY, then BE --> extract C and Z from dep1, and test (C
1824             or Z == 1). */
1825          /* COPY, then NBE --> extract C and Z from dep1, and test (C
1826             or Z == 0). */
1827          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1828          return
1829             unop(
1830                Iop_1Uto64,
1831                binop(
1832                   Iop_CmpEQ64,
1833                   binop(
1834                      Iop_And64,
1835                      binop(
1836                         Iop_Or64,
1837                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1838                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1839                      ),
1840                      mkU64(1)
1841                   ),
1842                   mkU64(nnn)
1843                )
1844             );
1845       }
1846 
1847       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1848           && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
1849          /* COPY, then B --> extract C from dep1, and test (C == 1). */
1850          /* COPY, then NB --> extract C from dep1, and test (C == 0). */
1851          ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
1852          return
1853             unop(
1854                Iop_1Uto64,
1855                binop(
1856                   Iop_CmpEQ64,
1857                   binop(
1858                      Iop_And64,
1859                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1860                      mkU64(1)
1861                   ),
1862                   mkU64(nnn)
1863                )
1864             );
1865       }
1866 
1867       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1868           && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1869          /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1870          /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1871          ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1872          return
1873             unop(
1874                Iop_1Uto64,
1875                binop(
1876                   Iop_CmpEQ64,
1877                   binop(
1878                      Iop_And64,
1879                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1880                      mkU64(1)
1881                   ),
1882                   mkU64(nnn)
1883                )
1884             );
1885       }
1886 
1887       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1888           && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
1889          /* COPY, then P --> extract P from dep1, and test (P == 1). */
1890          /* COPY, then NP --> extract P from dep1, and test (P == 0). */
1891          ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
1892          return
1893             unop(
1894                Iop_1Uto64,
1895                binop(
1896                   Iop_CmpEQ64,
1897                   binop(
1898                      Iop_And64,
1899                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1900                      mkU64(1)
1901                   ),
1902                   mkU64(nnn)
1903                )
1904             );
1905       }
1906 
1907       return NULL;
1908    }
1909 
1910    /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1911 
1912    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1913       /* specialise calls to above "calculate_rflags_c" function */
1914       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1915       vassert(arity == 4);
1916       cc_op   = args[0];
1917       cc_dep1 = args[1];
1918       cc_dep2 = args[2];
1919       cc_ndep = args[3];
1920 
1921       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1922          /* C after sub denotes unsigned less than */
1923          return unop(Iop_1Uto64,
1924                      binop(Iop_CmpLT64U,
1925                            cc_dep1,
1926                            cc_dep2));
1927       }
1928       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1929          /* C after sub denotes unsigned less than */
1930          return unop(Iop_1Uto64,
1931                      binop(Iop_CmpLT32U,
1932                            unop(Iop_64to32, cc_dep1),
1933                            unop(Iop_64to32, cc_dep2)));
1934       }
1935       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1936          /* C after sub denotes unsigned less than */
1937          return unop(Iop_1Uto64,
1938                      binop(Iop_CmpLT64U,
1939                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1940                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1941       }
1942       if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
1943          /* C after add denotes sum <u either arg */
1944          return unop(Iop_1Uto64,
1945                      binop(Iop_CmpLT64U,
1946                            binop(Iop_Add64, cc_dep1, cc_dep2),
1947                            cc_dep1));
1948       }
1949       if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
1950          /* C after add denotes sum <u either arg */
1951          return unop(Iop_1Uto64,
1952                      binop(Iop_CmpLT32U,
1953                            unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1954                            unop(Iop_64to32, cc_dep1)));
1955       }
1956       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1957           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1958           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1959           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1960          /* cflag after logic is zero */
1961          return mkU64(0);
1962       }
1963       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1964           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1965          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1966          return cc_ndep;
1967       }
1968 
1969 #     if 0
1970       if (cc_op->tag == Iex_Const) {
1971          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1972       }
1973 #     endif
1974 
1975       return NULL;
1976    }
1977 
1978 #  undef unop
1979 #  undef binop
1980 #  undef mkU64
1981 #  undef mkU32
1982 #  undef mkU8
1983 
1984    return NULL;
1985 }
1986 
1987 
1988 /*---------------------------------------------------------------*/
1989 /*--- Supporting functions for x87 FPU activities.            ---*/
1990 /*---------------------------------------------------------------*/
1991 
1992 static inline Bool host_is_little_endian ( void )
1993 {
1994    UInt x = 0x76543210;
1995    UChar* p = (UChar*)(&x);
1996    return toBool(*p == 0x10);
1997 }
1998 
1999 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
2000 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
2001 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
2002 {
2003    Bool   mantissaIsZero;
2004    Int    bexp;
2005    UChar  sign;
2006    UChar* f64;
2007 
2008    vassert(host_is_little_endian());
2009 
2010    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
2011 
2012    f64  = (UChar*)(&dbl);
2013    sign = toUChar( (f64[7] >> 7) & 1 );
2014 
2015    /* First off, if the tag indicates the register was empty,
2016       return 1,0,sign,1 */
2017    if (tag == 0) {
2018       /* vex_printf("Empty\n"); */
2019       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
2020                                    | AMD64G_FC_MASK_C0;
2021    }
2022 
2023    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
2024    bexp &= 0x7FF;
2025 
2026    mantissaIsZero
2027       = toBool(
2028            (f64[6] & 0x0F) == 0
2029            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
2030         );
2031 
2032    /* If both exponent and mantissa are zero, the value is zero.
2033       Return 1,0,sign,0. */
2034    if (bexp == 0 && mantissaIsZero) {
2035       /* vex_printf("Zero\n"); */
2036       return AMD64G_FC_MASK_C3 | 0
2037                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
2038    }
2039 
2040    /* If exponent is zero but mantissa isn't, it's a denormal.
2041       Return 1,1,sign,0. */
2042    if (bexp == 0 && !mantissaIsZero) {
2043       /* vex_printf("Denormal\n"); */
2044       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
2045                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
2046    }
2047 
2048    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
2049       Return 0,1,sign,1. */
2050    if (bexp == 0x7FF && mantissaIsZero) {
2051       /* vex_printf("Inf\n"); */
2052       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2053                                    | AMD64G_FC_MASK_C0;
2054    }
2055 
2056    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2057       Return 0,0,sign,1. */
2058    if (bexp == 0x7FF && !mantissaIsZero) {
2059       /* vex_printf("NaN\n"); */
2060       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2061    }
2062 
2063    /* Uh, ok, we give up.  It must be a normal finite number.
2064       Return 0,1,sign,0.
2065    */
2066    /* vex_printf("normal\n"); */
2067    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2068 }
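/* Worked examples for the classification above; a sketch that is not
   part of the build.  The hex constants are the IEEE754 double encodings
   of the values named in the comments. */
#if 0
static void show_FXAM_examples ( void )
{
   /* Empty register (tag == 0): C3=1, C1=sign, C0=1. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(0, 0));
   /* +0.0 -- exponent and mantissa both zero: C3=1 only. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(1, 0x0000000000000000ULL));
   /* 1.0 == 0x3FF0000000000000 -- normal finite number: C2=1 only. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(1, 0x3FF0000000000000ULL));
   /* +Inf == 0x7FF0000000000000: C2=1 and C0=1. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(1, 0x7FF0000000000000ULL));
}
#endif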
2069 
2070 
2071 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
2072    appears to differ from the former only in that the 8 FP registers
2073    themselves are not transferred into the guest state. */
2074 static
2075 VexEmNote do_put_x87 ( Bool moveRegs,
2076                        /*IN*/Fpu_State* x87_state,
2077                        /*OUT*/VexGuestAMD64State* vex_state )
2078 {
2079    Int        stno, preg;
2080    UInt       tag;
2081    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2082    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2083    UInt       ftop    = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2084    UInt       tagw    = x87_state->env[FP_ENV_TAG];
2085    UInt       fpucw   = x87_state->env[FP_ENV_CTRL];
2086    UInt       c3210   = x87_state->env[FP_ENV_STAT] & 0x4700;
2087    VexEmNote  ew;
2088    UInt       fpround;
2089    ULong      pair;
2090 
2091    /* Copy registers and tags */
2092    for (stno = 0; stno < 8; stno++) {
2093       preg = (stno + ftop) & 7;
2094       tag = (tagw >> (2*preg)) & 3;
2095       if (tag == 3) {
2096          /* register is empty */
2097          /* hmm, if it's empty, does it still get written?  Probably
2098             safer to say it does.  If we don't, memcheck could get out
2099             of sync, in that it thinks all FP registers are defined by
2100             this helper, but in reality some have not been updated. */
2101          if (moveRegs)
2102             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2103          vexTags[preg] = 0;
2104       } else {
2105          /* register is non-empty */
2106          if (moveRegs)
2107             convert_f80le_to_f64le( &x87_state->reg[10*stno],
2108                                     (UChar*)&vexRegs[preg] );
2109          vexTags[preg] = 1;
2110       }
2111    }
2112 
2113    /* stack pointer */
2114    vex_state->guest_FTOP = ftop;
2115 
2116    /* status word */
2117    vex_state->guest_FC3210 = c3210;
2118 
2119    /* handle the control word, setting FPROUND and detecting any
2120       emulation warnings. */
2121    pair    = amd64g_check_fldcw ( (ULong)fpucw );
2122    fpround = (UInt)pair & 0xFFFFFFFFULL;
2123    ew      = (VexEmNote)(pair >> 32);
2124 
2125    vex_state->guest_FPROUND = fpround & 3;
2126 
2127    /* emulation warnings --> caller */
2128    return ew;
2129 }
2130 
2131 
2132 /* Create an x87 FPU state from the guest state, as close as
2133    we can approximate it. */
2134 static
2135 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2136                   /*OUT*/Fpu_State* x87_state )
2137 {
2138    Int        i, stno, preg;
2139    UInt       tagw;
2140    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2141    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2142    UInt       ftop    = vex_state->guest_FTOP;
2143    UInt       c3210   = vex_state->guest_FC3210;
2144 
2145    for (i = 0; i < 14; i++)
2146       x87_state->env[i] = 0;
2147 
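   /* env[] is the 28-byte protected-mode FSTENV/FLDENV image viewed as
      14 UShorts.  The control, status and tag words occupy only the low
      half of their 4-byte slots, so the unused pad halves (env[1],
      env[3], env[5]) and the pad above the data selector (env[13]) are
      filled with 0xFFFF below; the FP instruction/data pointer fields
      are left at zero since VEX does not track them. */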
2148    x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2149       = x87_state->env[13] = 0xFFFF;
2150    x87_state->env[FP_ENV_STAT]
2151       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2152    x87_state->env[FP_ENV_CTRL]
2153       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2154 
2155    /* Dump the register stack in ST order. */
2156    tagw = 0;
2157    for (stno = 0; stno < 8; stno++) {
2158       preg = (stno + ftop) & 7;
2159       if (vexTags[preg] == 0) {
2160          /* register is empty */
2161          tagw |= (3 << (2*preg));
2162          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2163                                  &x87_state->reg[10*stno] );
2164       } else {
2165          /* register is full. */
2166          tagw |= (0 << (2*preg));
2167          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2168                                  &x87_state->reg[10*stno] );
2169       }
2170    }
2171    x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2172 }
2173 
2174 
2175 /*---------------------------------------------------------------*/
2176 /*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
2177 /*---------------------------------------------------------------*/
2178 
2179 /* CALLED FROM GENERATED CODE */
2180 /* DIRTY HELPER (reads guest state, writes guest mem) */
2181 /* XSAVE component 0 is the x87 FPU state. */
2182 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2183         ( VexGuestAMD64State* gst, HWord addr )
2184 {
2185    /* Derived from values obtained from
2186       vendor_id       : AuthenticAMD
2187       cpu family      : 15
2188       model           : 12
2189       model name      : AMD Athlon(tm) 64 Processor 3200+
2190       stepping        : 0
2191       cpu MHz         : 2200.000
2192       cache size      : 512 KB
2193    */
2194    /* Somewhat roundabout, but at least it's simple. */
2195    Fpu_State tmp;
2196    UShort*   addrS = (UShort*)addr;
2197    UChar*    addrC = (UChar*)addr;
2198    UShort    fp_tags;
2199    UInt      summary_tags;
2200    Int       r, stno;
2201    UShort    *srcS, *dstS;
2202 
2203    do_get_x87( gst, &tmp );
2204 
2205    /* Now build the proper fxsave x87 image from the fsave x87 image
2206       we just made. */
2207 
2208    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2209    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2210 
2211    /* set addrS[2] in an endian-independent way */
2212    summary_tags = 0;
2213    fp_tags = tmp.env[FP_ENV_TAG];
2214    for (r = 0; r < 8; r++) {
2215       if ( ((fp_tags >> (2*r)) & 3) != 3 )
2216          summary_tags |= (1 << r);
2217    }
2218    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
2219    addrC[5]  = 0; /* pad */
2220 
2221    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
2222       does not write this field. (?!) */
2223    addrS[3]  = 0; /* BOGUS */
2224 
2225    /* RIP (Last x87 instruction pointer).  From experimentation, the
2226       real CPU does not write this field. (?!) */
2227    addrS[4]  = 0; /* BOGUS */
2228    addrS[5]  = 0; /* BOGUS */
2229    addrS[6]  = 0; /* BOGUS */
2230    addrS[7]  = 0; /* BOGUS */
2231 
2232    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
2233       does not write this field. (?!) */
2234    addrS[8]  = 0; /* BOGUS */
2235    addrS[9]  = 0; /* BOGUS */
2236    addrS[10] = 0; /* BOGUS */
2237    addrS[11] = 0; /* BOGUS */
2238 
2239    /* addrS[13,12] are MXCSR -- not written */
2240    /* addrS[15,14] are MXCSR_MASK -- not written */
2241 
2242    /* Copy in the FP registers, in ST order. */
2243    for (stno = 0; stno < 8; stno++) {
2244       srcS = (UShort*)(&tmp.reg[10*stno]);
2245       dstS = (UShort*)(&addrS[16 + 8*stno]);
2246       dstS[0] = srcS[0];
2247       dstS[1] = srcS[1];
2248       dstS[2] = srcS[2];
2249       dstS[3] = srcS[3];
2250       dstS[4] = srcS[4];
2251       dstS[5] = 0;
2252       dstS[6] = 0;
2253       dstS[7] = 0;
2254    }
2255 }
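/* For reference, the byte layout produced above (the first 160 bytes of
   the FXSAVE/XSAVE image), as implied by the stores in this helper:
      bytes   0..1     FCW
      bytes   2..3     FSW
      byte    4        FTW summary byte (bit r set <=> FP reg r non-empty)
      byte    5        pad
      bytes   6..7     FOP               (left as zero)
      bytes   8..15    last FP insn ptr  (left as zero)
      bytes  16..23    last FP data ptr  (left as zero)
      bytes  24..31    MXCSR / MXCSR_MASK (written by COMPONENT_1 below)
      bytes  32..159   ST0..ST7 in stack order, 10 bytes each, padded to 16
*/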
2256 
2257 
2258 /* CALLED FROM GENERATED CODE */
2259 /* DIRTY HELPER (reads guest state, writes guest mem) */
2260 /* XSAVE component 1 is the SSE state. */
2261 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2262         ( VexGuestAMD64State* gst, HWord addr )
2263 {
2264    UShort* addrS = (UShort*)addr;
2265    UInt    mxcsr;
2266 
2267    /* The only non-register parts of the SSE state are MXCSR and
2268       MXCSR_MASK. */
2269    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2270 
2271    addrS[12] = toUShort(mxcsr);  /* MXCSR */
2272    addrS[13] = toUShort(mxcsr >> 16);
2273 
2274    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2275    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2276 }
2277 
2278 
2279 /* VISIBLE TO LIBVEX CLIENT */
2280 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2281    the result at the given address which represents a buffer of at
2282    least 416 bytes.
2283 
2284    This function is not called from generated code.  FXSAVE is dealt
2285    with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2286    functions above plus some in-line IR.  This function is merely a
2287    convenience function for VEX's users.
2288 */
2289 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2290                                 /*OUT*/HWord fp_state )
2291 {
2292    /* Do the x87 part */
2293    amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2294 
2295    /* And now the SSE part, except for the registers themselves. */
2296    amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2297 
2298    /* That's the first 160 bytes of the image done. */
2299    /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
2300       big-endian, these need to be byte-swapped. */
2301    U128 *xmm = (U128 *)(fp_state + 160);
2302    vassert(host_is_little_endian());
2303 
2304 #  define COPY_U128(_dst,_src)                       \
2305       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
2306            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
2307       while (0)
2308 
2309    COPY_U128( xmm[0],  gst->guest_YMM0 );
2310    COPY_U128( xmm[1],  gst->guest_YMM1 );
2311    COPY_U128( xmm[2],  gst->guest_YMM2 );
2312    COPY_U128( xmm[3],  gst->guest_YMM3 );
2313    COPY_U128( xmm[4],  gst->guest_YMM4 );
2314    COPY_U128( xmm[5],  gst->guest_YMM5 );
2315    COPY_U128( xmm[6],  gst->guest_YMM6 );
2316    COPY_U128( xmm[7],  gst->guest_YMM7 );
2317    COPY_U128( xmm[8],  gst->guest_YMM8 );
2318    COPY_U128( xmm[9],  gst->guest_YMM9 );
2319    COPY_U128( xmm[10], gst->guest_YMM10 );
2320    COPY_U128( xmm[11], gst->guest_YMM11 );
2321    COPY_U128( xmm[12], gst->guest_YMM12 );
2322    COPY_U128( xmm[13], gst->guest_YMM13 );
2323    COPY_U128( xmm[14], gst->guest_YMM14 );
2324    COPY_U128( xmm[15], gst->guest_YMM15 );
2325 #  undef COPY_U128
2326 }
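/* Sketch of client-side use (illustrative only; 'gst' stands for the
   caller's VexGuestAMD64State, and the 512-byte 16-aligned buffer size
   follows the architectural FXSAVE area definition rather than anything
   enforced here):

      UChar fpimage[512] __attribute__((aligned(16)));
      LibVEX_GuestAMD64_fxsave(&gst, (HWord)&fpimage[0]);
*/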
2327 
2328 
2329 /*---------------------------------------------------------------*/
2330 /*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
2331 /*---------------------------------------------------------------*/
2332 
2333 /* CALLED FROM GENERATED CODE */
2334 /* DIRTY HELPER (writes guest state, reads guest mem) */
2335 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2336              ( VexGuestAMD64State* gst, HWord addr )
2337 {
2338    Fpu_State tmp;
2339    UShort*   addrS   = (UShort*)addr;
2340    UChar*    addrC   = (UChar*)addr;
2341    UShort    fp_tags;
2342    Int       r, stno, i;
2343 
2344    /* Copy the x87 registers out of the image, into a temporary
2345       Fpu_State struct. */
2346    for (i = 0; i < 14; i++) tmp.env[i] = 0;
2347    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2348    /* fill in tmp.reg[0..7] */
2349    for (stno = 0; stno < 8; stno++) {
2350       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2351       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2352       dstS[0] = srcS[0];
2353       dstS[1] = srcS[1];
2354       dstS[2] = srcS[2];
2355       dstS[3] = srcS[3];
2356       dstS[4] = srcS[4];
2357    }
2358    /* fill in tmp.env[0..13] */
2359    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2360    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2361 
2362    fp_tags = 0;
2363    for (r = 0; r < 8; r++) {
2364       if (addrC[4] & (1<<r))
2365          fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
2366       else
2367          fp_tags |= (3 << (2*r)); /* EMPTY */
2368    }
2369    tmp.env[FP_ENV_TAG] = fp_tags;
2370 
2371    /* Now write 'tmp' into the guest state. */
2372    VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2373 
2374    return warnX87;
2375 }
2376 
2377 
2378 /* CALLED FROM GENERATED CODE */
2379 /* DIRTY HELPER (writes guest state, reads guest mem) */
2380 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2381              ( VexGuestAMD64State* gst, HWord addr )
2382 {
2383    UShort* addrS = (UShort*)addr;
2384    UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
2385                    | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2386    ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
2387 
2388    VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2389 
2390    gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2391    return warnXMM;
2392 }
2393 
2394 
2395 /* VISIBLE TO LIBVEX CLIENT */
2396 /* Do FXRSTOR from the supplied address and store read values to the given
2397    VexGuestAMD64State structure.
2398 
2399    This function is not called from generated code.  FXRSTOR is dealt
2400    with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2401    functions above plus some in-line IR.  This function is merely a
2402    convenience function for VEX's users.
2403 */
2404 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2405                                       /*MOD*/VexGuestAMD64State* gst )
2406 {
2407    /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
2408       to be byte-swapped. */
2409    U128 *xmm = (U128 *)(fp_state + 160);
2410 
2411    vassert(host_is_little_endian());
2412 
2413 #  define COPY_U128(_dst,_src)                       \
2414       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
2415            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
2416       while (0)
2417 
2418    COPY_U128( gst->guest_YMM0, xmm[0] );
2419    COPY_U128( gst->guest_YMM1, xmm[1] );
2420    COPY_U128( gst->guest_YMM2, xmm[2] );
2421    COPY_U128( gst->guest_YMM3, xmm[3] );
2422    COPY_U128( gst->guest_YMM4, xmm[4] );
2423    COPY_U128( gst->guest_YMM5, xmm[5] );
2424    COPY_U128( gst->guest_YMM6, xmm[6] );
2425    COPY_U128( gst->guest_YMM7, xmm[7] );
2426    COPY_U128( gst->guest_YMM8, xmm[8] );
2427    COPY_U128( gst->guest_YMM9, xmm[9] );
2428    COPY_U128( gst->guest_YMM10, xmm[10] );
2429    COPY_U128( gst->guest_YMM11, xmm[11] );
2430    COPY_U128( gst->guest_YMM12, xmm[12] );
2431    COPY_U128( gst->guest_YMM13, xmm[13] );
2432    COPY_U128( gst->guest_YMM14, xmm[14] );
2433    COPY_U128( gst->guest_YMM15, xmm[15] );
2434 
2435 #  undef COPY_U128
2436 
2437    VexEmNote warnXMM
2438       = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2439    VexEmNote warnX87
2440       = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2441 
2442    /* Prefer an X87 emwarn over an XMM one, if both exist. */
2443    if (warnX87 != EmNote_NONE)
2444       return warnX87;
2445    else
2446       return warnXMM;
2447 }
2448 
2449 
2450 /*---------------------------------------------------------------*/
2451 /*--- Supporting functions for FSAVE/FRSTOR                   ---*/
2452 /*---------------------------------------------------------------*/
2453 
2454 /* DIRTY HELPER (writes guest state) */
2455 /* Initialise the x87 FPU state as per 'finit'. */
2456 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2457 {
2458    Int i;
2459    gst->guest_FTOP = 0;
2460    for (i = 0; i < 8; i++) {
2461       gst->guest_FPTAG[i] = 0; /* empty */
2462       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2463    }
2464    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2465    gst->guest_FC3210  = 0;
2466 }
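/* For comparison, the architectural state 'finit' establishes is
   FCW = 0x037F (all exceptions masked, round-to-nearest, 64-bit
   precision), FSW = 0 and all eight tags empty -- which is what the
   FTOP/FPTAG/FPREG/FPROUND/FC3210 assignments above encode in VEX's
   decomposed representation. */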
2467 
2468 
2469 /* CALLED FROM GENERATED CODE */
2470 /* DIRTY HELPER (reads guest memory) */
2471 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2472 {
2473    ULong f64;
2474    convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2475    return f64;
2476 }
2477 
2478 /* CALLED FROM GENERATED CODE */
2479 /* DIRTY HELPER (writes guest memory) */
2480 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2481 {
2482    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2483 }
2484 
2485 
2486 /* CALLED FROM GENERATED CODE */
2487 /* CLEAN HELPER */
2488 /* mxcsr[15:0] contains a SSE native format MXCSR value.
2489    Extract from it the required SSEROUND value and any resulting
2490    emulation warning, and return (warn << 32) | sseround value.
2491 */
2492 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2493 {
2494    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
2495    /* NOTE, encoded exactly as per enum IRRoundingMode. */
2496    ULong rmode = (mxcsr >> 13) & 3;
2497 
2498    /* Detect any required emulation warnings. */
2499    VexEmNote ew = EmNote_NONE;
2500 
2501    if ((mxcsr & 0x1F80) != 0x1F80) {
2502       /* unmasked exceptions! */
2503       ew = EmWarn_X86_sseExns;
2504    }
2505    else
2506    if (mxcsr & (1<<15)) {
2507       /* FZ is set */
2508       ew = EmWarn_X86_fz;
2509    }
2510    else
2511    if (mxcsr & (1<<6)) {
2512       /* DAZ is set */
2513       ew = EmWarn_X86_daz;
2514    }
2515 
2516    return (((ULong)ew) << 32) | ((ULong)rmode);
2517 }
2518 
2519 
2520 /* CALLED FROM GENERATED CODE */
2521 /* CLEAN HELPER */
2522 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2523    native format MXCSR value. */
2524 ULong amd64g_create_mxcsr ( ULong sseround )
2525 {
2526    sseround &= 3;
2527    return 0x1F80 | (sseround << 13);
2528 }
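/* Worked example: amd64g_create_mxcsr(Irrm_NEAREST == 0) yields 0x1F80,
   i.e. all exception bits masked and a zero RC field.  Feeding that back
   into amd64g_check_ldmxcsr gives rmode == (0x1F80 >> 13) & 3 == 0 and
   no emulation warning, so the two helpers round-trip for each of the
   four IRRoundingMode values. */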
2529 
2530 
2531 /* CLEAN HELPER */
2532 /* fpucw[15:0] contains a x87 native format FPU control word.
2533    Extract from it the required FPROUND value and any resulting
2534    emulation warning, and return (warn << 32) | fpround value.
2535 */
2536 ULong amd64g_check_fldcw ( ULong fpucw )
2537 {
2538    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
2539    /* NOTE, encoded exactly as per enum IRRoundingMode. */
2540    ULong rmode = (fpucw >> 10) & 3;
2541 
2542    /* Detect any required emulation warnings. */
2543    VexEmNote ew = EmNote_NONE;
2544 
2545    if ((fpucw & 0x3F) != 0x3F) {
2546       /* unmasked exceptions! */
2547       ew = EmWarn_X86_x87exns;
2548    }
2549    else
2550    if (((fpucw >> 8) & 3) != 3) {
2551       /* unsupported precision */
2552       ew = EmWarn_X86_x87precision;
2553    }
2554 
2555    return (((ULong)ew) << 32) | ((ULong)rmode);
2556 }
2557 
2558 
2559 /* CLEAN HELPER */
2560 /* Given fpround as an IRRoundingMode value, create a suitable x87
2561    native format FPU control word. */
2562 ULong amd64g_create_fpucw ( ULong fpround )
2563 {
2564    fpround &= 3;
2565    return 0x037F | (fpround << 10);
2566 }
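/* Worked example: amd64g_create_fpucw(Irrm_NEAREST == 0) yields 0x037F.
   Passed back through amd64g_check_fldcw, the exception mask bits (0x3F)
   are all set and the precision control field is 3 (64-bit precision),
   so no emulation warning arises and rmode == (0x037F >> 10) & 3 == 0 --
   the pair round-trips, just like the MXCSR helpers above. */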
2567 
2568 
2569 /* This is used to implement 'fldenv'.
2570    Reads 28 bytes at x87_state[0 .. 27]. */
2571 /* CALLED FROM GENERATED CODE */
2572 /* DIRTY HELPER */
2573 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2574                                       /*IN*/HWord x87_state)
2575 {
2576    return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2577 }
2578 
2579 
2580 /* CALLED FROM GENERATED CODE */
2581 /* DIRTY HELPER */
2582 /* Create an x87 FPU env from the guest state, as close as we can
2583    approximate it.  Writes 28 bytes at x87_state[0..27]. */
2584 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2585                                  /*OUT*/HWord x87_state )
2586 {
2587    Int        i, stno, preg;
2588    UInt       tagw;
2589    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2590    Fpu_State* x87     = (Fpu_State*)x87_state;
2591    UInt       ftop    = vex_state->guest_FTOP;
2592    ULong      c3210   = vex_state->guest_FC3210;
2593 
2594    for (i = 0; i < 14; i++)
2595       x87->env[i] = 0;
2596 
2597    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2598    x87->env[FP_ENV_STAT]
2599       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2600    x87->env[FP_ENV_CTRL]
2601       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2602 
2603    /* Compute the x87 tag word. */
2604    tagw = 0;
2605    for (stno = 0; stno < 8; stno++) {
2606       preg = (stno + ftop) & 7;
2607       if (vexTags[preg] == 0) {
2608          /* register is empty */
2609          tagw |= (3 << (2*preg));
2610       } else {
2611          /* register is full. */
2612          tagw |= (0 << (2*preg));
2613       }
2614    }
2615    x87->env[FP_ENV_TAG] = toUShort(tagw);
2616 
2617    /* We don't dump the x87 registers, tho. */
2618 }
2619 
2620 
2621 /* This is used to implement 'fnsave'.
2622    Writes 108 bytes at x87_state[0 .. 107]. */
2623 /* CALLED FROM GENERATED CODE */
2624 /* DIRTY HELPER */
2625 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2626                                  /*OUT*/HWord x87_state)
2627 {
2628    do_get_x87( vex_state, (Fpu_State*)x87_state );
2629 }
2630 
2631 
2632 /* This is used to implement 'fnsaves'.
2633    Writes 94 bytes at x87_state[0 .. 93]. */
2634 /* CALLED FROM GENERATED CODE */
2635 /* DIRTY HELPER */
2636 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2637                                   /*OUT*/HWord x87_state)
2638 {
2639    Int           i, stno, preg;
2640    UInt          tagw;
2641    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2642    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2643    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2644    UInt          ftop    = vex_state->guest_FTOP;
2645    UInt          c3210   = vex_state->guest_FC3210;
2646 
2647    for (i = 0; i < 7; i++)
2648       x87->env[i] = 0;
2649 
2650    x87->env[FPS_ENV_STAT]
2651       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2652    x87->env[FPS_ENV_CTRL]
2653       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2654 
2655    /* Dump the register stack in ST order. */
2656    tagw = 0;
2657    for (stno = 0; stno < 8; stno++) {
2658       preg = (stno + ftop) & 7;
2659       if (vexTags[preg] == 0) {
2660          /* register is empty */
2661          tagw |= (3 << (2*preg));
2662          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2663                                  &x87->reg[10*stno] );
2664       } else {
2665          /* register is full. */
2666          tagw |= (0 << (2*preg));
2667          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2668                                  &x87->reg[10*stno] );
2669       }
2670    }
2671    x87->env[FPS_ENV_TAG] = toUShort(tagw);
2672 }
2673 
2674 
2675 /* This is used to implement 'frstor'.
2676    Reads 108 bytes at x87_state[0 .. 107]. */
2677 /* CALLED FROM GENERATED CODE */
2678 /* DIRTY HELPER */
2679 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2680                                       /*IN*/HWord x87_state)
2681 {
2682    return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2683 }
2684 
2685 
2686 /* This is used to implement 'frstors'.
2687    Reads 94 bytes at x87_state[0 .. 93]. */
2688 /* CALLED FROM GENERATED CODE */
2689 /* DIRTY HELPER */
2690 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2691                                        /*IN*/HWord x87_state)
2692 {
2693    Int           stno, preg;
2694    UInt          tag;
2695    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2696    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2697    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2698    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2699    UInt          tagw    = x87->env[FPS_ENV_TAG];
2700    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
2701    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
2702    VexEmNote     ew;
2703    UInt          fpround;
2704    ULong         pair;
2705 
2706    /* Copy registers and tags */
2707    for (stno = 0; stno < 8; stno++) {
2708       preg = (stno + ftop) & 7;
2709       tag = (tagw >> (2*preg)) & 3;
2710       if (tag == 3) {
2711          /* register is empty */
2712          /* hmm, if it's empty, does it still get written?  Probably
2713             safer to say it does.  If we don't, memcheck could get out
2714             of sync, in that it thinks all FP registers are defined by
2715             this helper, but in reality some have not been updated. */
2716          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2717          vexTags[preg] = 0;
2718       } else {
2719          /* register is non-empty */
2720          convert_f80le_to_f64le( &x87->reg[10*stno],
2721                                  (UChar*)&vexRegs[preg] );
2722          vexTags[preg] = 1;
2723       }
2724    }
2725 
2726    /* stack pointer */
2727    vex_state->guest_FTOP = ftop;
2728 
2729    /* status word */
2730    vex_state->guest_FC3210 = c3210;
2731 
2732    /* handle the control word, setting FPROUND and detecting any
2733       emulation warnings. */
2734    pair    = amd64g_check_fldcw ( (ULong)fpucw );
2735    fpround = (UInt)pair & 0xFFFFFFFFULL;
2736    ew      = (VexEmNote)(pair >> 32);
2737 
2738    vex_state->guest_FPROUND = fpround & 3;
2739 
2740    /* emulation warnings --> caller */
2741    return ew;
2742 }
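
/* Illustrative sketch: splitting a 16-bit x87 status word into the
   FTOP and C3210 pieces exactly as the loads above do (FTOP lives in
   bits 13..11, the condition codes under mask 0x4700).  Hypothetical
   standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
   uint16_t stat  = 0x4F00;              /* hypothetical status word */
   uint32_t ftop  = (stat >> 11) & 7;    /* top-of-stack pointer: 1 */
   uint32_t c3210 = stat & 0x4700;       /* condition bits: 0x4700 */
   printf("ftop=%u c3210=0x%04x\n", (unsigned)ftop, (unsigned)c3210);
   return 0;
}
#endif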
2743 
2744 
2745 /*---------------------------------------------------------------*/
2746 /*--- CPUID helpers.                                          ---*/
2747 /*---------------------------------------------------------------*/
2748 
2749 /* Claim to be the following CPU, which is probably representative of
2750    the lowliest (earliest) amd64 offerings.  It can do neither sse3
2751    nor cx16.
2752 
2753    vendor_id       : AuthenticAMD
2754    cpu family      : 15
2755    model           : 5
2756    model name      : AMD Opteron (tm) Processor 848
2757    stepping        : 10
2758    cpu MHz         : 1797.682
2759    cache size      : 1024 KB
2760    fpu             : yes
2761    fpu_exception   : yes
2762    cpuid level     : 1
2763    wp              : yes
2764    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2765                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
2766                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2767    bogomips        : 3600.62
2768    TLB size        : 1088 4K pages
2769    clflush size    : 64
2770    cache_alignment : 64
2771    address sizes   : 40 bits physical, 48 bits virtual
2772    power management: ts fid vid ttp
2773 
2774    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2775    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
2776    and 3dnowext is 80000001.EDX.30.
2777 */
2778 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2779 {
2780 #  define SET_ABCD(_a,_b,_c,_d)                \
2781       do { st->guest_RAX = (ULong)(_a);        \
2782            st->guest_RBX = (ULong)(_b);        \
2783            st->guest_RCX = (ULong)(_c);        \
2784            st->guest_RDX = (ULong)(_d);        \
2785       } while (0)
2786 
2787    switch (0xFFFFFFFF & st->guest_RAX) {
2788       case 0x00000000:
2789          SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2790          break;
2791       case 0x00000001:
2792          SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2793          break;
2794       case 0x80000000:
2795          SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2796          break;
2797       case 0x80000001:
2798          /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
2799             the original it-is-supported value that the h/w provides.
2800             See #291568. */
2801          SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2802                                                       0x21d3fbff);
2803          break;
2804       case 0x80000002:
2805          SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2806          break;
2807       case 0x80000003:
2808          SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2809          break;
2810       case 0x80000004:
2811          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2812          break;
2813       case 0x80000005:
2814          SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2815          break;
2816       case 0x80000006:
2817          SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2818          break;
2819       case 0x80000007:
2820          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2821          break;
2822       case 0x80000008:
2823          SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2824          break;
2825       default:
2826          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2827          break;
2828    }
2829 #  undef SET_ABCD
2830 }
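
/* Illustrative sketch of how a guest decodes the leaf-0 values
   returned above: the vendor string is formed from the little-endian
   bytes of EBX, EDX and ECX in that order, so 0x68747541, 0x69746e65
   and 0x444d4163 spell "AuthenticAMD".  Hypothetical standalone
   example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main ( void )
{
   uint32_t ebx = 0x68747541, edx = 0x69746e65, ecx = 0x444d4163;
   char vendor[13];
   memcpy(vendor + 0, &ebx, 4);
   memcpy(vendor + 4, &edx, 4);
   memcpy(vendor + 8, &ecx, 4);
   vendor[12] = 0;
   printf("%s\n", vendor);  /* "AuthenticAMD" on a little-endian host */
   return 0;
}
#endif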
2831 
2832 
2833 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2834    capable.
2835 
2836    vendor_id       : GenuineIntel
2837    cpu family      : 6
2838    model           : 15
2839    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2840    stepping        : 6
2841    cpu MHz         : 2394.000
2842    cache size      : 4096 KB
2843    physical id     : 0
2844    siblings        : 2
2845    core id         : 0
2846    cpu cores       : 2
2847    fpu             : yes
2848    fpu_exception   : yes
2849    cpuid level     : 10
2850    wp              : yes
2851    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2852                      mtrr pge mca cmov pat pse36 clflush dts acpi
2853                      mmx fxsr sse sse2 ss ht tm syscall nx lm
2854                      constant_tsc pni monitor ds_cpl vmx est tm2
2855                      cx16 xtpr lahf_lm
2856    bogomips        : 4798.78
2857    clflush size    : 64
2858    cache_alignment : 64
2859    address sizes   : 36 bits physical, 48 bits virtual
2860    power management:
2861 */
2862 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2863 {
2864 #  define SET_ABCD(_a,_b,_c,_d)                \
2865       do { st->guest_RAX = (ULong)(_a);        \
2866            st->guest_RBX = (ULong)(_b);        \
2867            st->guest_RCX = (ULong)(_c);        \
2868            st->guest_RDX = (ULong)(_d);        \
2869       } while (0)
2870 
2871    switch (0xFFFFFFFF & st->guest_RAX) {
2872       case 0x00000000:
2873          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2874          break;
2875       case 0x00000001:
2876          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2877          break;
2878       case 0x00000002:
2879          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2880          break;
2881       case 0x00000003:
2882          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2883          break;
2884       case 0x00000004: {
2885          switch (0xFFFFFFFF & st->guest_RCX) {
2886             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2887                                       0x0000003f, 0x00000001); break;
2888             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2889                                       0x0000003f, 0x00000001); break;
2890             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2891                                       0x00000fff, 0x00000001); break;
2892             default:         SET_ABCD(0x00000000, 0x00000000,
2893                                       0x00000000, 0x00000000); break;
2894          }
2895          break;
2896       }
2897       case 0x00000005:
2898          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2899          break;
2900       case 0x00000006:
2901          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2902          break;
2903       case 0x00000007:
2904          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2905          break;
2906       case 0x00000008:
2907          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2908          break;
2909       case 0x00000009:
2910          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2911          break;
2912       case 0x0000000a:
2913       unhandled_eax_value:
2914          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2915          break;
2916       case 0x80000000:
2917          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2918          break;
2919       case 0x80000001:
2920          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2921          break;
2922       case 0x80000002:
2923          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2924          break;
2925       case 0x80000003:
2926          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2927          break;
2928       case 0x80000004:
2929          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2930          break;
2931       case 0x80000005:
2932          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2933          break;
2934       case 0x80000006:
2935          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2936          break;
2937       case 0x80000007:
2938          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2939          break;
2940       case 0x80000008:
2941          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2942          break;
2943       default:
2944          goto unhandled_eax_value;
2945    }
2946 #  undef SET_ABCD
2947 }
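
/* Illustrative sketch: the two capabilities this personality is named
   after are reported in leaf 1 ECX -- bit 0 (pni/sse3) and bit 13
   (cx16) -- and the value 0x0000e3bd returned above has both set.
   Hypothetical standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
   uint32_t leaf1_ecx = 0x0000e3bdu;          /* value claimed above */
   printf("sse3=%d cx16=%d\n",
          (int)((leaf1_ecx >> 0)  & 1),       /* 1 */
          (int)((leaf1_ecx >> 13) & 1));      /* 1 */
   return 0;
}
#endif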
2948 
2949 
2950 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2951    capable.
2952 
2953    vendor_id       : GenuineIntel
2954    cpu family      : 6
2955    model           : 37
2956    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2957    stepping        : 2
2958    cpu MHz         : 3334.000
2959    cache size      : 4096 KB
2960    physical id     : 0
2961    siblings        : 4
2962    core id         : 0
2963    cpu cores       : 2
2964    apicid          : 0
2965    initial apicid  : 0
2966    fpu             : yes
2967    fpu_exception   : yes
2968    cpuid level     : 11
2969    wp              : yes
2970    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2971                      mtrr pge mca cmov pat pse36 clflush dts acpi
2972                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2973                      lm constant_tsc arch_perfmon pebs bts rep_good
2974                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
2975                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2976                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2977                      arat tpr_shadow vnmi flexpriority ept vpid
2978    bogomips        : 6957.57
2979    clflush size    : 64
2980    cache_alignment : 64
2981    address sizes   : 36 bits physical, 48 bits virtual
2982    power management:
2983 */
2984 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2985 {
2986 #  define SET_ABCD(_a,_b,_c,_d)                \
2987       do { st->guest_RAX = (ULong)(_a);        \
2988            st->guest_RBX = (ULong)(_b);        \
2989            st->guest_RCX = (ULong)(_c);        \
2990            st->guest_RDX = (ULong)(_d);        \
2991       } while (0)
2992 
2993    UInt old_eax = (UInt)st->guest_RAX;
2994    UInt old_ecx = (UInt)st->guest_RCX;
2995 
2996    switch (old_eax) {
2997       case 0x00000000:
2998          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2999          break;
3000       case 0x00000001:
3001          SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
3002          break;
3003       case 0x00000002:
3004          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
3005          break;
3006       case 0x00000003:
3007          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3008          break;
3009       case 0x00000004:
3010          switch (old_ecx) {
3011             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3012                                       0x0000003f, 0x00000000); break;
3013             case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
3014                                       0x0000007f, 0x00000000); break;
3015             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3016                                       0x000001ff, 0x00000000); break;
3017             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3018                                       0x00000fff, 0x00000002); break;
3019             default:         SET_ABCD(0x00000000, 0x00000000,
3020                                       0x00000000, 0x00000000); break;
3021          }
3022          break;
3023       case 0x00000005:
3024          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3025          break;
3026       case 0x00000006:
3027          SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
3028          break;
3029       case 0x00000007:
3030          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3031          break;
3032       case 0x00000008:
3033          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3034          break;
3035       case 0x00000009:
3036          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3037          break;
3038       case 0x0000000a:
3039          SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
3040          break;
3041       case 0x0000000b:
3042          switch (old_ecx) {
3043             case 0x00000000:
3044                SET_ABCD(0x00000001, 0x00000002,
3045                         0x00000100, 0x00000000); break;
3046             case 0x00000001:
3047                SET_ABCD(0x00000004, 0x00000004,
3048                         0x00000201, 0x00000000); break;
3049             default:
3050                SET_ABCD(0x00000000, 0x00000000,
3051                         old_ecx,    0x00000000); break;
3052          }
3053          break;
3054       case 0x0000000c:
3055          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3056          break;
3057       case 0x0000000d:
3058          switch (old_ecx) {
3059             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3060                                       0x00000100, 0x00000000); break;
3061             case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3062                                       0x00000201, 0x00000000); break;
3063             default:         SET_ABCD(0x00000000, 0x00000000,
3064                                       old_ecx,    0x00000000); break;
3065          }
3066          break;
3067       case 0x80000000:
3068          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3069          break;
3070       case 0x80000001:
3071          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3072          break;
3073       case 0x80000002:
3074          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3075          break;
3076       case 0x80000003:
3077          SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3078          break;
3079       case 0x80000004:
3080          SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3081          break;
3082       case 0x80000005:
3083          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3084          break;
3085       case 0x80000006:
3086          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3087          break;
3088       case 0x80000007:
3089          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3090          break;
3091       case 0x80000008:
3092          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3093          break;
3094       default:
3095          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3096          break;
3097    }
3098 #  undef SET_ABCD
3099 }
3100 
3101 
3102 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3103    capable.  Plus (kludge!) it "supports" HTM.
3104 
3105    Also with the following change: claim that XSaveOpt is not
3106    available, by having cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1
3107    on the real CPU.  Consequently, programs that correctly observe
3108    these CPUID values should only try to use 3 of the 8 XSave-family
3109    instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
3110    having to implement the compacted or optimised save/restore
3111    variants.
3112 
3113    vendor_id       : GenuineIntel
3114    cpu family      : 6
3115    model           : 42
3116    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3117    stepping        : 7
3118    cpu MHz         : 1600.000
3119    cache size      : 6144 KB
3120    physical id     : 0
3121    siblings        : 4
3122    core id         : 3
3123    cpu cores       : 4
3124    apicid          : 6
3125    initial apicid  : 6
3126    fpu             : yes
3127    fpu_exception   : yes
3128    cpuid level     : 13
3129    wp              : yes
3130    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
3131                      mtrr pge mca cmov pat pse36 clflush dts acpi
3132                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3133                      lm constant_tsc arch_perfmon pebs bts rep_good
3134                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3135                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3136                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3137                      lahf_lm ida arat epb xsaveopt pln pts dts
3138                      tpr_shadow vnmi flexpriority ept vpid
3139 
3140    bogomips        : 5768.94
3141    clflush size    : 64
3142    cache_alignment : 64
3143    address sizes   : 36 bits physical, 48 bits virtual
3144    power management:
3145 */
3146 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
3147 {
3148 #  define SET_ABCD(_a,_b,_c,_d)                \
3149       do { st->guest_RAX = (ULong)(_a);        \
3150            st->guest_RBX = (ULong)(_b);        \
3151            st->guest_RCX = (ULong)(_c);        \
3152            st->guest_RDX = (ULong)(_d);        \
3153       } while (0)
3154 
3155    UInt old_eax = (UInt)st->guest_RAX;
3156    UInt old_ecx = (UInt)st->guest_RCX;
3157 
3158    switch (old_eax) {
3159       case 0x00000000:
3160          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3161          break;
3162       case 0x00000001:
3163          SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3164          break;
3165       case 0x00000002:
3166          SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3167          break;
3168       case 0x00000003:
3169          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3170          break;
3171       case 0x00000004:
3172          switch (old_ecx) {
3173             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3174                                       0x0000003f, 0x00000000); break;
3175             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3176                                       0x0000003f, 0x00000000); break;
3177             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3178                                       0x000001ff, 0x00000000); break;
3179             case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3180                                       0x00001fff, 0x00000006); break;
3181             default:         SET_ABCD(0x00000000, 0x00000000,
3182                                       0x00000000, 0x00000000); break;
3183          }
3184          break;
3185       case 0x00000005:
3186          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3187          break;
3188       case 0x00000006:
3189          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3190          break;
3191       case 0x00000007:
3192          SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3193          break;
3194       case 0x00000008:
3195          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3196          break;
3197       case 0x00000009:
3198          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3199          break;
3200       case 0x0000000a:
3201          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3202          break;
3203       case 0x0000000b:
3204          switch (old_ecx) {
3205             case 0x00000000:
3206                SET_ABCD(0x00000001, 0x00000001,
3207                         0x00000100, 0x00000000); break;
3208             case 0x00000001:
3209                SET_ABCD(0x00000004, 0x00000004,
3210                         0x00000201, 0x00000000); break;
3211             default:
3212                SET_ABCD(0x00000000, 0x00000000,
3213                         old_ecx,    0x00000000); break;
3214          }
3215          break;
3216       case 0x0000000c:
3217          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3218          break;
3219       case 0x0000000d:
3220          switch (old_ecx) {
3221             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3222                                       0x00000340, 0x00000000); break;
3223             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3224                                       0x00000000, 0x00000000); break;
3225             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3226                                       0x00000000, 0x00000000); break;
3227             default:         SET_ABCD(0x00000000, 0x00000000,
3228                                       0x00000000, 0x00000000); break;
3229          }
3230          break;
3231       case 0x0000000e:
3232          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3233          break;
3234       case 0x0000000f:
3235          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3236          break;
3237       case 0x80000000:
3238          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3239          break;
3240       case 0x80000001:
3241          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3242          break;
3243       case 0x80000002:
3244          SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3245          break;
3246       case 0x80000003:
3247          SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3248          break;
3249       case 0x80000004:
3250          SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3251          break;
3252       case 0x80000005:
3253          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3254          break;
3255       case 0x80000006:
3256          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3257          break;
3258       case 0x80000007:
3259          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3260          break;
3261       case 0x80000008:
3262          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3263          break;
3264       default:
3265          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3266          break;
3267    }
3268 #  undef SET_ABCD
3269 }
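
/* Illustrative sketch of the probe described in the comment above: a
   guest that wants XSaveOpt must check CPUID leaf 0xD, sub-leaf 1,
   EAX bit 0.  With the values returned above that bit is 0, so a
   well-behaved guest falls back to plain XSAVE/XRSTOR.  Hypothetical
   standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
#  if defined(__x86_64__)
   uint32_t a, b, c, d;
   __asm__ __volatile__("cpuid"
                        : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                        : "a" (0xD), "c" (1));
   if (a & 1)
      printf("XSAVEOPT available\n");
   else
      printf("XSAVEOPT not available; use plain XSAVE/XRSTOR\n");
#  endif
   return 0;
}
#endif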
3270 
3271 
3272 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3273 
3274    With the following change: claim that XSaveOpt is not available, by
3275    having cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1 on the real
3276    CPU.  Consequently, programs that correctly observe these CPUID
3277    values should only try to use 3 of the 8 XSave-family instructions:
3278    XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
3279    implement the compacted or optimised save/restore variants.
3280 
3281    vendor_id       : GenuineIntel
3282    cpu family      : 6
3283    model           : 60
3284    model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3285    stepping        : 3
3286    microcode       : 0x1c
3287    cpu MHz         : 919.957
3288    cache size      : 8192 KB
3289    physical id     : 0
3290    siblings        : 4
3291    core id         : 3
3292    cpu cores       : 4
3293    apicid          : 6
3294    initial apicid  : 6
3295    fpu             : yes
3296    fpu_exception   : yes
3297    cpuid level     : 13
3298    wp              : yes
3299    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3300                      cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3301                      tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3302                      arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3303                      aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3304                      vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3305                      sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3306                      avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3307                      tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3308                      bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3309    bugs            :
3310    bogomips        : 5786.68
3311    clflush size    : 64
3312    cache_alignment : 64
3313    address sizes   : 39 bits physical, 48 bits virtual
3314    power management:
3315 */
3316 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
3317 {
3318 #  define SET_ABCD(_a,_b,_c,_d)                \
3319       do { st->guest_RAX = (ULong)(_a);        \
3320            st->guest_RBX = (ULong)(_b);        \
3321            st->guest_RCX = (ULong)(_c);        \
3322            st->guest_RDX = (ULong)(_d);        \
3323       } while (0)
3324 
3325    UInt old_eax = (UInt)st->guest_RAX;
3326    UInt old_ecx = (UInt)st->guest_RCX;
3327 
3328    switch (old_eax) {
3329       case 0x00000000:
3330          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3331          break;
3332       case 0x00000001:
3333          /* Don't advertise RDRAND support, bit 30 in ECX.  */
3334          SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3335          break;
3336       case 0x00000002:
3337          SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3338          break;
3339       case 0x00000003:
3340          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3341          break;
3342       case 0x00000004:
3343          switch (old_ecx) {
3344             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3345                                       0x0000003f, 0x00000000); break;
3346             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3347                                       0x0000003f, 0x00000000); break;
3348             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3349                                       0x000001ff, 0x00000000); break;
3350             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3351                                       0x00001fff, 0x00000006); break;
3352             default:         SET_ABCD(0x00000000, 0x00000000,
3353                                       0x00000000, 0x00000000); break;
3354          }
3355          break;
3356       case 0x00000005:
3357          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3358          break;
3359       case 0x00000006:
3360          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3361          break;
3362       case 0x00000007:
3363          switch (old_ecx) {
3364             case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3365                                       0x00000000, 0x00000000); break;
3366             default:         SET_ABCD(0x00000000, 0x00000000,
3367                                       0x00000000, 0x00000000); break;
3368          }
3369          break;
3370       case 0x00000008:
3371          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3372          break;
3373       case 0x00000009:
3374          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3375          break;
3376       case 0x0000000a:
3377          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3378          break;
3379       case 0x0000000b:
3380          switch (old_ecx) {
3381             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3382                                       0x00000100, 0x00000002); break;
3383             case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3384                                       0x00000201, 0x00000002); break;
3385             default:         SET_ABCD(0x00000000, 0x00000000,
3386                                       old_ecx,    0x00000002); break;
3387          }
3388          break;
3389       case 0x0000000c:
3390          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3391          break;
3392       case 0x0000000d:
3393          switch (old_ecx) {
3394             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3395                                       0x00000340, 0x00000000); break;
3396             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3397                                       0x00000000, 0x00000000); break;
3398             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3399                                       0x00000000, 0x00000000); break;
3400             default:         SET_ABCD(0x00000000, 0x00000000,
3401                                       0x00000000, 0x00000000); break;
3402          }
3403          break;
3404       case 0x80000000:
3405          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3406          break;
3407       case 0x80000001:
3408          SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3409          break;
3410       case 0x80000002:
3411          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3412          break;
3413       case 0x80000003:
3414          SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3415          break;
3416       case 0x80000004:
3417          SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3418          break;
3419       case 0x80000005:
3420          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3421          break;
3422       case 0x80000006:
3423          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3424          break;
3425       case 0x80000007:
3426          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3427          break;
3428       case 0x80000008:
3429          SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3430          break;
3431       default:
3432          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3433          break;
3434    }
3435 #  undef SET_ABCD
3436 }
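
/* Illustrative sketch: guests detect AVX2 from CPUID leaf 7
   (sub-leaf 0) EBX bit 5; bits 3 and 8 are BMI1 and BMI2.  The value
   0x000027ab returned above has all three set.  Hypothetical
   standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
   uint32_t leaf7_ebx = 0x000027abu;          /* value claimed above */
   printf("bmi1=%d avx2=%d bmi2=%d\n",
          (int)((leaf7_ebx >> 3) & 1),
          (int)((leaf7_ebx >> 5) & 1),
          (int)((leaf7_ebx >> 8) & 1));
   return 0;
}
#endif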
3437 
3438 
3439 /*---------------------------------------------------------------*/
3440 /*--- Misc integer helpers, including rotates and crypto.     ---*/
3441 /*---------------------------------------------------------------*/
3442 
3443 ULong amd64g_calculate_RCR ( ULong arg,
3444                              ULong rot_amt,
3445                              ULong rflags_in,
3446                              Long  szIN )
3447 {
3448    Bool  wantRflags = toBool(szIN < 0);
3449    ULong sz         = wantRflags ? (-szIN) : szIN;
3450    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3451    ULong cf=0, of=0, tempcf;
3452 
3453    switch (sz) {
3454       case 8:
3455          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3456          of        = ((arg >> 63) ^ cf) & 1;
3457          while (tempCOUNT > 0) {
3458             tempcf = arg & 1;
3459             arg    = (arg >> 1) | (cf << 63);
3460             cf     = tempcf;
3461             tempCOUNT--;
3462          }
3463          break;
3464       case 4:
3465          while (tempCOUNT >= 33) tempCOUNT -= 33;
3466          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3467          of        = ((arg >> 31) ^ cf) & 1;
3468          while (tempCOUNT > 0) {
3469             tempcf = arg & 1;
3470             arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3471             cf     = tempcf;
3472             tempCOUNT--;
3473          }
3474          break;
3475       case 2:
3476          while (tempCOUNT >= 17) tempCOUNT -= 17;
3477          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3478          of        = ((arg >> 15) ^ cf) & 1;
3479          while (tempCOUNT > 0) {
3480             tempcf = arg & 1;
3481             arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3482             cf     = tempcf;
3483             tempCOUNT--;
3484          }
3485          break;
3486       case 1:
3487          while (tempCOUNT >= 9) tempCOUNT -= 9;
3488          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3489          of        = ((arg >> 7) ^ cf) & 1;
3490          while (tempCOUNT > 0) {
3491             tempcf = arg & 1;
3492             arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
3493             cf     = tempcf;
3494             tempCOUNT--;
3495          }
3496          break;
3497       default:
3498          vpanic("calculate_RCR(amd64g): invalid size");
3499    }
3500 
3501    cf &= 1;
3502    of &= 1;
3503    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3504    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3505 
3506    /* caller can ask to have back either the resulting flags or
3507       resulting value, but not both */
3508    return wantRflags ? rflags_in : arg;
3509 }
3510 
3511 ULong amd64g_calculate_RCL ( ULong arg,
3512                              ULong rot_amt,
3513                              ULong rflags_in,
3514                              Long  szIN )
3515 {
3516    Bool  wantRflags = toBool(szIN < 0);
3517    ULong sz         = wantRflags ? (-szIN) : szIN;
3518    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3519    ULong cf=0, of=0, tempcf;
3520 
3521    switch (sz) {
3522       case 8:
3523          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3524          while (tempCOUNT > 0) {
3525             tempcf = (arg >> 63) & 1;
3526             arg    = (arg << 1) | (cf & 1);
3527             cf     = tempcf;
3528             tempCOUNT--;
3529          }
3530          of = ((arg >> 63) ^ cf) & 1;
3531          break;
3532       case 4:
3533          while (tempCOUNT >= 33) tempCOUNT -= 33;
3534          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3535          while (tempCOUNT > 0) {
3536             tempcf = (arg >> 31) & 1;
3537             arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3538             cf     = tempcf;
3539             tempCOUNT--;
3540          }
3541          of = ((arg >> 31) ^ cf) & 1;
3542          break;
3543       case 2:
3544          while (tempCOUNT >= 17) tempCOUNT -= 17;
3545          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3546          while (tempCOUNT > 0) {
3547             tempcf = (arg >> 15) & 1;
3548             arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
3549             cf     = tempcf;
3550             tempCOUNT--;
3551          }
3552          of = ((arg >> 15) ^ cf) & 1;
3553          break;
3554       case 1:
3555          while (tempCOUNT >= 9) tempCOUNT -= 9;
3556          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3557          while (tempCOUNT > 0) {
3558             tempcf = (arg >> 7) & 1;
3559             arg    = 0xFFULL & ((arg << 1) | (cf & 1));
3560             cf     = tempcf;
3561             tempCOUNT--;
3562          }
3563          of = ((arg >> 7) ^ cf) & 1;
3564          break;
3565       default:
3566          vpanic("calculate_RCL(amd64g): invalid size");
3567    }
3568 
3569    cf &= 1;
3570    of &= 1;
3571    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3572    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3573 
3574    return wantRflags ? rflags_in : arg;
3575 }
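
/* Illustrative sketch of the calling convention shared by
   amd64g_calculate_RCL/RCR above: a positive szIN asks for the
   rotated value, a negative szIN asks for the updated rflags, so a
   caller wanting both makes two calls.  For example, an 8-bit RCL of
   0x81 by 1 with CF clear yields value 0x02 with CF=1 and OF=1.
   Hypothetical usage only, kept out of the build with #if 0; this is
   not how toIR.c is actually structured. */
#if 0
static void demo_rcl_usage ( void )
{
   ULong rflags_in = 0;  /* carry flag clear on entry */
   ULong value  = amd64g_calculate_RCL(0x81, 1, rflags_in,  1); /* value  */
   ULong rflags = amd64g_calculate_RCL(0x81, 1, rflags_in, -1); /* rflags */
   vassert(value == 0x02);
   vassert((rflags & AMD64G_CC_MASK_C) != 0);
   vassert((rflags & AMD64G_CC_MASK_O) != 0);
}
#endif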
3576 
3577 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3578  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3579  */
3580 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3581 {
3582     ULong hi, lo, tmp, A[16];
3583 
3584    A[0] = 0;            A[1] = a;
3585    A[2] = A[1] << 1;    A[3] = A[2] ^ a;
3586    A[4] = A[2] << 1;    A[5] = A[4] ^ a;
3587    A[6] = A[3] << 1;    A[7] = A[6] ^ a;
3588    A[8] = A[4] << 1;    A[9] = A[8] ^ a;
3589    A[10] = A[5] << 1;   A[11] = A[10] ^ a;
3590    A[12] = A[6] << 1;   A[13] = A[12] ^ a;
3591    A[14] = A[7] << 1;   A[15] = A[14] ^ a;
3592 
3593    lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3594    hi = lo >> 56;
3595    lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3596    hi = (hi << 8) | (lo >> 56);
3597    lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3598    hi = (hi << 8) | (lo >> 56);
3599    lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3600    hi = (hi << 8) | (lo >> 56);
3601    lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3602    hi = (hi << 8) | (lo >> 56);
3603    lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3604    hi = (hi << 8) | (lo >> 56);
3605    lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3606    hi = (hi << 8) | (lo >> 56);
3607    lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3608 
3609    ULong m0 = -1;
3610    m0 /= 255;
3611    tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3612    tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3613    tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3614    tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3615    tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3616    tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3617    tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3618 
3619    return which ? hi : lo;
3620 }
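
/* Illustrative reference for the table-driven code above: carry-less
   multiplication is polynomial multiplication over GF(2), so partial
   products are combined with XOR instead of ADD.  For example
   0b11 * 0b11 = 0b101, since (x+1)*(x+1) = x^2 + 1 over GF(2).  A
   minimal bit-at-a-time sketch (not the algorithm used above), kept
   out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <assert.h>

/* Low 64 bits of the 128-bit carry-less product of a and b. */
static uint64_t clmul_lo_ref ( uint64_t a, uint64_t b )
{
   uint64_t acc = 0;
   for (int i = 0; i < 64; i++)
      if ((b >> i) & 1)
         acc ^= a << i;
   return acc;
}

int main ( void )
{
   assert(clmul_lo_ref(3, 3) == 5);   /* (x+1)^2 = x^2 + 1 */
   return 0;
}
#endif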
3621 
3622 
3623 /* CALLED FROM GENERATED CODE */
3624 /* DIRTY HELPER (non-referentially-transparent) */
3625 /* Horrible hack.  On non-amd64 platforms, return 1. */
3626 ULong amd64g_dirtyhelper_RDTSC ( void )
3627 {
3628 #  if defined(__x86_64__)
3629    UInt  eax, edx;
3630    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3631    return (((ULong)edx) << 32) | ((ULong)eax);
3632 #  else
3633    return 1ULL;
3634 #  endif
3635 }
3636 
3637 /* CALLED FROM GENERATED CODE */
3638 /* DIRTY HELPER (non-referentially-transparent) */
3639 /* Horrible hack.  On non-amd64 platforms, return 1. */
3640 /* This uses a different calling convention from _RDTSC just above
3641    only because of the difficulty of returning 96 bits from a C
3642    function -- RDTSC returns 64 bits and so is simple by comparison,
3643    on amd64. */
3644 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3645 {
3646 #  if defined(__x86_64__)
3647    UInt eax, ecx, edx;
3648    __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3649    st->guest_RAX = (ULong)eax;
3650    st->guest_RCX = (ULong)ecx;
3651    st->guest_RDX = (ULong)edx;
3652 #  else
3653    /* Do nothing. */
3654 #  endif
3655 }
3656 
3657 /* CALLED FROM GENERATED CODE */
3658 /* DIRTY HELPER (non-referentially-transparent) */
3659 /* Horrible hack.  On non-amd64 platforms, return 0. */
3660 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3661 {
3662 #  if defined(__x86_64__)
3663    ULong r = 0;
3664    portno &= 0xFFFF;
3665    switch (sz) {
3666       case 4:
3667          __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3668                               : "=a" (r) : "Nd" (portno));
3669          break;
3670       case 2:
3671          __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3672                               : "=a" (r) : "Nd" (portno));
3673          break;
3674       case 1:
3675          __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3676                               : "=a" (r) : "Nd" (portno));
3677          break;
3678       default:
3679          break; /* note: no 64-bit version of insn exists */
3680    }
3681    return r;
3682 #  else
3683    return 0;
3684 #  endif
3685 }
3686 
3687 
3688 /* CALLED FROM GENERATED CODE */
3689 /* DIRTY HELPER (non-referentially-transparent) */
3690 /* Horrible hack.  On non-amd64 platforms, do nothing. */
3691 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3692 {
3693 #  if defined(__x86_64__)
3694    portno &= 0xFFFF;
3695    switch (sz) {
3696       case 4:
3697          __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3698                               : : "a" (data), "Nd" (portno));
3699          break;
3700       case 2:
3701          __asm__ __volatile__("outw %w0, %w1"
3702                               : : "a" (data), "Nd" (portno));
3703          break;
3704       case 1:
3705          __asm__ __volatile__("outb %b0, %w1"
3706                               : : "a" (data), "Nd" (portno));
3707          break;
3708       default:
3709          break; /* note: no 64-bit version of insn exists */
3710    }
3711 #  else
3712    /* do nothing */
3713 #  endif
3714 }
3715 
3716 /* CALLED FROM GENERATED CODE */
3717 /* DIRTY HELPER (non-referentially-transparent) */
3718 /* Horrible hack.  On non-amd64 platforms, just zero out the result. */
3719 /* op = 0: call the native SGDT instruction.
3720    op = 1: call the native SIDT instruction.
3721 */
3722 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3723 #  if defined(__x86_64__)
3724    switch (op) {
3725       case 0:
3726          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3727          break;
3728       case 1:
3729          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3730          break;
3731       default:
3732          vpanic("amd64g_dirtyhelper_SxDT");
3733    }
3734 #  else
3735    /* Can't execute the insn here; just zero the 10-byte result area. */
3736    UChar* p = (UChar*)address;
3737    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3738    p[6] = p[7] = p[8] = p[9] = 0;
3739 #  endif
3740 }
3741 
3742 /*---------------------------------------------------------------*/
3743 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
3744 /*---------------------------------------------------------------*/
3745 
3746 static inline UChar abdU8 ( UChar xx, UChar yy ) {
3747    return toUChar(xx>yy ? xx-yy : yy-xx);
3748 }
3749 
3750 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3751    return (((ULong)w1) << 32) | ((ULong)w0);
3752 }
3753 
3754 static inline UShort sel16x4_3 ( ULong w64 ) {
3755    UInt hi32 = toUInt(w64 >> 32);
3756    return toUShort(hi32 >> 16);
3757 }
3758 static inline UShort sel16x4_2 ( ULong w64 ) {
3759    UInt hi32 = toUInt(w64 >> 32);
3760    return toUShort(hi32);
3761 }
3762 static inline UShort sel16x4_1 ( ULong w64 ) {
3763    UInt lo32 = toUInt(w64);
3764    return toUShort(lo32 >> 16);
3765 }
3766 static inline UShort sel16x4_0 ( ULong w64 ) {
3767    UInt lo32 = toUInt(w64);
3768    return toUShort(lo32);
3769 }
3770 
3771 static inline UChar sel8x8_7 ( ULong w64 ) {
3772    UInt hi32 = toUInt(w64 >> 32);
3773    return toUChar(hi32 >> 24);
3774 }
3775 static inline UChar sel8x8_6 ( ULong w64 ) {
3776    UInt hi32 = toUInt(w64 >> 32);
3777    return toUChar(hi32 >> 16);
3778 }
3779 static inline UChar sel8x8_5 ( ULong w64 ) {
3780    UInt hi32 = toUInt(w64 >> 32);
3781    return toUChar(hi32 >> 8);
3782 }
3783 static inline UChar sel8x8_4 ( ULong w64 ) {
3784    UInt hi32 = toUInt(w64 >> 32);
3785    return toUChar(hi32 >> 0);
3786 }
3787 static inline UChar sel8x8_3 ( ULong w64 ) {
3788    UInt lo32 = toUInt(w64);
3789    return toUChar(lo32 >> 24);
3790 }
3791 static inline UChar sel8x8_2 ( ULong w64 ) {
3792    UInt lo32 = toUInt(w64);
3793    return toUChar(lo32 >> 16);
3794 }
3795 static inline UChar sel8x8_1 ( ULong w64 ) {
3796    UInt lo32 = toUInt(w64);
3797    return toUChar(lo32 >> 8);
3798 }
3799 static inline UChar sel8x8_0 ( ULong w64 ) {
3800    UInt lo32 = toUInt(w64);
3801    return toUChar(lo32 >> 0);
3802 }
3803 
3804 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3805 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3806 {
3807    return
3808       mk32x2(
3809          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3810             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3811          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3812             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3813       );
3814 }
3815 
3816 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3817 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3818 {
3819    UInt t = 0;
3820    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3821    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3822    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3823    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3824    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3825    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3826    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3827    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3828    t &= 0xFFFF;
3829    return (ULong)t;
3830 }
3831 
3832 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3833 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3834 {
3835    UShort t, min;
3836    UInt   idx;
3837    t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
3838    t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3839    t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3840    t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3841    t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3842    t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3843    t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3844    t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3845    return ((ULong)(idx << 16)) | ((ULong)min);
3846 }
3847 
3848 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3849 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3850 {
3851    UInt  i;
3852    ULong crc = (b & 0xFFULL) ^ crcIn;
3853    for (i = 0; i < 8; i++)
3854       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3855    return crc;
3856 }
3857 
3858 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3859 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3860 {
3861    UInt  i;
3862    ULong crc = (w & 0xFFFFULL) ^ crcIn;
3863    for (i = 0; i < 16; i++)
3864       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3865    return crc;
3866 }
3867 
3868 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3869 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3870 {
3871    UInt i;
3872    ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3873    for (i = 0; i < 32; i++)
3874       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3875    return crc;
3876 }
3877 
3878 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3879 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3880 {
3881    ULong crc = amd64g_calc_crc32l(crcIn, q);
3882    return amd64g_calc_crc32l(crc, q >> 32);
3883 }
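
/* Illustrative note: the four helpers above all perform the bit-at-a-
   time update used by the SSE4.2 CRC32 instruction, i.e. CRC-32C
   (Castagnoli) with the reflected polynomial 0x82F63B78.  A caller
   checksumming a byte buffer would fold each byte into the running
   value, as sketched below (hypothetical usage, kept out of the build
   with #if 0; the initial/final XOR is the usual software convention,
   not something the instruction itself does). */
#if 0
static ULong demo_crc32c_buffer ( const UChar* buf, ULong len )
{
   ULong crc = 0xFFFFFFFFULL;            /* conventional initial value */
   for (ULong i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return crc ^ 0xFFFFFFFFULL;           /* conventional final XOR */
}
#endif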
3884 
3885 
3886 /* .. helper for next fn .. */
3887 static inline ULong sad_8x4 ( ULong xx, ULong yy )
3888 {
3889    UInt t = 0;
3890    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3891    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3892    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3893    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3894    return (ULong)t;
3895 }
3896 
3897 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3898 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3899                             ULong dHi, ULong dLo,
3900                             ULong imm_and_return_control_bit )
3901 {
3902    UInt imm8     = imm_and_return_control_bit & 7;
3903    Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
3904    UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3905    UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3906    /* For src we only need 32 bits, so get them into the
3907       lower half of a 64 bit word. */
3908    ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3909    /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3910       11 bytes.  If calculating the low part of the result, need bytes
3911       dstOffsL * 4 + (0 .. 6); if calculating the high part,
3912       dstOffsL * 4 + (4 .. 10). */
3913    ULong dst;
3914    /* dstOffL = 0, Lo  ->  0 .. 6
3915       dstOffL = 1, Lo  ->  4 .. 10
3916       dstOffL = 0, Hi  ->  4 .. 10
3917       dstOffL = 1, Hi  ->  8 .. 14
3918    */
3919    if (calcHi && dstOffsL) {
3920       /* 8 .. 14 */
3921       dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3922    }
3923    else if (!calcHi && !dstOffsL) {
3924       /* 0 .. 6 */
3925       dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3926    }
3927    else {
3928       /* 4 .. 10 */
3929       dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3930    }
3931    ULong r0  = sad_8x4( dst >>  0, src );
3932    ULong r1  = sad_8x4( dst >>  8, src );
3933    ULong r2  = sad_8x4( dst >> 16, src );
3934    ULong r3  = sad_8x4( dst >> 24, src );
3935    ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3936    return res;
3937 }
3938 
3939 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3940 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3941 {
3942    ULong dst = 0;
3943    ULong src_bit;
3944    ULong dst_bit = 1;
3945    for (src_bit = 1; src_bit; src_bit <<= 1) {
3946       if (mask & src_bit) {
3947          if (src_masked & src_bit) dst |= dst_bit;
3948          dst_bit <<= 1;
3949       }
3950    }
3951    return dst;
3952 }
3953 
3954 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3955 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3956 {
3957    ULong dst = 0;
3958    ULong dst_bit;
3959    ULong src_bit = 1;
3960    for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3961       if (mask & dst_bit) {
3962          if (src & src_bit) dst |= dst_bit;
3963          src_bit <<= 1;
3964       }
3965    }
3966    return dst;
3967 }
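
/* Illustrative worked example for the two helpers above: PEXT gathers
   the source bits selected by the mask into the low end of the
   result, and PDEP scatters low-order source bits back into the mask
   positions, so the two are inverses on the selected bits.  With mask
   0xF0: pext(0xA0, 0xF0) == 0x0A and pdep(0x0A, 0xF0) == 0xA0.  Kept
   out of the build with #if 0. */
#if 0
static void demo_pext_pdep ( void )
{
   vassert(amd64g_calculate_pext(0xA0, 0xF0) == 0x0A);
   vassert(amd64g_calculate_pdep(0x0A, 0xF0) == 0xA0);
}
#endif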
3968 
3969 /*---------------------------------------------------------------*/
3970 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
3971 /*---------------------------------------------------------------*/
3972 
3973 static UInt zmask_from_V128 ( V128* arg )
3974 {
3975    UInt i, res = 0;
3976    for (i = 0; i < 16; i++) {
3977       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
3978    }
3979    return res;
3980 }
3981 
3982 static UInt zmask_from_V128_wide ( V128* arg )
3983 {
3984    UInt i, res = 0;
3985    for (i = 0; i < 8; i++) {
3986       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
3987    }
3988    return res;
3989 }
3990 
3991 /* Helps with PCMP{I,E}STR{I,M}.
3992 
3993    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
3994    actually it could be a clean helper, but for the fact that we can't
3995    pass by value 2 x V128 to a clean helper, nor have one returned.)
3996    Reads guest state, writes to guest state for the xSTRM cases, no
3997    accesses of memory, is a pure function.
3998 
3999    opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
4000    the callee knows which I/E and I/M variant it is dealing with and
4001    what the specific operation is.  4th byte of opcode is in the range
4002    0x60 to 0x63:
4003        istri  66 0F 3A 63
4004        istrm  66 0F 3A 62
4005        estri  66 0F 3A 61
4006        estrm  66 0F 3A 60
4007 
4008    gstOffL and gstOffR are the guest state offsets for the two XMM
4009    register inputs.  We never have to deal with the memory case since
4010    that is handled by pre-loading the relevant value into the fake
4011    XMM16 register.
4012 
4013    For ESTRx variants, edxIN and eaxIN hold the values of those two
4014    registers.
4015 
4016    In all cases, the bottom 16 bits of the result contain the new
4017    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
4018    result hold the new %ecx value.  For xSTRM variants, the helper
4019    writes the result directly to the guest XMM0.
4020 
4021    Declarable side effects: in all cases, reads guest state at
4022    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
4023    guest_XMM0.
4024 
4025    Is expected to be called with opc_and_imm combinations which have
4026    actually been validated, and will assert if otherwise.  The front
4027    end should ensure we're only called with verified values.
4028 */
4029 ULong amd64g_dirtyhelper_PCMPxSTRx (
4030           VexGuestAMD64State* gst,
4031           HWord opc4_and_imm,
4032           HWord gstOffL, HWord gstOffR,
4033           HWord edxIN, HWord eaxIN
4034        )
4035 {
4036    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
4037    HWord imm8 = opc4_and_imm & 0xFF;
4038    HWord isISTRx = opc4 & 2;
4039    HWord isxSTRM = (opc4 & 1) ^ 1;
4040    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
4041    HWord wide = (imm8 & 1);
4042 
4043    // where the args are
4044    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4045    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4046 
4047    /* Create the arg validity masks, either from the vectors
4048       themselves or from the supplied edx/eax values. */
4049    // FIXME: this is only right for the 8-bit data cases.
4050    // At least that is asserted above.
4051    UInt zmaskL, zmaskR;
4052 
4053    // temp spot for the resulting flags and vector.
4054    V128 resV;
4055    UInt resOSZACP;
4056 
4057    // for checking whether case was handled
4058    Bool ok = False;
4059 
4060    if (wide) {
4061       if (isISTRx) {
4062          zmaskL = zmask_from_V128_wide(argL);
4063          zmaskR = zmask_from_V128_wide(argR);
4064       } else {
4065          Int tmp;
4066          tmp = edxIN & 0xFFFFFFFF;
4067          if (tmp < -8) tmp = -8;
4068          if (tmp > 8)  tmp = 8;
4069          if (tmp < 0)  tmp = -tmp;
4070          vassert(tmp >= 0 && tmp <= 8);
4071          zmaskL = (1 << tmp) & 0xFF;
4072          tmp = eaxIN & 0xFFFFFFFF;
4073          if (tmp < -8) tmp = -8;
4074          if (tmp > 8)  tmp = 8;
4075          if (tmp < 0)  tmp = -tmp;
4076          vassert(tmp >= 0 && tmp <= 8);
4077          zmaskR = (1 << tmp) & 0xFF;
4078       }
4079       // do the meyaath
4080       ok = compute_PCMPxSTRx_wide (
4081               &resV, &resOSZACP, argL, argR,
4082               zmaskL, zmaskR, imm8, (Bool)isxSTRM
4083            );
4084    } else {
4085       if (isISTRx) {
4086          zmaskL = zmask_from_V128(argL);
4087          zmaskR = zmask_from_V128(argR);
4088       } else {
4089          Int tmp;
4090          tmp = edxIN & 0xFFFFFFFF;
4091          if (tmp < -16) tmp = -16;
4092          if (tmp > 16)  tmp = 16;
4093          if (tmp < 0)   tmp = -tmp;
4094          vassert(tmp >= 0 && tmp <= 16);
4095          zmaskL = (1 << tmp) & 0xFFFF;
4096          tmp = eaxIN & 0xFFFFFFFF;
4097          if (tmp < -16) tmp = -16;
4098          if (tmp > 16)  tmp = 16;
4099          if (tmp < 0)   tmp = -tmp;
4100          vassert(tmp >= 0 && tmp <= 16);
4101          zmaskR = (1 << tmp) & 0xFFFF;
4102       }
4103       // do the meyaath
4104       ok = compute_PCMPxSTRx (
4105               &resV, &resOSZACP, argL, argR,
4106               zmaskL, zmaskR, imm8, (Bool)isxSTRM
4107            );
4108    }
4109 
4110    // front end shouldn't pass us any imm8 variants we can't
4111    // handle.  Hence:
4112    vassert(ok);
4113 
4114    // Finally, return the results to the caller.  In all cases the new
4115    // OSZACP value is in the lowest 16 bits of the return value; for the
4116    // xSTRI variants the new ECX value is returned in bits 31:16.
4117    if (isxSTRM) {
4118       gst->guest_YMM0[0] = resV.w32[0];
4119       gst->guest_YMM0[1] = resV.w32[1];
4120       gst->guest_YMM0[2] = resV.w32[2];
4121       gst->guest_YMM0[3] = resV.w32[3];
4122       return resOSZACP & 0x8D5;
4123    } else {
4124       UInt newECX = resV.w32[0] & 0xFFFF;
4125       return (newECX << 16) | (resOSZACP & 0x8D5);
4126    }
4127 }
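
/* A minimal sketch of how a caller might unpack the value this helper
   returns (the variable names here are purely illustrative):

      ULong packed    = amd64g_dirtyhelper_PCMPxSTRx( gst, opc4_and_imm,
                                                      gstOffL, gstOffR,
                                                      edxIN, eaxIN );
      UInt  newOSZACP = (UInt)(packed & 0x8D5);          // new flag bits
      UInt  newECX    = (UInt)((packed >> 16) & 0xFFFF); // xSTRI forms only

   For the xSTRM forms the vector result has already been written to
   guest_YMM0 by the helper, so only the flag bits are returned. */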
4128 
4129 /*---------------------------------------------------------------*/
4130 /*--- AES primitives and helpers                              ---*/
4131 /*---------------------------------------------------------------*/
4132 /* a 16 x 16 matrix */
4133 static const UChar sbox[256] = {                   // row nr
4134    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4135    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4136    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4137    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4138    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4139    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4140    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4141    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4142    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4143    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4144    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4145    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4146    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4147    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4148    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4149    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4150    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4151    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4152    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4153    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4154    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4155    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4156    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4157    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4158    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4159    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4160    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4161    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4162    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4163    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4164    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4165    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4166 };
4167 static void SubBytes (V128* v)
4168 {
4169    V128 r;
4170    UInt i;
4171    for (i = 0; i < 16; i++)
4172       r.w8[i] = sbox[v->w8[i]];
4173    *v = r;
4174 }
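
/* Sanity check for the table above: SubBytes maps 0x00 to 0x63 and 0x53 to
   0xED, matching the standard FIPS-197 S-box values. */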
4175 
4176 /* a 16 x 16 matrix */
4177 static const UChar invsbox[256] = {                // row nr
4178    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4179    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4180    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4181    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4182    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4183    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4184    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4185    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4186    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4187    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4188    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4189    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4190    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4191    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4192    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4193    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4194    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4195    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4196    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4197    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4198    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4199    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4200    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4201    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4202    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4203    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4204    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4205    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4206    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4207    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4208    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4209    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4210 };
4211 static void InvSubBytes (V128* v)
4212 {
4213    V128 r;
4214    UInt i;
4215    for (i = 0; i < 16; i++)
4216       r.w8[i] = invsbox[v->w8[i]];
4217    *v = r;
4218 }
4219 
4220 static const UChar ShiftRows_op[16] =
4221    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4222 static void ShiftRows (V128* v)
4223 {
4224    V128 r;
4225    UInt i;
4226    for (i = 0; i < 16; i++)
4227       r.w8[i] = v->w8[ShiftRows_op[15-i]];
4228    *v = r;
4229 }
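
/* Note on the permutation above: the state is held column-major, as in the
   AESENC data path, so output bytes 0..3 (the first column after ShiftRows)
   are taken from input bytes 0, 5, 10 and 15 respectively. */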
4230 
4231 static const UChar InvShiftRows_op[16] =
4232    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4233 static void InvShiftRows (V128* v)
4234 {
4235    V128 r;
4236    UInt i;
4237    for (i = 0; i < 16; i++)
4238       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4239    *v = r;
4240 }
4241 
4242 /* Multiplication of the finite fields elements of AES.
4243    See "A Specification for The AES Algorithm Rijndael
4244         (by Joan Daemen & Vincent Rijmen)"
4245         Dr. Brian Gladman, v3.1, 3rd March 2001. */
4246 /* N values so that (hex) xy = 0x03^N.  N is undefined for 0x00, so
4247    0xff is used as a placeholder for that entry. */
4248 /* a 16 x 16 matrix */
4249 static const UChar Nxy[256] = {                    // row nr
4250    0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4251    0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4252    0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4253    0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4254    0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4255    0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4256    0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4257    0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4258    0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4259    0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4260    0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4261    0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4262    0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4263    0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4264    0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4265    0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4266    0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4267    0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4268    0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4269    0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4270    0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4271    0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4272    0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4273    0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4274    0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4275    0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4276    0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4277    0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4278    0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4279    0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4280    0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4281    0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4282 };
4283 
4284 /* E values so that E = 0x03^xy. */
4285 static const UChar Exy[256] = {                    // row nr
4286    0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4287    0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4288    0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4289    0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4290    0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4291    0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4292    0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4293    0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4294    0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4295    0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4296    0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4297    0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4298    0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4299    0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4300    0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4301    0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4302    0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4303    0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4304    0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4305    0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4306    0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4307    0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4308    0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4309    0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4310    0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4311    0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4312    0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4313    0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4314    0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4315    0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4316    0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4317    0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4318 
4319 static inline UChar ff_mul(UChar u1, UChar u2)
4320 {
4321    if ((u1 > 0) && (u2 > 0)) {
4322       UInt ui = Nxy[u1] + Nxy[u2];
4323       if (ui >= 255)
4324          ui = ui - 255;
4325       return Exy[ui];
4326    } else {
4327       return 0;
4328    };
4329 }
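
/* Worked example: ff_mul(0x02, 0x03) looks up Nxy[0x02] == 0x19 and
   Nxy[0x03] == 0x01, sums them to 0x1a, and returns Exy[0x1a] == 0x06,
   which is indeed 0x02 * 0x03 in GF(2^8). */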
4330 
4331 static void MixColumns (V128* v)
4332 {
4333    V128 r;
4334    Int j;
4335 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4336    for (j = 0; j < 4; j++) {
4337       P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4338          ^ P(v,j,2) ^ P(v,j,3);
4339       P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4340          ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4341       P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4342          ^ ff_mul(0x03, P(v,j,3) );
4343       P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4344          ^ ff_mul( 0x02, P(v,j,3) );
4345    }
4346    *v = r;
4347 #undef P
4348 }
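
/* In the P() indexing above, each group of four consecutive bytes of the
   V128 is one AES state column, and each output byte is the usual
   {02,03,01,01} MixColumns combination of the four bytes in its column. */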
4349 
4350 static void InvMixColumns (V128* v)
4351 {
4352    V128 r;
4353    Int j;
4354 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4355    for (j = 0; j < 4; j++) {
4356       P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4357          ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4358       P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4359          ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4360       P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4361          ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4362       P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4363          ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4364    }
4365    *v = r;
4366 #undef P
4367 
4368 }
4369 
4370 /* For description, see definition in guest_amd64_defs.h */
4371 void amd64g_dirtyhelper_AES (
4372           VexGuestAMD64State* gst,
4373           HWord opc4, HWord gstOffD,
4374           HWord gstOffL, HWord gstOffR
4375        )
4376 {
4377    // where the args are
4378    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4379    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4380    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4381    V128  r;
4382 
4383    switch (opc4) {
4384       case 0xDC: /* AESENC */
4385       case 0xDD: /* AESENCLAST */
4386          r = *argR;
4387          ShiftRows (&r);
4388          SubBytes  (&r);
4389          if (opc4 == 0xDC)
4390             MixColumns (&r);
4391          argD->w64[0] = r.w64[0] ^ argL->w64[0];
4392          argD->w64[1] = r.w64[1] ^ argL->w64[1];
4393          break;
4394 
4395       case 0xDE: /* AESDEC */
4396       case 0xDF: /* AESDECLAST */
4397          r = *argR;
4398          InvShiftRows (&r);
4399          InvSubBytes (&r);
4400          if (opc4 == 0xDE)
4401             InvMixColumns (&r);
4402          argD->w64[0] = r.w64[0] ^ argL->w64[0];
4403          argD->w64[1] = r.w64[1] ^ argL->w64[1];
4404          break;
4405 
4406       case 0xDB: /* AESIMC */
4407          *argD = *argL;
4408          InvMixColumns (argD);
4409          break;
4410       default: vassert(0);
4411    }
4412 }
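
/* Observed data flow for the ENC/DEC cases above: *argR is the state that
   has (Inv)ShiftRows, (Inv)SubBytes and (optionally) (Inv)MixColumns applied
   to it, *argL is XORed in as the round key, and the result lands in *argD,
   matching the architected AESENC/AESENCLAST/AESDEC/AESDECLAST definitions. */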
4413 
4414 static inline UInt RotWord (UInt   w32)
4415 {
4416    return ((w32 >> 8) | (w32 << 24));
4417 }
4418 
4419 static inline UInt SubWord (UInt   w32)
4420 {
4421    UChar *w8;
4422    UChar *r8;
4423    UInt res;
4424    w8 = (UChar*) &w32;
4425    r8 = (UChar*) &res;
4426    r8[0] = sbox[w8[0]];
4427    r8[1] = sbox[w8[1]];
4428    r8[2] = sbox[w8[2]];
4429    r8[3] = sbox[w8[3]];
4430    return res;
4431 }
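
/* Worked examples: RotWord rotates the 32-bit word right by 8 bits, so
   RotWord(0xDDCCBBAA) == 0xAADDCCBB.  SubWord applies the S-box to each of
   the four bytes in place, so on a little-endian host
   SubWord(0x00000053) == 0x636363ED. */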
4432 
4433 /* For description, see definition in guest_amd64_defs.h */
4434 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4435           VexGuestAMD64State* gst,
4436           HWord imm8,
4437           HWord gstOffL, HWord gstOffR
4438        )
4439 {
4440    // where the args are
4441    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4442    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4443 
4444    // We have to create the result in a temporary in the
4445    // case where the src and dst regs are the same.  See #341698.
4446    V128 tmp;
4447 
4448    tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4449    tmp.w32[2] = SubWord (argL->w32[3]);
4450    tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4451    tmp.w32[0] = SubWord (argL->w32[1]);
4452 
4453    argR->w32[3] = tmp.w32[3];
4454    argR->w32[2] = tmp.w32[2];
4455    argR->w32[1] = tmp.w32[1];
4456    argR->w32[0] = tmp.w32[0];
4457 }
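
/* The four assignments above follow the architected AESKEYGENASSIST layout:
   with X3..X0 being the 32-bit words of the source (X3 most significant),
   the result is
      [ RotWord(SubWord(X3)) ^ RCON, SubWord(X3),
        RotWord(SubWord(X1)) ^ RCON, SubWord(X1) ]
   where RCON is the zero-extended imm8. */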
4458 
4459 
4460 
4461 /*---------------------------------------------------------------*/
4462 /*--- Helpers for dealing with, and describing,               ---*/
4463 /*--- guest state as a whole.                                 ---*/
4464 /*---------------------------------------------------------------*/
4465 
4466 /* Initialise the entire amd64 guest state. */
4467 /* VISIBLE TO LIBVEX CLIENT */
4468 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4469 {
4470    vex_state->host_EvC_FAILADDR = 0;
4471    vex_state->host_EvC_COUNTER = 0;
4472    vex_state->pad0 = 0;
4473 
4474    vex_state->guest_RAX = 0;
4475    vex_state->guest_RCX = 0;
4476    vex_state->guest_RDX = 0;
4477    vex_state->guest_RBX = 0;
4478    vex_state->guest_RSP = 0;
4479    vex_state->guest_RBP = 0;
4480    vex_state->guest_RSI = 0;
4481    vex_state->guest_RDI = 0;
4482    vex_state->guest_R8  = 0;
4483    vex_state->guest_R9  = 0;
4484    vex_state->guest_R10 = 0;
4485    vex_state->guest_R11 = 0;
4486    vex_state->guest_R12 = 0;
4487    vex_state->guest_R13 = 0;
4488    vex_state->guest_R14 = 0;
4489    vex_state->guest_R15 = 0;
4490 
4491    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
4492    vex_state->guest_CC_DEP1 = 0;
4493    vex_state->guest_CC_DEP2 = 0;
4494    vex_state->guest_CC_NDEP = 0;
4495 
4496    vex_state->guest_DFLAG   = 1; /* forwards */
4497    vex_state->guest_IDFLAG  = 0;
4498    vex_state->guest_ACFLAG  = 0;
4499 
4500    /* HACK: represent the offset associated with a constant %fs.
4501       Typically, on linux, this assumes that %fs is only ever zero (main
4502       thread) or 0x63. */
4503    vex_state->guest_FS_CONST = 0;
4504 
4505    vex_state->guest_RIP = 0;
4506 
4507    /* Initialise the simulated FPU */
4508    amd64g_dirtyhelper_FINIT( vex_state );
4509 
4510    /* Initialise the AVX state. */
4511 #  define AVXZERO(_ymm) \
4512       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4513            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4514       } while (0)
4515    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4516    AVXZERO(vex_state->guest_YMM0);
4517    AVXZERO(vex_state->guest_YMM1);
4518    AVXZERO(vex_state->guest_YMM2);
4519    AVXZERO(vex_state->guest_YMM3);
4520    AVXZERO(vex_state->guest_YMM4);
4521    AVXZERO(vex_state->guest_YMM5);
4522    AVXZERO(vex_state->guest_YMM6);
4523    AVXZERO(vex_state->guest_YMM7);
4524    AVXZERO(vex_state->guest_YMM8);
4525    AVXZERO(vex_state->guest_YMM9);
4526    AVXZERO(vex_state->guest_YMM10);
4527    AVXZERO(vex_state->guest_YMM11);
4528    AVXZERO(vex_state->guest_YMM12);
4529    AVXZERO(vex_state->guest_YMM13);
4530    AVXZERO(vex_state->guest_YMM14);
4531    AVXZERO(vex_state->guest_YMM15);
4532    AVXZERO(vex_state->guest_YMM16);
4533 
4534 #  undef AVXZERO
4535 
4536    vex_state->guest_EMNOTE = EmNote_NONE;
4537 
4538    /* These should not ever be either read or written, but we
4539       initialise them anyway. */
4540    vex_state->guest_CMSTART = 0;
4541    vex_state->guest_CMLEN   = 0;
4542 
4543    vex_state->guest_NRADDR   = 0;
4544    vex_state->guest_SC_CLASS = 0;
4545    vex_state->guest_GS_CONST = 0;
4546 
4547    vex_state->guest_IP_AT_SYSCALL = 0;
4548    vex_state->pad1 = 0;
4549 }
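
/* A minimal sketch of typical client-side use (the register values shown
   are purely illustrative):

      VexGuestAMD64State gst;
      LibVEX_GuestAMD64_initialise(&gst);
      gst.guest_RIP = 0x400000;      // entry point of the code to run
      gst.guest_RSP = 0x7fff0000;    // top of the stack the client set up

   i.e. the client first defaults everything with this function and then
   fills in just the registers it actually cares about. */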
4550 
4551 
4552 /* Figure out if any part of the guest state contained in minoff
4553    .. maxoff requires precise memory exceptions.  If in doubt return
4554    True (but this generates significantly slower code).
4555 
4556    By default we enforce precise exns for guest %RSP, %RBP and %RIP
4557    only.  These are the minimum needed to extract correct stack
4558    backtraces from amd64 code.
4559 
4560    Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
4561 */
4562 Bool guest_amd64_state_requires_precise_mem_exns (
4563         Int minoff, Int maxoff, VexRegisterUpdates pxControl
4564      )
4565 {
4566    Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4567    Int rbp_max = rbp_min + 8 - 1;
4568    Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4569    Int rsp_max = rsp_min + 8 - 1;
4570    Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4571    Int rip_max = rip_min + 8 - 1;
4572 
4573    if (maxoff < rsp_min || minoff > rsp_max) {
4574       /* no overlap with rsp */
4575       if (pxControl == VexRegUpdSpAtMemAccess)
4576          return False; // We only need to check the stack pointer.
4577    } else {
4578       return True;
4579    }
4580 
4581    if (maxoff < rbp_min || minoff > rbp_max) {
4582       /* no overlap with rbp */
4583    } else {
4584       return True;
4585    }
4586 
4587    if (maxoff < rip_min || minoff > rip_max) {
4588       /* no overlap with rip */
4589    } else {
4590       return True;
4591    }
4592 
4593    return False;
4594 }
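
/* For example: a [minoff, maxoff] range covering only guest_RAX yields
   False, whereas any range overlapping guest_RSP, guest_RBP or guest_RIP
   yields True -- except that in mode VexRegUpdSpAtMemAccess only the
   guest_RSP overlap matters. */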
4595 
4596 
4597 #define ALWAYSDEFD(field)                             \
4598     { offsetof(VexGuestAMD64State, field),            \
4599       (sizeof ((VexGuestAMD64State*)0)->field) }
4600 
4601 VexGuestLayout
4602    amd64guest_layout
4603       = {
4604           /* Total size of the guest state, in bytes. */
4605           .total_sizeB = sizeof(VexGuestAMD64State),
4606 
4607           /* Describe the stack pointer. */
4608           .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4609           .sizeof_SP = 8,
4610 
4611           /* Describe the frame pointer. */
4612           .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4613           .sizeof_FP = 8,
4614 
4615           /* Describe the instruction pointer. */
4616           .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4617           .sizeof_IP = 8,
4618 
4619           /* Describe any sections to be regarded by Memcheck as
4620              'always-defined'. */
4621           .n_alwaysDefd = 16,
4622 
4623           /* flags thunk: OP and NDEP are always defd, whereas DEP1
4624              and DEP2 have to be tracked.  See detailed comment in
4625              gdefs.h on meaning of thunk fields. */
4626           .alwaysDefd
4627              = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
4628                  /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
4629                  /*  2 */ ALWAYSDEFD(guest_DFLAG),
4630                  /*  3 */ ALWAYSDEFD(guest_IDFLAG),
4631                  /*  4 */ ALWAYSDEFD(guest_RIP),
4632                  /*  5 */ ALWAYSDEFD(guest_FS_CONST),
4633                  /*  6 */ ALWAYSDEFD(guest_FTOP),
4634                  /*  7 */ ALWAYSDEFD(guest_FPTAG),
4635                  /*  8 */ ALWAYSDEFD(guest_FPROUND),
4636                  /*  9 */ ALWAYSDEFD(guest_FC3210),
4637                  // /* */ ALWAYSDEFD(guest_CS),
4638                  // /* */ ALWAYSDEFD(guest_DS),
4639                  // /* */ ALWAYSDEFD(guest_ES),
4640                  // /* */ ALWAYSDEFD(guest_FS),
4641                  // /* */ ALWAYSDEFD(guest_GS),
4642                  // /* */ ALWAYSDEFD(guest_SS),
4643                  // /* */ ALWAYSDEFD(guest_LDT),
4644                  // /* */ ALWAYSDEFD(guest_GDT),
4645                  /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4646                  /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4647                  /* 12 */ ALWAYSDEFD(guest_CMSTART),
4648                  /* 13 */ ALWAYSDEFD(guest_CMLEN),
4649                  /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4650                  /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4651                }
4652         };
4653 
4654 
4655 /*---------------------------------------------------------------*/
4656 /*--- end                               guest_amd64_helpers.c ---*/
4657 /*---------------------------------------------------------------*/
4658