/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.  Calls
   to these functions are generated by the back end.  These calls are
   of course in the host machine code and this file will be compiled
   to host machine code, so that all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */
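/* Both routines below use the classic schoolbook decomposition into
   32-bit halves: writing u = u1*2^32 + u0 and v = v1*2^32 + v0, the
   full product is u1*v1*2^64 + (u1*v0 + u0*v1)*2^32 + u0*v0, with the
   carries out of the partial sums propagated into the high word.
   Only the high 64 bits need this treatment; the low 64 bits are just
   the ordinary truncating multiply. */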

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   const Long halfMask = 0xFFFFFFFFLL;
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = (Long)((ULong)u * (ULong)v);
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   const ULong halfMask = 0xFFFFFFFFULL;
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}


static const UChar parity_table[256] = {
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
   0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
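
/* PF is set iff the low 8 bits of the result contain an even number
   of 1 bits; the table above just precomputes that for all 256 byte
   values.  Worked example: parity_table[0x03] is AMD64G_CC_MASK_P,
   since 0x03 has two (an even number of) set bits, whereas
   parity_table[0x01] is 0. */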

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return (ULong)x << n;
   else
      return x >> (-n);
}
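
/* Callers use negative counts to shift right: for example, in the SF
   computations below, lshift(res, 8 - DATA_BITS) moves the sign bit
   of a DATA_BITS-wide value into bit 7, where it is then masked with
   0x80.  The right shift is an arithmetic one on typical compilers
   (x is a signed Long), but since every caller masks the result, any
   sign-propagated high bits are harmless. */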

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)                                   \
   /* const */ ULong DATA_MASK                                  \
      = __data_bits==8                                          \
           ? 0xFFULL                                            \
           : (__data_bits==16                                   \
                ? 0xFFFFULL                                     \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
   /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
   /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
   /* Four bogus assignments, which hopefully gcc can     */    \
   /* optimise away, and which stop it complaining about  */    \
   /* unused variables.                                   */    \
   SIGN_MASK = SIGN_MASK;                                       \
   DATA_MASK = DATA_MASK;                                       \
   CC_DEP2 = CC_DEP2;                                           \
   CC_NDEP = CC_NDEP;

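/* A quick summary of the thunk conventions the ACTIONS_* macros below
   rely on (guest_amd64_defs.h is assumed to carry the definitive
   statement): for ADD/SUB, DEP1 and DEP2 are the two arguments; for
   ADC/SBB, DEP2 is the second argument xor'd with the old carry, and
   the old flags sit in NDEP; for LOGIC, DEP1 is the result; for the
   shifts, DEP1 is the result and DEP2 holds auxiliary shift data from
   which C and O are derived; and for INC/DEC, DEP1 is the result
   while NDEP supplies the C flag, which those instructions leave
   unchanged. */
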

/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL + argR;                                        \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
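
/* Worked example for the 8-bit case, 0x7F + 0x01 = 0x80: cf = 0,
   since the result did not wrap below argL; af = (0x80^0x7F^0x01)
   & 0x10 = 0x10, set; zf = 0; sf = 0x80, set; and of is set because
   the arguments agree in sign (bit 7 clear) while the result
   differs: two positive operands produced a negative result. */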

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL - argR;                                        \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL + argR) + oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
     else                                                       \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
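
/* Note the recovery step above: the translator stores (argR ^ oldC)
   in DEP2 rather than argR itself, so the helper xors the old carry
   (parked in NDEP) back out to obtain the true second argument.
   ACTIONS_SBB below uses the same encoding. */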

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL - argR) - oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
     else                                                       \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = 0;                                                    \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res - 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res + 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK)                                    \
          == ((ULong)SIGN_MASK - 1)) << 11;                     \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
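
/* The OF conditions for INC/DEC read a little cryptically: INC
   overflows exactly when the result is the smallest negative value
   (res == SIGN_MASK, so the argument was the largest positive value),
   and DEC overflows exactly when the result is the largest positive
   value (res == SIGN_MASK - 1, so the argument was the smallest
   negative value).  Neither instruction modifies CF, hence cf is
   taken from NDEP in both cases. */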

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;      \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = CC_DEP2 & 1;                                          \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong fl                                                   \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & CC_DEP1)                        \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                        11-(DATA_BITS-1))       \
                                 ^ lshift(CC_DEP1, 11)));       \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong fl                                                   \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))     \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                        11-(DATA_BITS-1))       \
                                 ^ lshift(CC_DEP1,              \
                                          11-(DATA_BITS-1)+1))); \
     return fl;                                                 \
   }                                                            \
}
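
/* For both rotates, only CF and OF are updated; the remaining flags
   pass through unchanged from NDEP.  Note how lshift's negative-count
   behaviour earns its keep here: for DATA_BITS == 64,
   lshift(CC_DEP1, 11-(DATA_BITS-1)) is a right shift by 52, which
   lands the result's msb in bit position 11, the OF slot. */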

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
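
/* For the multiplies, CF and OF are set together: for unsigned MUL
   iff the high half of the double-length product is nonzero, and for
   signed IMUL iff the high half is not merely the sign extension of
   the low half.  Either way, they are set iff the result does not
   fit in the low half. */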

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = 0;                                                    \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 != 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)                    \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     cf = ((DATA_UTYPE)CC_DEP2 == 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = 0;                                                    \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 == 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME)              \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong ocf; /* o or c */                                    \
     ULong argL, argR, oldOC, res;                              \
     oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1;       \
     argL  = CC_DEP1;                                           \
     argR  = CC_DEP2 ^ oldOC;                                   \
     res   = (argL + argR) + oldOC;                             \
     if (oldOC)                                                 \
        ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;              \
     else                                                       \
        ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL;               \
     return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME)              \
            | (ocf << AMD64G_CC_SHIFT_##FLAGNAME);              \
   }                                                            \
}
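
/* ADCX and ADOX are the BMI/ADX carry-chain variants of ADC: ADCX
   performs an add-with-carry that reads and writes only CF, and ADOX
   does the same using only OF.  All other flag bits pass through from
   NDEP untouched, which is exactly what the mask-and-or above
   implements. */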

/*-------------------------------------------------------------*/


#if PROFILE_RFLAGS

static Bool initted = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u  calc_cond=%u  calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt  );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt,  C );
      case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );

      case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt,  O );
      case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      // case AMD64G_CC_OP_SUBL:
      //    return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_SUBW:
      //    return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_SUBB:
      //    return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_INCL:
      // case AMD64G_CC_OP_DECL:
      //    return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of,sf,zf,cf,pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);
         break;

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));
         break;

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));
         break;

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));
         break;

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflags ( ULong rflags,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   /* D flag */
   if (rflags & AMD64G_CC_MASK_D) {
      vex_state->guest_DFLAG = -1;
      rflags &= ~AMD64G_CC_MASK_D;
   }
   else
      vex_state->guest_DFLAG = 1;

   /* ID flag */
   if (rflags & AMD64G_CC_MASK_ID) {
      vex_state->guest_IDFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_ID;
   }
   else
      vex_state->guest_IDFLAG = 0;

   /* AC flag */
   if (rflags & AMD64G_CC_MASK_AC) {
      vex_state->guest_ACFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_AC;
   }
   else
      vex_state->guest_ACFLAG = 0;

   UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
                  AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = rflags & cc_mask;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
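
/* Note that after LibVEX_GuestAMD64_put_rflags the thunk is in "COPY"
   form: CC_OP is AMD64G_CC_OP_COPY and CC_DEP1 holds the literal
   O/S/Z/A/C/P bits at their architectural positions, which the COPY
   case of amd64g_calculate_rflags_all_WRK simply masks and returns.
   LibVEX_GuestAMD64_put_rflag_c below leaves the thunk in the same
   form. */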

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                                /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static inline Bool isU64 ( IRExpr* e, ULong n )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == n;
}

/* Returns N if W64 is a value of the form 1 << N for N in 1 to 31,
   and zero in any other case. */
static Int isU64_1_shl_N_literal ( ULong w64 )
{
   if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
      return 0;
   if ((w64 & (w64 - 1)) != 0)
      return 0;
   /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
      and we only need to find out which one it is. */
   for (Int n = 1; n <= 31; n++) {
      if (w64 == (1ULL << n))
         return n;
   }
   /* Consequently we should never get here. */
   /*UNREACHED*/
   vassert(0);
   return 0;
}
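
/* For example, isU64_1_shl_N_literal(0x100) == 8, whereas 0 and 1
   (outside the 2^1 .. 2^31 range) and 0x300 (not a power of two)
   all yield 0. */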

/* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
   and zero in any other case. */
static Int isU64_1_shl_N ( IRExpr* e )
{
   if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
      return 0;
   ULong w64 = e->Iex.Const.con->Ico.U64;
   return isU64_1_shl_N_literal(w64);
}

/* Returns N if E is an immediate of the form (1 << N) - 1 for N in 1 to 31,
   and zero in any other case. */
static Int isU64_1_shl_N_minus_1 ( IRExpr* e )
{
   if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
      return 0;
   ULong w64 = e->Iex.Const.con->Ico.U64;
   // This isn't actually necessary since isU64_1_shl_N_literal will return
   // zero given a zero argument, but still ..
   if (w64 == 0xFFFFFFFFFFFFFFFFULL)
      return 0;
   return isU64_1_shl_N_literal(w64 + 1);
}

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- ADDL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit add and jump to out-of-line code if
            an overflow occurs". */
         /* long add, then O (overflow)
            --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
            --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
            --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              unop(Iop_Not64,
                                   binop(Iop_Xor64, cc_dep1, cc_dep2)),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Add64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));

      }

      /*---------------- SUBQ ----------------*/

      /* 0, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
         /* long long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return binop(Iop_Shr64,
                      binop(Iop_And64,
                            binop(Iop_Xor64, cc_dep1, cc_dep2),
                            binop(Iop_Xor64,
                                  cc_dep1,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2))),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
         /* long long sub/cmp, then S (negative)
            --> (dst-src)[63]
            --> (dst-src) >>u 63 */
         return binop(Iop_Shr64,
                      binop(Iop_Sub64, cc_dep1, cc_dep2),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
         /* long long sub/cmp, then NS (not negative)
            --> (dst-src)[63] ^ 1
            --> ((dst-src) >>u 63) ^ 1 */
         return binop(Iop_Xor64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(63)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
         /* long long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
         /* long long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));

      }

      /*---------------- SUBL ----------------*/

      /* 0, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit subtract and jump to out-of-line
            code if an overflow occurs". */
         /* long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
            --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              binop(Iop_Xor64, cc_dep1, cc_dep2),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Sub64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      {
        /* It appears that LLVM 5.0 and later have a new way to find out
           whether the top N bits of a word W are all zero, by computing

             W  <u  0---(N-1)---0 1 0---0  or
             W  <=u 0---(N-1)---0 0 1---1

           In particular, the result will be defined if the top N bits of W
           are defined, even if the trailing bits -- those corresponding to
           the rightmost 0---0 / 1---1 section -- are undefined.  Rather than
           make Memcheck more complex, we detect this case where we can and
           shift out the irrelevant and potentially undefined bits. */
        Int n = 0;
        Bool is_NB_or_NBE = False;
        if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
           if (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB)) {
              /* long sub/cmp, then B (unsigned less than),
                 where dep2 is a power of 2:
                   -> CmpLT32U(dep1, 1 << N)
                   -> CmpEQ32(dep1 >>u N, 0)
                 and
                 long sub/cmp, then NB (unsigned greater than or equal),
                 where dep2 is a power of 2:
                   -> CmpGE32U(dep1, 1 << N)
                   -> CmpNE32(dep1 >>u N, 0)
                 This avoids CmpLT32U/CmpGE32U being applied to potentially
                 uninitialised bits in the area being shifted out. */
              n = isU64_1_shl_N(cc_dep2);
              is_NB_or_NBE = isU64(cond, AMD64CondNB);
           } else if (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE)) {
              /* long sub/cmp, then BE (unsigned less than or equal),
                 where dep2 is a power of 2 minus 1:
                   -> CmpLE32U(dep1, (1 << N) - 1)
                   -> CmpEQ32(dep1 >>u N, 0)
                 and
                 long sub/cmp, then NBE (unsigned greater than),
                 where dep2 is a power of 2 minus 1:
                   -> CmpGT32U(dep1, (1 << N) - 1)
                   -> CmpNE32(dep1 >>u N, 0)
                 This avoids CmpLE32U/CmpGT32U being applied to potentially
                 uninitialised bits in the area being shifted out. */
              n = isU64_1_shl_N_minus_1(cc_dep2);
              is_NB_or_NBE = isU64(cond, AMD64CondNBE);
           }
        }
        if (n > 0) {
           vassert(n >= 1 && n <= 31);
           return unop(Iop_1Uto64,
                       binop(is_NB_or_NBE ? Iop_CmpNE32 : Iop_CmpEQ32,
                             binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
                                   mkU8(n)),
                             mkU32(0)));
        }
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative)
            --> (dst-src)[31]
            --> ((dst -64 src) >>u 31) & 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
         /* long sub/cmp, then NS (not negative)
            --> (dst-src)[31] ^ 1
            --> (((dst -64 src) >>u 31) & 1) ^ 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2),
                                  mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));

      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));

      }

      /*---------------- SUBW ----------------*/

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      /* 6, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
         /* word sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_Shl64, cc_dep1, mkU8(48)),
                           binop(Iop_Shl64, cc_dep2, mkU8(48))));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
          && isU64(cc_dep2, 0)) {
         /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[15]
            This is yet another scheme by which clang figures out if the
            top bit of a word is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for a 16-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(15)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
          && isU64(cc_dep2, 0)) {
         /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[15]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(15)),
                            mkU64(1)),
                      mkU64(1));
      }

      /* 14, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));

      }

      /*---------------- SUBB ----------------*/

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
         /* byte sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
         /* byte sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep2, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep1, mkU64(0xFF))));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      /* 6, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1658 return binop(Iop_Xor64,
1659 binop(Iop_And64,
1660 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1661 mkU64(1)),
1662 mkU64(1));
1663 }
1664
1665 /*---------------- LOGICW ----------------*/
1666
1667 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1668 /* word and/or/xor, then Z --> test dst==0 */
1669 // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1670 // it exactly at EdcAUTO.
1671 return unop(Iop_1Uto64,
1672 binop(Iop_CmpEQ32,
1673 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1674 mkU32(0)));
1675 }
1676 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1677 /* word and/or/xor, then NZ --> test dst!=0 */
1678 // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1679 // it exactly at EdcAUTO.
1680 return unop(Iop_1Uto64,
1681 binop(Iop_CmpNE32,
1682 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1683 mkU32(0)));
1684 }
1685
1686 /*---------------- LOGICB ----------------*/
1687
1688 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1689 /* byte and/or/xor, then Z --> test dst==0 */
1690 // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1691 // it exactly at EdcAUTO.
1692 return unop(Iop_1Uto64,
1693 binop(Iop_CmpEQ32,
1694 unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1695 mkU32(0)));
1696 }
1697 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1698 /* byte and/or/xor, then NZ --> test dst!=0 */
1699 // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1700 // it exactly at EdcAUTO.
1701 return unop(Iop_1Uto64,
1702 binop(Iop_CmpNE32,
1703 unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1704 mkU32(0)));
1705 }
1706
1707 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1708 /* this is an idiom gcc sometimes uses to find out if the top
1709 bit of a byte register is set: eg testb %al,%al; js ..
1710 Since it just depends on the top bit of the byte, extract
1711 that bit and explicitly get rid of all the rest. This
1712 helps memcheck avoid false positives in the case where any
1713 of the other bits in the byte are undefined. */
1714 /* byte and/or/xor, then S --> (UInt)result[7] */
1715 return binop(Iop_And64,
1716 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1717 mkU64(1));
1718 }
1719 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1720 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1721 return binop(Iop_Xor64,
1722 binop(Iop_And64,
1723 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1724 mkU64(1)),
1725 mkU64(1));
1726 }
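      /* Worked example (illustrative): after "testb %al,%al" with
         %al == 0x80, the low byte of cc_dep1 is 0x80, so
         (cc_dep1 >> 7) & 1 == 1 and the S condition holds.  Only
         bit 7 is inspected, which is what lets Memcheck stay quiet
         when the remaining bits of the byte are undefined. */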
1727
1728 /*---------------- INCB ----------------*/
1729
1730 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1731 /* 8-bit inc, then LE --> sign bit of the arg */
1732 return binop(Iop_And64,
1733 binop(Iop_Shr64,
1734 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1735 mkU8(7)),
1736 mkU64(1));
1737 }
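      /* Why this is valid (illustrative check): cc_dep1 is the inc
         result, so cc_dep1 - 1 is the original argument, and
         LE = (SF ^ OF) | ZF.  arg 0x7F -> res 0x80: SF=1 OF=1 ZF=0,
         LE=0, arg[7]=0.  arg 0xFF -> res 0x00: SF=0 OF=0 ZF=1, LE=1,
         arg[7]=1.  arg 0x80 -> res 0x81: SF=1 OF=0 ZF=0, LE=1,
         arg[7]=1.  So LE equals the sign bit of the argument. */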
1738
1739 /*---------------- INCW ----------------*/
1740
1741 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1742 /* 16-bit inc, then Z --> test dst == 0 */
1743 return unop(Iop_1Uto64,
1744 binop(Iop_CmpEQ64,
1745 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1746 mkU64(0)));
1747 }
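      /* The Shl64 by 48 discards all but the low 16 bits, so the
         CmpEQ64 against zero inspects only the 16-bit result.  For
         example (illustrative), cc_dep1 = 0xABCD0000 shifts to 0,
         giving Z=1, while cc_dep1 = 0xABCD0001 shifts to
         0x0001000000000000, giving Z=0. */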
1748
1749 /*---------------- DECL ----------------*/
1750
1751 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1752 /* dec L, then Z --> test dst == 0 */
1753 return unop(Iop_1Uto64,
1754 binop(Iop_CmpEQ32,
1755 unop(Iop_64to32, cc_dep1),
1756 mkU32(0)));
1757 }
1758
1759 /*---------------- DECW ----------------*/
1760
1761 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1762 /* 16-bit dec, then NZ --> test dst != 0 */
1763 return unop(Iop_1Uto64,
1764 binop(Iop_CmpNE64,
1765 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1766 mkU64(0)));
1767 }
1768
1769 /*---------------- SHRQ ----------------*/
1770
1771 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1772 /* SHRQ, then Z --> test dep1 == 0 */
1773 return unop(Iop_1Uto64,
1774 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1775 }
1776 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1777 /* SHRQ, then NZ --> test dep1 != 0 */
1778 return unop(Iop_1Uto64,
1779 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1780 }
1781
1782 /*---------------- SHRL ----------------*/
1783
1784 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1785 /* SHRL, then Z --> test dep1 == 0 */
1786 return unop(Iop_1Uto64,
1787 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1788 mkU32(0)));
1789 }
1790 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1791 /* SHRL, then NZ --> test dep1 != 0 */
1792 return unop(Iop_1Uto64,
1793 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1794 mkU32(0)));
1795 }
1796
1797 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1798 /* SHRL/SARL, then S --> (ULong)result[31] */
1799 return binop(Iop_And64,
1800 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1801 mkU64(1));
1802 }
1803 // The following looks correct to me, but never seems to happen because
1804 // the front end converts jns to js by switching the fallthrough vs
1805 // taken addresses. See jcc_01(). But then why do other conditions
1806 // considered by this function show up in both variants (xx and Nxx) ?
1807 //if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1808 // /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1809 // vassert(0);
1810 // return binop(Iop_Xor64,
1811 // binop(Iop_And64,
1812 // binop(Iop_Shr64, cc_dep1, mkU8(31)),
1813 // mkU64(1)),
1814 // mkU64(1));
1815 //}
1816
1817 /*---------------- COPY ----------------*/
1818 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1819 jbe" for example. */
1820
1821 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1822 && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1823 /* COPY, then BE --> extract C and Z from dep1, and test (C
1824 or Z == 1). */
1825 /* COPY, then NBE --> extract C and Z from dep1, and test (C
1826 or Z == 0). */
1827 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1828 return
1829 unop(
1830 Iop_1Uto64,
1831 binop(
1832 Iop_CmpEQ64,
1833 binop(
1834 Iop_And64,
1835 binop(
1836 Iop_Or64,
1837 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1838 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1839 ),
1840 mkU64(1)
1841 ),
1842 mkU64(nnn)
1843 )
1844 );
1845 }
1846
1847 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1848 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
1849 /* COPY, then B --> extract C from dep1, and test (C == 1). */
1850 /* COPY, then NB --> extract C from dep1, and test (C == 0). */
1851 ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
1852 return
1853 unop(
1854 Iop_1Uto64,
1855 binop(
1856 Iop_CmpEQ64,
1857 binop(
1858 Iop_And64,
1859 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1860 mkU64(1)
1861 ),
1862 mkU64(nnn)
1863 )
1864 );
1865 }
1866
1867 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1868 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1869 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1870 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1871 ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1872 return
1873 unop(
1874 Iop_1Uto64,
1875 binop(
1876 Iop_CmpEQ64,
1877 binop(
1878 Iop_And64,
1879 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1880 mkU64(1)
1881 ),
1882 mkU64(nnn)
1883 )
1884 );
1885 }
1886
1887 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1888 && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
1889 /* COPY, then P --> extract P from dep1, and test (P == 1). */
1890 /* COPY, then NP --> extract P from dep1, and test (P == 0). */
1891 ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
1892 return
1893 unop(
1894 Iop_1Uto64,
1895 binop(
1896 Iop_CmpEQ64,
1897 binop(
1898 Iop_And64,
1899 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1900 mkU64(1)
1901 ),
1902 mkU64(nnn)
1903 )
1904 );
1905 }
1906
1907 return NULL;
1908 }
1909
1910 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1911
1912 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1913 /* specialise calls to above "calculate_rflags_c" function */
1914 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1915 vassert(arity == 4);
1916 cc_op = args[0];
1917 cc_dep1 = args[1];
1918 cc_dep2 = args[2];
1919 cc_ndep = args[3];
1920
1921 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1922 /* C after sub denotes unsigned less than */
1923 return unop(Iop_1Uto64,
1924 binop(Iop_CmpLT64U,
1925 cc_dep1,
1926 cc_dep2));
1927 }
1928 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1929 /* C after sub denotes unsigned less than */
1930 return unop(Iop_1Uto64,
1931 binop(Iop_CmpLT32U,
1932 unop(Iop_64to32, cc_dep1),
1933 unop(Iop_64to32, cc_dep2)));
1934 }
1935 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1936 /* C after sub denotes unsigned less than */
1937 return unop(Iop_1Uto64,
1938 binop(Iop_CmpLT64U,
1939 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1940 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1941 }
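      /* Worked example (illustrative): after "cmpq $2, %rax" with
         %rax == 1, the thunk holds cc_dep1 = 1 and cc_dep2 = 2, so
         C = (1 <u 2) = 1, ie. the unsigned-below condition holds.
         Note the SUBB case must mask both deps to 8 bits, since the
         upper bits of the thunk fields are not guaranteed zero. */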
1942 if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
1943 /* C after add denotes sum <u either arg */
1944 return unop(Iop_1Uto64,
1945 binop(Iop_CmpLT64U,
1946 binop(Iop_Add64, cc_dep1, cc_dep2),
1947 cc_dep1));
1948 }
1949 if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
1950 /* C after add denotes sum <u either arg */
1951 return unop(Iop_1Uto64,
1952 binop(Iop_CmpLT32U,
1953 unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1954 unop(Iop_64to32, cc_dep1)));
1955 }
1956 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1957 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1958 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1959 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1960 /* cflag after logic is zero */
1961 return mkU64(0);
1962 }
1963 if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1964 || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1965 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1966 return cc_ndep;
1967 }
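      /* (Rationale, for the record: inc and dec do not modify the
         carry flag, so the front end parks the pre-existing C bit
         in CC_NDEP, and it can simply be handed back here.) */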
1968
1969 # if 0
1970 if (cc_op->tag == Iex_Const) {
1971 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1972 }
1973 # endif
1974
1975 return NULL;
1976 }
1977
1978 # undef unop
1979 # undef binop
1980 # undef mkU64
1981 # undef mkU32
1982 # undef mkU8
1983
1984 return NULL;
1985 }
1986
1987
1988 /*---------------------------------------------------------------*/
1989 /*--- Supporting functions for x87 FPU activities. ---*/
1990 /*---------------------------------------------------------------*/
1991
1992 static inline Bool host_is_little_endian ( void )
1993 {
1994 UInt x = 0x76543210;
1995 UChar* p = (UChar*)(&x);
1996 return toBool(*p == 0x10);
1997 }
1998
1999 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
2000 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
2001 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
2002 {
2003 Bool mantissaIsZero;
2004 Int bexp;
2005 UChar sign;
2006 UChar* f64;
2007
2008 vassert(host_is_little_endian());
2009
2010 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
2011
2012 f64 = (UChar*)(&dbl);
2013 sign = toUChar( (f64[7] >> 7) & 1 );
2014
2015 /* First off, if the tag indicates the register was empty,
2016 return 1,0,sign,1 */
2017 if (tag == 0) {
2018 /* vex_printf("Empty\n"); */
2019 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
2020 | AMD64G_FC_MASK_C0;
2021 }
2022
2023 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
2024 bexp &= 0x7FF;
2025
2026 mantissaIsZero
2027 = toBool(
2028 (f64[6] & 0x0F) == 0
2029 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
2030 );
2031
2032 /* If both exponent and mantissa are zero, the value is zero.
2033 Return 1,0,sign,0. */
2034 if (bexp == 0 && mantissaIsZero) {
2035 /* vex_printf("Zero\n"); */
2036 return AMD64G_FC_MASK_C3 | 0
2037 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2038 }
2039
2040 /* If exponent is zero but mantissa isn't, it's a denormal.
2041 Return 1,1,sign,0. */
2042 if (bexp == 0 && !mantissaIsZero) {
2043 /* vex_printf("Denormal\n"); */
2044 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
2045 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2046 }
2047
2048 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
2049 Return 0,1,sign,1. */
2050 if (bexp == 0x7FF && mantissaIsZero) {
2051 /* vex_printf("Inf\n"); */
2052 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2053 | AMD64G_FC_MASK_C0;
2054 }
2055
2056 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2057 Return 0,0,sign,1. */
2058 if (bexp == 0x7FF && !mantissaIsZero) {
2059 /* vex_printf("NaN\n"); */
2060 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2061 }
2062
2063 /* Uh, ok, we give up. It must be a normal finite number.
2064 Return 0,1,sign,0.
2065 */
2066 /* vex_printf("normal\n"); */
2067 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2068 }
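/* Summary of the classification above, as (C3,C2,C1,C0), for
   reference:
      empty slot (tag 0)   1,0,sign,1
      zero                 1,0,sign,0
      denormal             1,1,sign,0
      infinity             0,1,sign,1
      NaN                  0,0,sign,1
      normal finite        0,1,sign,0
   where 'sign' is bit 63 of 'dbl'. */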
2069
2070
2071 /* This is used to implement both 'frstor' and 'fldenv'. The latter
2072 appears to differ from the former only in that the 8 FP registers
2073 themselves are not transferred into the guest state. */
2074 static
2075 VexEmNote do_put_x87 ( Bool moveRegs,
2076 /*IN*/Fpu_State* x87_state,
2077 /*OUT*/VexGuestAMD64State* vex_state )
2078 {
2079 Int stno, preg;
2080 UInt tag;
2081 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2082 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2083 UInt ftop = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2084 UInt tagw = x87_state->env[FP_ENV_TAG];
2085 UInt fpucw = x87_state->env[FP_ENV_CTRL];
2086 UInt c3210 = x87_state->env[FP_ENV_STAT] & 0x4700;
2087 VexEmNote ew;
2088 UInt fpround;
2089 ULong pair;
2090
2091 /* Copy registers and tags */
2092 for (stno = 0; stno < 8; stno++) {
2093 preg = (stno + ftop) & 7;
2094 tag = (tagw >> (2*preg)) & 3;
2095 if (tag == 3) {
2096 /* register is empty */
2097 /* hmm, if it's empty, does it still get written? Probably
2098 safer to say it does. If we don't, memcheck could get out
2099 of sync, in that it thinks all FP registers are defined by
2100 this helper, but in reality some have not been updated. */
2101 if (moveRegs)
2102 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2103 vexTags[preg] = 0;
2104 } else {
2105 /* register is non-empty */
2106 if (moveRegs)
2107 convert_f80le_to_f64le( &x87_state->reg[10*stno],
2108 (UChar*)&vexRegs[preg] );
2109 vexTags[preg] = 1;
2110 }
2111 }
2112
2113 /* stack pointer */
2114 vex_state->guest_FTOP = ftop;
2115
2116 /* status word */
2117 vex_state->guest_FC3210 = c3210;
2118
2119 /* handle the control word, setting FPROUND and detecting any
2120 emulation warnings. */
2121 pair = amd64g_check_fldcw ( (ULong)fpucw );
2122 fpround = (UInt)pair & 0xFFFFFFFFULL;
2123 ew = (VexEmNote)(pair >> 32);
2124
2125 vex_state->guest_FPROUND = fpround & 3;
2126
2127 /* emulation warnings --> caller */
2128 return ew;
2129 }
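/* Example of the mapping above (illustrative): with ftop == 6,
   ST(0) lives in physical register 6 and its tag is
   (tagw >> 12) & 3.  A tag of 3 marks the slot empty; any other
   value (0=valid, 1=zero, 2=special) is folded into the single
   "full" state, since guest_FPTAG only records empty-vs-full. */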
2130
2131
2132 /* Create an x87 FPU state from the guest state, as close as
2133 we can approximate it. */
2134 static
2135 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2136 /*OUT*/Fpu_State* x87_state )
2137 {
2138 Int i, stno, preg;
2139 UInt tagw;
2140 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2141 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2142 UInt ftop = vex_state->guest_FTOP;
2143 UInt c3210 = vex_state->guest_FC3210;
2144
2145 for (i = 0; i < 14; i++)
2146 x87_state->env[i] = 0;
2147
2148 x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2149 = x87_state->env[13] = 0xFFFF;
2150 x87_state->env[FP_ENV_STAT]
2151 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2152 x87_state->env[FP_ENV_CTRL]
2153 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2154
2155 /* Dump the register stack in ST order. */
2156 tagw = 0;
2157 for (stno = 0; stno < 8; stno++) {
2158 preg = (stno + ftop) & 7;
2159 if (vexTags[preg] == 0) {
2160 /* register is empty */
2161 tagw |= (3 << (2*preg));
2162 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2163 &x87_state->reg[10*stno] );
2164 } else {
2165 /* register is full. */
2166 tagw |= (0 << (2*preg));
2167 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2168 &x87_state->reg[10*stno] );
2169 }
2170 }
2171 x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2172 }
2173
2174
2175 /*---------------------------------------------------------------*/
2176 /*--- Supporting functions for XSAVE/FXSAVE. ---*/
2177 /*---------------------------------------------------------------*/
2178
2179 /* CALLED FROM GENERATED CODE */
2180 /* DIRTY HELPER (reads guest state, writes guest mem) */
2181 /* XSAVE component 0 is the x87 FPU state. */
2182 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2183 ( VexGuestAMD64State* gst, HWord addr )
2184 {
2185 /* Derived from values obtained from
2186 vendor_id : AuthenticAMD
2187 cpu family : 15
2188 model : 12
2189 model name : AMD Athlon(tm) 64 Processor 3200+
2190 stepping : 0
2191 cpu MHz : 2200.000
2192 cache size : 512 KB
2193 */
2194 /* Somewhat roundabout, but at least it's simple. */
2195 Fpu_State tmp;
2196 UShort* addrS = (UShort*)addr;
2197 UChar* addrC = (UChar*)addr;
2198 UShort fp_tags;
2199 UInt summary_tags;
2200 Int r, stno;
2201 UShort *srcS, *dstS;
2202
2203 do_get_x87( gst, &tmp );
2204
2205 /* Now build the proper fxsave x87 image from the fsave x87 image
2206 we just made. */
2207
2208 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2209 addrS[1] = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2210
2211 /* set addrS[2] in an endian-independent way */
2212 summary_tags = 0;
2213 fp_tags = tmp.env[FP_ENV_TAG];
2214 for (r = 0; r < 8; r++) {
2215 if ( ((fp_tags >> (2*r)) & 3) != 3 )
2216 summary_tags |= (1 << r);
2217 }
2218 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
2219 addrC[5] = 0; /* pad */
2220
2221 /* FOP: faulting fpu opcode. From experimentation, the real CPU
2222 does not write this field. (?!) */
2223 addrS[3] = 0; /* BOGUS */
2224
2225 /* RIP (Last x87 instruction pointer). From experimentation, the
2226 real CPU does not write this field. (?!) */
2227 addrS[4] = 0; /* BOGUS */
2228 addrS[5] = 0; /* BOGUS */
2229 addrS[6] = 0; /* BOGUS */
2230 addrS[7] = 0; /* BOGUS */
2231
2232 /* RDP (Last x87 data pointer). From experimentation, the real CPU
2233 does not write this field. (?!) */
2234 addrS[8] = 0; /* BOGUS */
2235 addrS[9] = 0; /* BOGUS */
2236 addrS[10] = 0; /* BOGUS */
2237 addrS[11] = 0; /* BOGUS */
2238
2239 /* addrS[13,12] are MXCSR -- not written */
2240 /* addrS[15,14] are MXCSR_MASK -- not written */
2241
2242 /* Copy in the FP registers, in ST order. */
2243 for (stno = 0; stno < 8; stno++) {
2244 srcS = (UShort*)(&tmp.reg[10*stno]);
2245 dstS = (UShort*)(&addrS[16 + 8*stno]);
2246 dstS[0] = srcS[0];
2247 dstS[1] = srcS[1];
2248 dstS[2] = srcS[2];
2249 dstS[3] = srcS[3];
2250 dstS[4] = srcS[4];
2251 dstS[5] = 0;
2252 dstS[6] = 0;
2253 dstS[7] = 0;
2254 }
2255 }
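/* For reference, the resulting byte offsets within the image
   (derived from the stores above and from COMPONENT_1 below):
      0 .. 1     FCW            2 .. 3    FSW
      4          FTW summary    5         pad
      6 .. 7     FOP (zeroed)   8 .. 15   RIP (zeroed)
      16 .. 23   RDP (zeroed)   24 .. 27  MXCSR
      28 .. 31   MXCSR_MASK     32+16*n   ST(n), 10 bytes used,
                                          upper 6 bytes zeroed */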
2256
2257
2258 /* CALLED FROM GENERATED CODE */
2259 /* DIRTY HELPER (reads guest state, writes guest mem) */
2260 /* XSAVE component 1 is the SSE state. */
2261 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2262 ( VexGuestAMD64State* gst, HWord addr )
2263 {
2264 UShort* addrS = (UShort*)addr;
2265 UInt mxcsr;
2266
2267 /* The only non-register parts of the SSE state are MXCSR and
2268 MXCSR_MASK. */
2269 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2270
2271 addrS[12] = toUShort(mxcsr); /* MXCSR */
2272 addrS[13] = toUShort(mxcsr >> 16);
2273
2274 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2275 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2276 }
2277
2278
2279 /* VISIBLE TO LIBVEX CLIENT */
2280 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2281 the result at the given address which represents a buffer of at
2282 least 416 bytes.
2283
2284 This function is not called from generated code. FXSAVE is dealt
2285 with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2286 functions above plus some in-line IR. This function is merely a
2287 convenience function for VEX's users.
2288 */
2289 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2290 /*OUT*/HWord fp_state )
2291 {
2292 /* Do the x87 part */
2293 amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2294
2295 /* And now the SSE part, except for the registers themselves. */
2296 amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2297
2298 /* That's the first 160 bytes of the image done. */
2299 /* Now only %xmm0 .. %xmm15 remain to be copied. If the host is
2300 big-endian, these need to be byte-swapped. */
2301 U128 *xmm = (U128 *)(fp_state + 160);
2302 vassert(host_is_little_endian());
2303
2304 # define COPY_U128(_dst,_src) \
2305 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2306 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2307 while (0)
2308
2309 COPY_U128( xmm[0], gst->guest_YMM0 );
2310 COPY_U128( xmm[1], gst->guest_YMM1 );
2311 COPY_U128( xmm[2], gst->guest_YMM2 );
2312 COPY_U128( xmm[3], gst->guest_YMM3 );
2313 COPY_U128( xmm[4], gst->guest_YMM4 );
2314 COPY_U128( xmm[5], gst->guest_YMM5 );
2315 COPY_U128( xmm[6], gst->guest_YMM6 );
2316 COPY_U128( xmm[7], gst->guest_YMM7 );
2317 COPY_U128( xmm[8], gst->guest_YMM8 );
2318 COPY_U128( xmm[9], gst->guest_YMM9 );
2319 COPY_U128( xmm[10], gst->guest_YMM10 );
2320 COPY_U128( xmm[11], gst->guest_YMM11 );
2321 COPY_U128( xmm[12], gst->guest_YMM12 );
2322 COPY_U128( xmm[13], gst->guest_YMM13 );
2323 COPY_U128( xmm[14], gst->guest_YMM14 );
2324 COPY_U128( xmm[15], gst->guest_YMM15 );
2325 # undef COPY_U128
2326 }
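/* Illustrative client usage (the names 'buf' and 'gst' are
   hypothetical, not part of the API):

      UChar buf[416];
      LibVEX_GuestAMD64_fxsave( &gst, (HWord)&buf[0] );

   416 bytes suffice because the x87/SSE header occupies bytes
   0 .. 159 and %xmm0 .. %xmm15 occupy bytes 160 .. 415. */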
2327
2328
2329 /*---------------------------------------------------------------*/
2330 /*--- Supporting functions for XRSTOR/FXRSTOR. ---*/
2331 /*---------------------------------------------------------------*/
2332
2333 /* CALLED FROM GENERATED CODE */
2334 /* DIRTY HELPER (writes guest state, reads guest mem) */
2335 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2336 ( VexGuestAMD64State* gst, HWord addr )
2337 {
2338 Fpu_State tmp;
2339 UShort* addrS = (UShort*)addr;
2340 UChar* addrC = (UChar*)addr;
2341 UShort fp_tags;
2342 Int r, stno, i;
2343
2344 /* Copy the x87 registers out of the image, into a temporary
2345 Fpu_State struct. */
2346 for (i = 0; i < 14; i++) tmp.env[i] = 0;
2347 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2348 /* fill in tmp.reg[0..7] */
2349 for (stno = 0; stno < 8; stno++) {
2350 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2351 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2352 dstS[0] = srcS[0];
2353 dstS[1] = srcS[1];
2354 dstS[2] = srcS[2];
2355 dstS[3] = srcS[3];
2356 dstS[4] = srcS[4];
2357 }
2358 /* fill in tmp.env[0..13] */
2359 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2360 tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2361
2362 fp_tags = 0;
2363 for (r = 0; r < 8; r++) {
2364 if (addrC[4] & (1<<r))
2365 fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
2366 else
2367 fp_tags |= (3 << (2*r)); /* EMPTY */
2368 }
2369 tmp.env[FP_ENV_TAG] = fp_tags;
2370
2371 /* Now write 'tmp' into the guest state. */
2372 VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2373
2374 return warnX87;
2375 }
2376
2377
2378 /* CALLED FROM GENERATED CODE */
2379 /* DIRTY HELPER (writes guest state, reads guest mem) */
2380 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2381 ( VexGuestAMD64State* gst, HWord addr )
2382 {
2383 UShort* addrS = (UShort*)addr;
2384 UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
2385 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2386 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
2387
2388 VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2389
2390 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2391 return warnXMM;
2392 }
2393
2394
2395 /* VISIBLE TO LIBVEX CLIENT */
2396 /* Do FXRSTOR from the supplied address and store the values read
2397 into the given VexGuestAMD64State structure.
2398
2399 This function is not called from generated code. FXRSTOR is dealt
2400 with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2401 functions above plus some in-line IR. This function is merely a
2402 convenience function for VEX's users.
2403 */
2404 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2405 /*MOD*/VexGuestAMD64State* gst )
2406 {
2407 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
2408 to be byte-swapped. */
2409 U128 *xmm = (U128 *)(fp_state + 160);
2410
2411 vassert(host_is_little_endian());
2412
2413 # define COPY_U128(_dst,_src) \
2414 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2415 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2416 while (0)
2417
2418 COPY_U128( gst->guest_YMM0, xmm[0] );
2419 COPY_U128( gst->guest_YMM1, xmm[1] );
2420 COPY_U128( gst->guest_YMM2, xmm[2] );
2421 COPY_U128( gst->guest_YMM3, xmm[3] );
2422 COPY_U128( gst->guest_YMM4, xmm[4] );
2423 COPY_U128( gst->guest_YMM5, xmm[5] );
2424 COPY_U128( gst->guest_YMM6, xmm[6] );
2425 COPY_U128( gst->guest_YMM7, xmm[7] );
2426 COPY_U128( gst->guest_YMM8, xmm[8] );
2427 COPY_U128( gst->guest_YMM9, xmm[9] );
2428 COPY_U128( gst->guest_YMM10, xmm[10] );
2429 COPY_U128( gst->guest_YMM11, xmm[11] );
2430 COPY_U128( gst->guest_YMM12, xmm[12] );
2431 COPY_U128( gst->guest_YMM13, xmm[13] );
2432 COPY_U128( gst->guest_YMM14, xmm[14] );
2433 COPY_U128( gst->guest_YMM15, xmm[15] );
2434
2435 # undef COPY_U128
2436
2437 VexEmNote warnXMM
2438 = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2439 VexEmNote warnX87
2440 = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2441
2442 /* Prefer an X87 emwarn over an XMM one, if both exist. */
2443 if (warnX87 != EmNote_NONE)
2444 return warnX87;
2445 else
2446 return warnXMM;
2447 }
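/* Illustrative client usage (again with hypothetical names),
   restoring the image written by the fxsave sketch above and
   checking for emulation warnings:

      VexEmNote ew = LibVEX_GuestAMD64_fxrstor( (HWord)&buf[0], &gst );
      if (ew != EmNote_NONE)
         ... report the warning to the user ...
*/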
2448
2449
2450 /*---------------------------------------------------------------*/
2451 /*--- Supporting functions for FSAVE/FRSTOR ---*/
2452 /*---------------------------------------------------------------*/
2453
2454 /* DIRTY HELPER (writes guest state) */
2455 /* Initialise the x87 FPU state as per 'finit'. */
2456 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2457 {
2458 Int i;
2459 gst->guest_FTOP = 0;
2460 for (i = 0; i < 8; i++) {
2461 gst->guest_FPTAG[i] = 0; /* empty */
2462 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2463 }
2464 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2465 gst->guest_FC3210 = 0;
2466 }
2467
2468
2469 /* CALLED FROM GENERATED CODE */
2470 /* DIRTY HELPER (reads guest memory) */
2471 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2472 {
2473 ULong f64;
2474 convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2475 return f64;
2476 }
2477
2478 /* CALLED FROM GENERATED CODE */
2479 /* DIRTY HELPER (writes guest memory) */
2480 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2481 {
2482 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2483 }
2484
2485
2486 /* CALLED FROM GENERATED CODE */
2487 /* CLEAN HELPER */
2488 /* mxcsr[15:0] contains an SSE native format MXCSR value.
2489 Extract from it the required SSEROUND value and any resulting
2490 emulation warning, and return (warn << 32) | sseround value.
2491 */
2492 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2493 {
2494 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
2495 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2496 ULong rmode = (mxcsr >> 13) & 3;
2497
2498 /* Detect any required emulation warnings. */
2499 VexEmNote ew = EmNote_NONE;
2500
2501 if ((mxcsr & 0x1F80) != 0x1F80) {
2502 /* unmasked exceptions! */
2503 ew = EmWarn_X86_sseExns;
2504 }
2505 else
2506 if (mxcsr & (1<<15)) {
2507 /* FZ is set */
2508 ew = EmWarn_X86_fz;
2509 }
2510 else
2511 if (mxcsr & (1<<6)) {
2512 /* DAZ is set */
2513 ew = EmWarn_X86_daz;
2514 }
2515
2516 return (((ULong)ew) << 32) | ((ULong)rmode);
2517 }
2518
2519
2520 /* CALLED FROM GENERATED CODE */
2521 /* CLEAN HELPER */
2522 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2523 native format MXCSR value. */
2524 ULong amd64g_create_mxcsr ( ULong sseround )
2525 {
2526 sseround &= 3;
2527 return 0x1F80 | (sseround << 13);
2528 }
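/* Worked example (illustrative): amd64g_create_mxcsr(2) yields
   0x1F80 | (2 << 13) = 0x5F80; feeding that back through
   amd64g_check_ldmxcsr gives rmode 2 (Irrm_PosINF) and no warning,
   since all exceptions are masked and neither FZ (bit 15) nor
   DAZ (bit 6) is set. */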
2529
2530
2531 /* CLEAN HELPER */
2532 /* fpucw[15:0] contains an x87 native format FPU control word.
2533 Extract from it the required FPROUND value and any resulting
2534 emulation warning, and return (warn << 32) | fpround value.
2535 */
2536 ULong amd64g_check_fldcw ( ULong fpucw )
2537 {
2538 /* Decide on a rounding mode. fpucw[11:10] holds it. */
2539 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2540 ULong rmode = (fpucw >> 10) & 3;
2541
2542 /* Detect any required emulation warnings. */
2543 VexEmNote ew = EmNote_NONE;
2544
2545 if ((fpucw & 0x3F) != 0x3F) {
2546 /* unmasked exceptions! */
2547 ew = EmWarn_X86_x87exns;
2548 }
2549 else
2550 if (((fpucw >> 8) & 3) != 3) {
2551 /* unsupported precision */
2552 ew = EmWarn_X86_x87precision;
2553 }
2554
2555 return (((ULong)ew) << 32) | ((ULong)rmode);
2556 }
2557
2558
2559 /* CLEAN HELPER */
2560 /* Given fpround as an IRRoundingMode value, create a suitable x87
2561 native format FPU control word. */
2562 ULong amd64g_create_fpucw ( ULong fpround )
2563 {
2564 fpround &= 3;
2565 return 0x037F | (fpround << 10);
2566 }
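/* Worked example (illustrative): amd64g_create_fpucw(3) yields
   0x037F | (3 << 10) = 0x0F7F, and amd64g_check_fldcw(0x0F7F)
   returns rmode 3 (Irrm_ZERO) with no warning: the six exception
   mask bits are all set and the precision field (bits 9:8) is 3,
   the only setting accepted without a warning. */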
2567
2568
2569 /* This is used to implement 'fldenv'.
2570 Reads 28 bytes at x87_state[0 .. 27]. */
2571 /* CALLED FROM GENERATED CODE */
2572 /* DIRTY HELPER */
2573 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2574 /*IN*/HWord x87_state)
2575 {
2576 return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2577 }
2578
2579
2580 /* CALLED FROM GENERATED CODE */
2581 /* DIRTY HELPER */
2582 /* Create an x87 FPU env from the guest state, as close as we can
2583 approximate it. Writes 28 bytes at x87_state[0..27]. */
2584 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2585 /*OUT*/HWord x87_state )
2586 {
2587 Int i, stno, preg;
2588 UInt tagw;
2589 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2590 Fpu_State* x87 = (Fpu_State*)x87_state;
2591 UInt ftop = vex_state->guest_FTOP;
2592 ULong c3210 = vex_state->guest_FC3210;
2593
2594 for (i = 0; i < 14; i++)
2595 x87->env[i] = 0;
2596
2597 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2598 x87->env[FP_ENV_STAT]
2599 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2600 x87->env[FP_ENV_CTRL]
2601 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2602
2603 /* Compute the x87 tag word. */
2604 tagw = 0;
2605 for (stno = 0; stno < 8; stno++) {
2606 preg = (stno + ftop) & 7;
2607 if (vexTags[preg] == 0) {
2608 /* register is empty */
2609 tagw |= (3 << (2*preg));
2610 } else {
2611 /* register is full. */
2612 tagw |= (0 << (2*preg));
2613 }
2614 }
2615 x87->env[FP_ENV_TAG] = toUShort(tagw);
2616
2617 /* We don't dump the x87 registers, though. */
2618 }
2619
2620
2621 /* This is used to implement 'fnsave'.
2622 Writes 108 bytes at x87_state[0 .. 107]. */
2623 /* CALLED FROM GENERATED CODE */
2624 /* DIRTY HELPER */
2625 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2626 /*OUT*/HWord x87_state)
2627 {
2628 do_get_x87( vex_state, (Fpu_State*)x87_state );
2629 }
2630
2631
2632 /* This is used to implement 'fnsaves'.
2633 Writes 94 bytes at x87_state[0 .. 93]. */
2634 /* CALLED FROM GENERATED CODE */
2635 /* DIRTY HELPER */
2636 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2637 /*OUT*/HWord x87_state)
2638 {
2639 Int i, stno, preg;
2640 UInt tagw;
2641 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2642 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2643 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2644 UInt ftop = vex_state->guest_FTOP;
2645 UInt c3210 = vex_state->guest_FC3210;
2646
2647 for (i = 0; i < 7; i++)
2648 x87->env[i] = 0;
2649
2650 x87->env[FPS_ENV_STAT]
2651 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2652 x87->env[FPS_ENV_CTRL]
2653 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2654
2655 /* Dump the register stack in ST order. */
2656 tagw = 0;
2657 for (stno = 0; stno < 8; stno++) {
2658 preg = (stno + ftop) & 7;
2659 if (vexTags[preg] == 0) {
2660 /* register is empty */
2661 tagw |= (3 << (2*preg));
2662 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2663 &x87->reg[10*stno] );
2664 } else {
2665 /* register is full. */
2666 tagw |= (0 << (2*preg));
2667 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2668 &x87->reg[10*stno] );
2669 }
2670 }
2671 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2672 }
2673
2674
2675 /* This is used to implement 'frstor'.
2676 Reads 108 bytes at x87_state[0 .. 107]. */
2677 /* CALLED FROM GENERATED CODE */
2678 /* DIRTY HELPER */
2679 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2680 /*IN*/HWord x87_state)
2681 {
2682 return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2683 }
2684
2685
2686 /* This is used to implement 'frstors'.
2687 Reads 94 bytes at x87_state[0 .. 93]. */
2688 /* CALLED FROM GENERATED CODE */
2689 /* DIRTY HELPER */
2690 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2691 /*IN*/HWord x87_state)
2692 {
2693 Int stno, preg;
2694 UInt tag;
2695 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2696 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2697 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2698 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2699 UInt tagw = x87->env[FPS_ENV_TAG];
2700 UInt fpucw = x87->env[FPS_ENV_CTRL];
2701 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2702 VexEmNote ew;
2703 UInt fpround;
2704 ULong pair;
2705
2706 /* Copy registers and tags */
2707 for (stno = 0; stno < 8; stno++) {
2708 preg = (stno + ftop) & 7;
2709 tag = (tagw >> (2*preg)) & 3;
2710 if (tag == 3) {
2711 /* register is empty */
2712 /* hmm, if it's empty, does it still get written? Probably
2713 safer to say it does. If we don't, memcheck could get out
2714 of sync, in that it thinks all FP registers are defined by
2715 this helper, but in reality some have not been updated. */
2716 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2717 vexTags[preg] = 0;
2718 } else {
2719 /* register is non-empty */
2720 convert_f80le_to_f64le( &x87->reg[10*stno],
2721 (UChar*)&vexRegs[preg] );
2722 vexTags[preg] = 1;
2723 }
2724 }
2725
2726 /* stack pointer */
2727 vex_state->guest_FTOP = ftop;
2728
2729 /* status word */
2730 vex_state->guest_FC3210 = c3210;
2731
2732 /* handle the control word, setting FPROUND and detecting any
2733 emulation warnings. */
2734 pair = amd64g_check_fldcw ( (ULong)fpucw );
2735 fpround = (UInt)pair & 0xFFFFFFFFULL;
2736 ew = (VexEmNote)(pair >> 32);
2737
2738 vex_state->guest_FPROUND = fpround & 3;
2739
2740 /* emulation warnings --> caller */
2741 return ew;
2742 }
2743
2744
2745 /*---------------------------------------------------------------*/
2746 /*--- CPUID helpers. ---*/
2747 /*---------------------------------------------------------------*/
2748
2749 /* Claim to be the following CPU, which is probably representative of
2750 the lowliest (earliest) amd64 offerings. It can do neither sse3
2751 nor cx16.
2752
2753 vendor_id : AuthenticAMD
2754 cpu family : 15
2755 model : 5
2756 model name : AMD Opteron (tm) Processor 848
2757 stepping : 10
2758 cpu MHz : 1797.682
2759 cache size : 1024 KB
2760 fpu : yes
2761 fpu_exception : yes
2762 cpuid level : 1
2763 wp : yes
2764 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2765 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2766 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2767 bogomips : 3600.62
2768 TLB size : 1088 4K pages
2769 clflush size : 64
2770 cache_alignment : 64
2771 address sizes : 40 bits physical, 48 bits virtual
2772 power management: ts fid vid ttp
2773
2774 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2775 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2776 and 3dnowext is 80000001.EDX.30.
2777 */
2778 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2779 {
2780 # define SET_ABCD(_a,_b,_c,_d) \
2781 do { st->guest_RAX = (ULong)(_a); \
2782 st->guest_RBX = (ULong)(_b); \
2783 st->guest_RCX = (ULong)(_c); \
2784 st->guest_RDX = (ULong)(_d); \
2785 } while (0)
2786
2787 switch (0xFFFFFFFF & st->guest_RAX) {
2788 case 0x00000000:
2789 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2790 break;
2791 case 0x00000001:
2792 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2793 break;
2794 case 0x80000000:
2795 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2796 break;
2797 case 0x80000001:
2798 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
2799 the original it-is-supported value that the h/w provides.
2800 See #291568. */
2801 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2802 0x21d3fbff);
2803 break;
2804 case 0x80000002:
2805 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2806 break;
2807 case 0x80000003:
2808 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2809 break;
2810 case 0x80000004:
2811 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2812 break;
2813 case 0x80000005:
2814 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2815 break;
2816 case 0x80000006:
2817 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2818 break;
2819 case 0x80000007:
2820 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2821 break;
2822 case 0x80000008:
2823 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2824 break;
2825 default:
2826 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2827 break;
2828 }
2829 # undef SET_ABCD
2830 }
2831
2832
2833 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2834 capable.
2835
2836 vendor_id : GenuineIntel
2837 cpu family : 6
2838 model : 15
2839 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2840 stepping : 6
2841 cpu MHz : 2394.000
2842 cache size : 4096 KB
2843 physical id : 0
2844 siblings : 2
2845 core id : 0
2846 cpu cores : 2
2847 fpu : yes
2848 fpu_exception : yes
2849 cpuid level : 10
2850 wp : yes
2851 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2852 mtrr pge mca cmov pat pse36 clflush dts acpi
2853 mmx fxsr sse sse2 ss ht tm syscall nx lm
2854 constant_tsc pni monitor ds_cpl vmx est tm2
2855 cx16 xtpr lahf_lm
2856 bogomips : 4798.78
2857 clflush size : 64
2858 cache_alignment : 64
2859 address sizes : 36 bits physical, 48 bits virtual
2860 power management:
2861 */
2862 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2863 {
2864 # define SET_ABCD(_a,_b,_c,_d) \
2865 do { st->guest_RAX = (ULong)(_a); \
2866 st->guest_RBX = (ULong)(_b); \
2867 st->guest_RCX = (ULong)(_c); \
2868 st->guest_RDX = (ULong)(_d); \
2869 } while (0)
2870
2871 switch (0xFFFFFFFF & st->guest_RAX) {
2872 case 0x00000000:
2873 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2874 break;
2875 case 0x00000001:
2876 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2877 break;
2878 case 0x00000002:
2879 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2880 break;
2881 case 0x00000003:
2882 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2883 break;
2884 case 0x00000004: {
2885 switch (0xFFFFFFFF & st->guest_RCX) {
2886 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2887 0x0000003f, 0x00000001); break;
2888 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2889 0x0000003f, 0x00000001); break;
2890 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2891 0x00000fff, 0x00000001); break;
2892 default: SET_ABCD(0x00000000, 0x00000000,
2893 0x00000000, 0x00000000); break;
2894 }
2895 break;
2896 }
2897 case 0x00000005:
2898 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2899 break;
2900 case 0x00000006:
2901 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2902 break;
2903 case 0x00000007:
2904 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2905 break;
2906 case 0x00000008:
2907 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2908 break;
2909 case 0x00000009:
2910 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2911 break;
2912 case 0x0000000a:
2913 unhandled_eax_value:
2914 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2915 break;
2916 case 0x80000000:
2917 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2918 break;
2919 case 0x80000001:
2920 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2921 break;
2922 case 0x80000002:
2923 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2924 break;
2925 case 0x80000003:
2926 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2927 break;
2928 case 0x80000004:
2929 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2930 break;
2931 case 0x80000005:
2932 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2933 break;
2934 case 0x80000006:
2935 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2936 break;
2937 case 0x80000007:
2938 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2939 break;
2940 case 0x80000008:
2941 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2942 break;
2943 default:
2944 goto unhandled_eax_value;
2945 }
2946 # undef SET_ABCD
2947 }
2948
2949
2950 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2951 capable.
2952
2953 vendor_id : GenuineIntel
2954 cpu family : 6
2955 model : 37
2956 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
2957 stepping : 2
2958 cpu MHz : 3334.000
2959 cache size : 4096 KB
2960 physical id : 0
2961 siblings : 4
2962 core id : 0
2963 cpu cores : 2
2964 apicid : 0
2965 initial apicid : 0
2966 fpu : yes
2967 fpu_exception : yes
2968 cpuid level : 11
2969 wp : yes
2970 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2971 mtrr pge mca cmov pat pse36 clflush dts acpi
2972 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2973 lm constant_tsc arch_perfmon pebs bts rep_good
2974 xtopology nonstop_tsc aperfmperf pni pclmulqdq
2975 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2976 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2977 arat tpr_shadow vnmi flexpriority ept vpid
2978 bogomips : 6957.57
2979 clflush size : 64
2980 cache_alignment : 64
2981 address sizes : 36 bits physical, 48 bits virtual
2982 power management:
2983 */
2984 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2985 {
2986 # define SET_ABCD(_a,_b,_c,_d) \
2987 do { st->guest_RAX = (ULong)(_a); \
2988 st->guest_RBX = (ULong)(_b); \
2989 st->guest_RCX = (ULong)(_c); \
2990 st->guest_RDX = (ULong)(_d); \
2991 } while (0)
2992
2993 UInt old_eax = (UInt)st->guest_RAX;
2994 UInt old_ecx = (UInt)st->guest_RCX;
2995
2996 switch (old_eax) {
2997 case 0x00000000:
2998 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2999 break;
3000 case 0x00000001:
3001 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
3002 break;
3003 case 0x00000002:
3004 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
3005 break;
3006 case 0x00000003:
3007 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3008 break;
3009 case 0x00000004:
3010 switch (old_ecx) {
3011 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3012 0x0000003f, 0x00000000); break;
3013 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
3014 0x0000007f, 0x00000000); break;
3015 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3016 0x000001ff, 0x00000000); break;
3017 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3018 0x00000fff, 0x00000002); break;
3019 default: SET_ABCD(0x00000000, 0x00000000,
3020 0x00000000, 0x00000000); break;
3021 }
3022 break;
3023 case 0x00000005:
3024 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3025 break;
3026 case 0x00000006:
3027 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
3028 break;
3029 case 0x00000007:
3030 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3031 break;
3032 case 0x00000008:
3033 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3034 break;
3035 case 0x00000009:
3036 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3037 break;
3038 case 0x0000000a:
3039 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
3040 break;
3041 case 0x0000000b:
3042 switch (old_ecx) {
3043 case 0x00000000:
3044 SET_ABCD(0x00000001, 0x00000002,
3045 0x00000100, 0x00000000); break;
3046 case 0x00000001:
3047 SET_ABCD(0x00000004, 0x00000004,
3048 0x00000201, 0x00000000); break;
3049 default:
3050 SET_ABCD(0x00000000, 0x00000000,
3051 old_ecx, 0x00000000); break;
3052 }
3053 break;
3054 case 0x0000000c:
3055 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3056 break;
3057 case 0x0000000d:
3058 switch (old_ecx) {
3059 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3060 0x00000100, 0x00000000); break;
3061 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3062 0x00000201, 0x00000000); break;
3063 default: SET_ABCD(0x00000000, 0x00000000,
3064 old_ecx, 0x00000000); break;
3065 }
3066 break;
3067 case 0x80000000:
3068 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3069 break;
3070 case 0x80000001:
3071 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3072 break;
3073 case 0x80000002:
3074 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3075 break;
3076 case 0x80000003:
3077 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3078 break;
3079 case 0x80000004:
3080 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3081 break;
3082 case 0x80000005:
3083 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3084 break;
3085 case 0x80000006:
3086 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3087 break;
3088 case 0x80000007:
3089 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3090 break;
3091 case 0x80000008:
3092 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3093 break;
3094 default:
3095 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3096 break;
3097 }
3098 # undef SET_ABCD
3099 }
3100
3101
3102 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3103 capable. Plus (kludge!) it "supports" HTM.
3104
3105 Also with the following change: claim that XSaveOpt is not
3106 available, by having cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1
3107 on the real CPU. Consequently, programs that correctly observe
3108 these CPUID values should only try to use 3 of the 8 XSave-family
3109 instructions: XGETBV, XSAVE and XRSTOR. In particular this avoids
3110 having to implement the compacted or optimised save/restore
3111 variants.
3112
3113 vendor_id : GenuineIntel
3114 cpu family : 6
3115 model : 42
3116 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3117 stepping : 7
3118 cpu MHz : 1600.000
3119 cache size : 6144 KB
3120 physical id : 0
3121 siblings : 4
3122 core id : 3
3123 cpu cores : 4
3124 apicid : 6
3125 initial apicid : 6
3126 fpu : yes
3127 fpu_exception : yes
3128 cpuid level : 13
3129 wp : yes
3130 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3131 mtrr pge mca cmov pat pse36 clflush dts acpi
3132 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3133 lm constant_tsc arch_perfmon pebs bts rep_good
3134 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3135 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3136 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3137 lahf_lm ida arat epb xsaveopt pln pts dts
3138 tpr_shadow vnmi flexpriority ept vpid
3139
3140 bogomips : 5768.94
3141 clflush size : 64
3142 cache_alignment : 64
3143 address sizes : 36 bits physical, 48 bits virtual
3144 power management:
3145 */
3146 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
3147 {
3148 # define SET_ABCD(_a,_b,_c,_d) \
3149 do { st->guest_RAX = (ULong)(_a); \
3150 st->guest_RBX = (ULong)(_b); \
3151 st->guest_RCX = (ULong)(_c); \
3152 st->guest_RDX = (ULong)(_d); \
3153 } while (0)
3154
3155 UInt old_eax = (UInt)st->guest_RAX;
3156 UInt old_ecx = (UInt)st->guest_RCX;
3157
3158 switch (old_eax) {
3159 case 0x00000000:
3160 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3161 break;
3162 case 0x00000001:
3163 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3164 break;
3165 case 0x00000002:
3166 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3167 break;
3168 case 0x00000003:
3169 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3170 break;
3171 case 0x00000004:
3172 switch (old_ecx) {
3173 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3174 0x0000003f, 0x00000000); break;
3175 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3176 0x0000003f, 0x00000000); break;
3177 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3178 0x000001ff, 0x00000000); break;
3179 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3180 0x00001fff, 0x00000006); break;
3181 default: SET_ABCD(0x00000000, 0x00000000,
3182 0x00000000, 0x00000000); break;
3183 }
3184 break;
3185 case 0x00000005:
3186 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3187 break;
3188 case 0x00000006:
3189 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3190 break;
3191 case 0x00000007:
3192 SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3193 break;
3194 case 0x00000008:
3195 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3196 break;
3197 case 0x00000009:
3198 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3199 break;
3200 case 0x0000000a:
3201 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3202 break;
3203 case 0x0000000b:
3204 switch (old_ecx) {
3205 case 0x00000000:
3206 SET_ABCD(0x00000001, 0x00000001,
3207 0x00000100, 0x00000000); break;
3208 case 0x00000001:
3209 SET_ABCD(0x00000004, 0x00000004,
3210 0x00000201, 0x00000000); break;
3211 default:
3212 SET_ABCD(0x00000000, 0x00000000,
3213 old_ecx, 0x00000000); break;
3214 }
3215 break;
3216 case 0x0000000c:
3217 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3218 break;
3219 case 0x0000000d:
3220 switch (old_ecx) {
3221 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3222 0x00000340, 0x00000000); break;
3223 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3224 0x00000000, 0x00000000); break;
3225 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3226 0x00000000, 0x00000000); break;
3227 default: SET_ABCD(0x00000000, 0x00000000,
3228 0x00000000, 0x00000000); break;
3229 }
3230 break;
3231 case 0x0000000e:
3232 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3233 break;
3234 case 0x0000000f:
3235 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3236 break;
3237 case 0x80000000:
3238 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3239 break;
3240 case 0x80000001:
3241 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3242 break;
3243 case 0x80000002:
3244 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3245 break;
3246 case 0x80000003:
3247 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3248 break;
3249 case 0x80000004:
3250 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3251 break;
3252 case 0x80000005:
3253 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3254 break;
3255 case 0x80000006:
3256 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3257 break;
3258 case 0x80000007:
3259 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3260 break;
3261 case 0x80000008:
3262 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3263 break;
3264 default:
3265 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3266 break;
3267 }
3268 # undef SET_ABCD
3269 }
3270
3271
3272 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3273
3274 With the following change: claim that XSaveOpt is not available, by
3275 having cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1 on the real
3276 CPU. Consequently, programs that correctly observe these CPUID
3277 values should only try to use 3 of the 8 XSave-family instructions:
3278 XGETBV, XSAVE and XRSTOR. In particular this avoids having to
3279 implement the compacted or optimised save/restore variants.
3280
3281 vendor_id : GenuineIntel
3282 cpu family : 6
3283 model : 60
3284 model name : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3285 stepping : 3
3286 microcode : 0x1c
3287 cpu MHz : 919.957
3288 cache size : 8192 KB
3289 physical id : 0
3290 siblings : 4
3291 core id : 3
3292 cpu cores : 4
3293 apicid : 6
3294 initial apicid : 6
3295 fpu : yes
3296 fpu_exception : yes
3297 cpuid level : 13
3298 wp : yes
3299 flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3300 cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3301 tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3302 arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3303 aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3304 vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3305 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3306 avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3307 tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3308 bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3309 bugs :
3310 bogomips : 5786.68
3311 clflush size : 64
3312 cache_alignment : 64
3313 address sizes : 39 bits physical, 48 bits virtual
3314 power management:
3315 */
3316 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
3317 {
3318 # define SET_ABCD(_a,_b,_c,_d) \
3319 do { st->guest_RAX = (ULong)(_a); \
3320 st->guest_RBX = (ULong)(_b); \
3321 st->guest_RCX = (ULong)(_c); \
3322 st->guest_RDX = (ULong)(_d); \
3323 } while (0)
3324
3325 UInt old_eax = (UInt)st->guest_RAX;
3326 UInt old_ecx = (UInt)st->guest_RCX;
3327
3328 switch (old_eax) {
3329 case 0x00000000:
3330 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3331 break;
3332 case 0x00000001:
3333 /* Don't advertise RDRAND support, bit 30 in ECX. */
3334 SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3335 break;
3336 case 0x00000002:
3337 SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3338 break;
3339 case 0x00000003:
3340 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3341 break;
3342 case 0x00000004:
3343 switch (old_ecx) {
3344 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3345 0x0000003f, 0x00000000); break;
3346 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3347 0x0000003f, 0x00000000); break;
3348 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3349 0x000001ff, 0x00000000); break;
3350 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3351 0x00001fff, 0x00000006); break;
3352 default: SET_ABCD(0x00000000, 0x00000000,
3353 0x00000000, 0x00000000); break;
3354 }
3355 break;
3356 case 0x00000005:
3357 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3358 break;
3359 case 0x00000006:
3360 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3361 break;
3362 case 0x00000007:
3363 switch (old_ecx) {
3364 case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3365 0x00000000, 0x00000000); break;
3366 default: SET_ABCD(0x00000000, 0x00000000,
3367 0x00000000, 0x00000000); break;
3368 }
3369 break;
3370 case 0x00000008:
3371 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3372 break;
3373 case 0x00000009:
3374 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3375 break;
3376 case 0x0000000a:
3377 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3378 break;
3379 case 0x0000000b:
3380 switch (old_ecx) {
3381 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3382 0x00000100, 0x00000002); break;
3383 case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3384 0x00000201, 0x00000002); break;
3385 default: SET_ABCD(0x00000000, 0x00000000,
3386 old_ecx, 0x00000002); break;
3387 }
3388 break;
3389 case 0x0000000c:
3390 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3391 break;
3392 case 0x0000000d:
3393 switch (old_ecx) {
3394 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3395 0x00000340, 0x00000000); break;
3396 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3397 0x00000000, 0x00000000); break;
3398 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3399 0x00000000, 0x00000000); break;
3400 default: SET_ABCD(0x00000000, 0x00000000,
3401 0x00000000, 0x00000000); break;
3402 }
3403 break;
3404 case 0x80000000:
3405 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3406 break;
3407 case 0x80000001:
3408 SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3409 break;
3410 case 0x80000002:
3411 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3412 break;
3413 case 0x80000003:
3414 SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3415 break;
3416 case 0x80000004:
3417 SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3418 break;
3419 case 0x80000005:
3420 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3421 break;
3422 case 0x80000006:
3423 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3424 break;
3425 case 0x80000007:
3426 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3427 break;
3428 case 0x80000008:
3429 SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3430 break;
3431 default:
3432 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3433 break;
3434 }
3435 # undef SET_ABCD
3436 }
3437
3438
3439 /*---------------------------------------------------------------*/
3440 /*--- Misc integer helpers, including rotates and crypto. ---*/
3441 /*---------------------------------------------------------------*/
3442
3443 ULong amd64g_calculate_RCR ( ULong arg,
3444 ULong rot_amt,
3445 ULong rflags_in,
3446 Long szIN )
3447 {
3448 Bool wantRflags = toBool(szIN < 0);
3449 ULong sz = wantRflags ? (-szIN) : szIN;
3450 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3451 ULong cf=0, of=0, tempcf;
3452
3453 switch (sz) {
3454 case 8:
3455 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3456 of = ((arg >> 63) ^ cf) & 1;
3457 while (tempCOUNT > 0) {
3458 tempcf = arg & 1;
3459 arg = (arg >> 1) | (cf << 63);
3460 cf = tempcf;
3461 tempCOUNT--;
3462 }
3463 break;
3464 case 4:
3465 while (tempCOUNT >= 33) tempCOUNT -= 33;
3466 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3467 of = ((arg >> 31) ^ cf) & 1;
3468 while (tempCOUNT > 0) {
3469 tempcf = arg & 1;
3470 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3471 cf = tempcf;
3472 tempCOUNT--;
3473 }
3474 break;
3475 case 2:
3476 while (tempCOUNT >= 17) tempCOUNT -= 17;
3477 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3478 of = ((arg >> 15) ^ cf) & 1;
3479 while (tempCOUNT > 0) {
3480 tempcf = arg & 1;
3481 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3482 cf = tempcf;
3483 tempCOUNT--;
3484 }
3485 break;
3486 case 1:
3487 while (tempCOUNT >= 9) tempCOUNT -= 9;
3488 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3489 of = ((arg >> 7) ^ cf) & 1;
3490 while (tempCOUNT > 0) {
3491 tempcf = arg & 1;
3492 arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
3493 cf = tempcf;
3494 tempCOUNT--;
3495 }
3496 break;
3497 default:
3498 vpanic("calculate_RCR(amd64g): invalid size");
3499 }
3500
3501 cf &= 1;
3502 of &= 1;
3503 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3504 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3505
3506 /* caller can ask to have back either the resulting flags or
3507 resulting value, but not both */
3508 return wantRflags ? rflags_in : arg;
3509 }
3510
ULong amd64g_calculate_RCL ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   ULong cf=0, of=0, tempcf;

   switch (sz) {
      case 8:
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 63) & 1;
            arg    = (arg << 1) | (cf & 1);
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 63) ^ cf) & 1;
         break;
      case 4:
         while (tempCOUNT >= 33) tempCOUNT -= 33;
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 31) & 1;
            arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 31) ^ cf) & 1;
         break;
      case 2:
         while (tempCOUNT >= 17) tempCOUNT -= 17;
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 15) & 1;
            arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 15) ^ cf) & 1;
         break;
      case 1:
         while (tempCOUNT >= 9) tempCOUNT -= 9;
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 7) & 1;
            arg    = 0xFFULL & ((arg << 1) | (cf & 1));
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 7) ^ cf) & 1;
         break;
      default:
         vpanic("calculate_RCL(amd64g): invalid size");
   }

   cf &= 1;
   of &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);

   return wantRflags ? rflags_in : arg;
}
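
/* Note on the reductions above: RCR/RCL rotate through a
   (width+1)-bit quantity consisting of CF and the operand, so the
   effective count is taken mod 9/17/33 for the 1/2/4 byte sizes,
   while the 8 byte size only gets the initial mod-64 masking, as the
   hardware does.  Worked example (illustrative, not from the
   original source): an 8-bit RCR of arg=0x81 with CF=0 and count 1
   gives arg=0x40 and CF=1: the old CF enters bit 7 and the old
   bit 0 becomes the new CF. */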

/* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
 */
ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
{
   ULong hi, lo, tmp, A[16];

   A[0] = 0;            A[1] = a;
   A[2] = A[1] << 1;    A[3] = A[2] ^ a;
   A[4] = A[2] << 1;    A[5] = A[4] ^ a;
   A[6] = A[3] << 1;    A[7] = A[6] ^ a;
   A[8] = A[4] << 1;    A[9] = A[8] ^ a;
   A[10] = A[5] << 1;   A[11] = A[10] ^ a;
   A[12] = A[6] << 1;   A[13] = A[12] ^ a;
   A[14] = A[7] << 1;   A[15] = A[14] ^ a;

   lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
   hi = lo >> 56;
   lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
   hi = (hi << 8) | (lo >> 56);
   lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
   hi = (hi << 8) | (lo >> 56);
   lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
   hi = (hi << 8) | (lo >> 56);
   lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
   hi = (hi << 8) | (lo >> 56);
   lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
   hi = (hi << 8) | (lo >> 56);
   lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
   hi = (hi << 8) | (lo >> 56);
   lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];

   ULong m0 = -1;
   m0 /= 255;   /* m0 == 0x0101010101010101: the low bit of each byte */
   tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
   tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
   tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
   tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
   tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
   tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
   tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;

   return which ? hi : lo;
}
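
/* Illustrative note (not from the original source): the product is
   computed in GF(2)[x], so "addition" is XOR and there are no
   carries.  For example, amd64g_calculate_pclmul(3, 3, 0) == 5,
   since (x+1)*(x+1) = x^2 + 1, whereas ordinary integer
   multiplication would give 9. */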

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-amd64 platforms, return 1. */
ULong amd64g_dirtyhelper_RDTSC ( void )
{
#  if defined(__x86_64__)
   UInt eax, edx;
   __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
   return (((ULong)edx) << 32) | ((ULong)eax);
#  else
   return 1ULL;
#  endif
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-amd64 platforms, do nothing. */
/* This uses a different calling convention from _RDTSC just above,
   purely because of the difficulty of returning 96 bits from a C
   function: RDTSC produces only 64 bits and so, on amd64, can simply
   return them. */
void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
{
#  if defined(__x86_64__)
   UInt eax, ecx, edx;
   __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
   st->guest_RAX = (ULong)eax;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
#  else
   /* Do nothing. */
#  endif
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-amd64 platforms, return 0. */
ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
{
#  if defined(__x86_64__)
   ULong r = 0;
   portno &= 0xFFFF;
   switch (sz) {
      case 4:
         __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
                              : "=a" (r) : "Nd" (portno));
         break;
      case 2:
         __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
                              : "=a" (r) : "Nd" (portno));
         break;
      case 1:
         __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
                              : "=a" (r) : "Nd" (portno));
         break;
      default:
         break; /* note: no 64-bit version of insn exists */
   }
   return r;
#  else
   return 0;
#  endif
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-amd64 platforms, do nothing. */
void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
{
#  if defined(__x86_64__)
   portno &= 0xFFFF;
   switch (sz) {
      case 4:
         __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
                              : : "a" (data), "Nd" (portno));
         break;
      case 2:
         __asm__ __volatile__("outw %w0, %w1"
                              : : "a" (data), "Nd" (portno));
         break;
      case 1:
         __asm__ __volatile__("outb %b0, %w1"
                              : : "a" (data), "Nd" (portno));
         break;
      default:
         break; /* note: no 64-bit version of insn exists */
   }
#  else
   /* do nothing */
#  endif
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-amd64 platforms, write zeroes instead. */
/* op = 0: call the native SGDT instruction.
   op = 1: call the native SIDT instruction.
*/
void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
#  if defined(__x86_64__)
   switch (op) {
      case 0:
         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
         break;
      case 1:
         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
         break;
      default:
         vpanic("amd64g_dirtyhelper_SxDT");
   }
#  else
   /* Can't run the instruction on this host.  Zero out the 10-byte
      pseudo-descriptor (2-byte limit plus 8-byte base) so the guest
      at least sees defined data. */
   UChar* p = (UChar*)address;
   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
   p[6] = p[7] = p[8] = p[9] = 0;
#  endif
}

/*---------------------------------------------------------------*/
/*--- Helpers for MMX/SSE/SSE2.                               ---*/
/*---------------------------------------------------------------*/

static inline UChar abdU8 ( UChar xx, UChar yy ) {
   return toUChar(xx>yy ? xx-yy : yy-xx);
}

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(hi32 >> 16);
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUShort(lo32 >> 16);
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUShort(lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 24);
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 16);
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 8);
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 0);
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 24);
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 16);
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 8);
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 0);
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
{
   return
      mk32x2(
         (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
            + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
         (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
            + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
      );
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
{
   UInt t = 0;
   t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   t &= 0xFFFF;
   return (ULong)t;
}
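
/* Worked example (illustrative, not from the original source):
   PSADBW sums the absolute differences of the eight byte lanes into
   a 16-bit result.  For xx = 0x0000000000000A03 and
   yy = 0x0000000000000005, the two low lanes contribute
   |0x03 - 0x05| = 2 and |0x0A - 0x00| = 10, all other lanes are
   zero, so the result is 12. */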

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
{
   UShort t, min;
   UInt   idx;
   t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
   t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
   t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
   t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
   t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
   t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
   t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
   t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
   return ((ULong)(idx << 16)) | ((ULong)min);
}
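
/* Note: as PHMINPOSUW requires, bits 15:0 of the result hold the
   minimum unsigned 16-bit lane and bits 18:16 hold that lane's
   index; the strict '<' comparisons above make ties resolve to the
   lowest index. */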

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
{
   UInt  i;
   ULong crc = (b & 0xFFULL) ^ crcIn;
   for (i = 0; i < 8; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
{
   UInt  i;
   ULong crc = (w & 0xFFFFULL) ^ crcIn;
   for (i = 0; i < 16; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
{
   UInt  i;
   ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   for (i = 0; i < 32; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
{
   ULong crc = amd64g_calc_crc32l(crcIn, q);
   return amd64g_calc_crc32l(crc, q >> 32);
}
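
/* These four helpers implement the CRC32{B,W,L,Q} instructions'
   CRC-32C (Castagnoli) step: 0x82f63b78 is the bit-reflected form
   of the polynomial 0x1EDC6F41, and the q-sized variant is simply
   two chained l-sized steps.  Illustrative usage (not from the
   original source): a buffer "buf" would be folded in byte by byte,

      ULong crc = 0;
      crc = amd64g_calc_crc32b(crc, buf[0]);
      crc = amd64g_calc_crc32b(crc, buf[1]);
*/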


/* .. helper for next fn .. */
static inline ULong sad_8x4 ( ULong xx, ULong yy )
{
   UInt t = 0;
   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   return (ULong)t;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
                            ULong dHi, ULong dLo,
                            ULong imm_and_return_control_bit )
{
   UInt imm8     = imm_and_return_control_bit & 7;
   Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
   UInt srcOffsL = imm8 & 3;        /* src offs in 32-bit (L) chunks */
   UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
   /* For src we only need 32 bits, so get them into the
      lower half of a 64 bit word. */
   ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
   /* For dst we need to get hold of 56 bits (7 bytes) from a total
      of 11 bytes.  If calculating the low part of the result, we
      need bytes dstOffsL * 4 + (0 .. 6); if calculating the high
      part, dstOffsL * 4 + (4 .. 10). */
   ULong dst;
   /* dstOffsL = 0, Lo  ->  0 .. 6
      dstOffsL = 1, Lo  ->  4 .. 10
      dstOffsL = 0, Hi  ->  4 .. 10
      dstOffsL = 1, Hi  ->  8 .. 14
   */
   if (calcHi && dstOffsL) {
      /* 8 .. 14 */
      dst = dHi & 0x00FFFFFFFFFFFFFFULL;
   }
   else if (!calcHi && !dstOffsL) {
      /* 0 .. 6 */
      dst = dLo & 0x00FFFFFFFFFFFFFFULL;
   }
   else {
      /* 4 .. 10 */
      dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
   }
   ULong r0  = sad_8x4( dst >>  0, src );
   ULong r1  = sad_8x4( dst >>  8, src );
   ULong r2  = sad_8x4( dst >> 16, src );
   ULong r3  = sad_8x4( dst >> 24, src );
   ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
   return res;
}
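
/* Worked example (illustrative, not from the original source): with
   imm8 = 0 and the return-control bit clear, srcOffsL = 0 and
   dstOffsL = 0, so src is the low 4 bytes of sLo and dst is bytes
   0..6 of dLo; r0..r3 are then the SADs of src against the 4-byte
   windows of dst starting at byte offsets 0, 1, 2 and 3. */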

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
{
   ULong dst = 0;
   ULong src_bit;
   ULong dst_bit = 1;
   for (src_bit = 1; src_bit; src_bit <<= 1) {
      if (mask & src_bit) {
         if (src_masked & src_bit) dst |= dst_bit;
         dst_bit <<= 1;
      }
   }
   return dst;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_pdep ( ULong src, ULong mask )
{
   ULong dst = 0;
   ULong dst_bit;
   ULong src_bit = 1;
   for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
      if (mask & dst_bit) {
         if (src & src_bit) dst |= dst_bit;
         src_bit <<= 1;
      }
   }
   return dst;
}
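
/* Worked example (illustrative, not from the original source): with
   mask = 0x0000FF00, PEXT packs bits 15:8 of the (pre-masked)
   source into bits 7:0 of the result, so
   amd64g_calculate_pext(0x00005600, 0x0000FF00) == 0x56, and PDEP
   scatters them back the other way, so
   amd64g_calculate_pdep(0x56, 0x0000FF00) == 0x5600. */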

/*---------------------------------------------------------------*/
/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
/*---------------------------------------------------------------*/

static UInt zmask_from_V128 ( V128* arg )
{
   UInt i, res = 0;
   for (i = 0; i < 16; i++) {
      res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
   }
   return res;
}

static UInt zmask_from_V128_wide ( V128* arg )
{
   UInt i, res = 0;
   for (i = 0; i < 8; i++) {
      res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
   }
   return res;
}
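
/* Example (illustrative): for the implicit-length (ISTRx) variants
   these build one validity bit per element.  A register holding the
   bytes "ab" followed by fourteen zero bytes yields a zmask of
   0xFFFC, that is, one set bit for each zero byte. */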

/* Helps with PCMP{I,E}STR{I,M}.

   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (Not really dirty --
   it could be a clean helper, were it not for the fact that we can't
   pass two V128s by value to a clean helper, nor have one returned.)
   Reads guest state, writes to guest state for the xSTRM cases, and
   makes no accesses of memory; apart from those guest-state writes
   it is a pure function.

   opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte, so
   the callee knows which I/E and I/M variant it is dealing with and
   what the specific operation is.  The 4th byte of the opcode is in
   the range 0x60 to 0x63:
       istri  66 0F 3A 63
       istrm  66 0F 3A 62
       estri  66 0F 3A 61
       estrm  66 0F 3A 60

   gstOffL and gstOffR are the guest state offsets for the two XMM
   register inputs.  We never have to deal with the memory case since
   that is handled by pre-loading the relevant value into the fake
   XMM16 register.

   For ESTRx variants, edxIN and eaxIN hold the values of those two
   registers.

   In all cases, the bottom 16 bits of the result contain the new
   OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
   result hold the new %ecx value.  For xSTRM variants, the helper
   writes the result directly to the guest XMM0.

   Declarable side effects: in all cases, reads guest state at
   [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also
   writes guest_XMM0.

   Is expected to be called with opc_and_imm combinations which have
   actually been validated, and will assert otherwise.  The front end
   should ensure we're only called with verified values.
*/
ULong amd64g_dirtyhelper_PCMPxSTRx (
          VexGuestAMD64State* gst,
          HWord opc4_and_imm,
          HWord gstOffL, HWord gstOffR,
          HWord edxIN, HWord eaxIN
       )
{
   HWord opc4    = (opc4_and_imm >> 8) & 0xFF;
   HWord imm8    = opc4_and_imm & 0xFF;
   HWord isISTRx = opc4 & 2;
   HWord isxSTRM = (opc4 & 1) ^ 1;
   vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   HWord wide    = (imm8 & 1);

   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   /* Create the arg validity masks, either from the vectors
      themselves or from the supplied edx/eax values. */
   // FIXME: this is only right for the 8-bit data cases.
   // At least that is asserted above.
   UInt zmaskL, zmaskR;

   // temp spot for the resulting flags and vector.
   V128 resV;
   UInt resOSZACP;

   // for checking whether the case was handled
   Bool ok = False;

   if (wide) {
      if (isISTRx) {
         zmaskL = zmask_from_V128_wide(argL);
         zmaskR = zmask_from_V128_wide(argR);
      } else {
         Int tmp;
         tmp = edxIN & 0xFFFFFFFF;
         if (tmp < -8) tmp = -8;
         if (tmp > 8)  tmp = 8;
         if (tmp < 0)  tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 8);
         zmaskL = (1 << tmp) & 0xFF;
         tmp = eaxIN & 0xFFFFFFFF;
         if (tmp < -8) tmp = -8;
         if (tmp > 8)  tmp = 8;
         if (tmp < 0)  tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 8);
         zmaskR = (1 << tmp) & 0xFF;
      }
      // do the math
      ok = compute_PCMPxSTRx_wide (
              &resV, &resOSZACP, argL, argR,
              zmaskL, zmaskR, imm8, (Bool)isxSTRM
           );
   } else {
      if (isISTRx) {
         zmaskL = zmask_from_V128(argL);
         zmaskR = zmask_from_V128(argR);
      } else {
         Int tmp;
         tmp = edxIN & 0xFFFFFFFF;
         if (tmp < -16) tmp = -16;
         if (tmp > 16)  tmp = 16;
         if (tmp < 0)   tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 16);
         zmaskL = (1 << tmp) & 0xFFFF;
         tmp = eaxIN & 0xFFFFFFFF;
         if (tmp < -16) tmp = -16;
         if (tmp > 16)  tmp = 16;
         if (tmp < 0)   tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 16);
         zmaskR = (1 << tmp) & 0xFFFF;
      }
      // do the math
      ok = compute_PCMPxSTRx (
              &resV, &resOSZACP, argL, argR,
              zmaskL, zmaskR, imm8, (Bool)isxSTRM
           );
   }

   // The front end shouldn't pass us any imm8 variants we can't
   // handle.  Hence:
   vassert(ok);

   // So, finally, we need to get the results back to the caller.
   // In all cases, the new OSZACP value is in the lowest 16 bits of
   // the return value.
   if (isxSTRM) {
      gst->guest_YMM0[0] = resV.w32[0];
      gst->guest_YMM0[1] = resV.w32[1];
      gst->guest_YMM0[2] = resV.w32[2];
      gst->guest_YMM0[3] = resV.w32[3];
      return resOSZACP & 0x8D5;
   } else {
      UInt newECX = resV.w32[0] & 0xFFFF;
      return (newECX << 16) | (resOSZACP & 0x8D5);
   }
}
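
/* Example (illustrative, not from the original source): for
   PCMPISTRI with immediate 0x0C, the 4th opcode byte is 0x63, so
   the front end passes opc4_and_imm = (0x63 << 8) | 0x0C = 0x630C.
   The decoding above then yields isISTRx != 0 (implicit lengths),
   isxSTRM == 0 (index result, not mask), and wide == 0 (8-bit
   elements). */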

/*---------------------------------------------------------------*/
/*--- AES primitives and helpers                              ---*/
/*---------------------------------------------------------------*/

/* a 16 x 16 matrix */
static const UChar sbox[256] = {                   // row nr
   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
static void SubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = sbox[v->w8[i]];
   *v = r;
}

/* a 16 x 16 matrix */
static const UChar invsbox[256] = {                // row nr
   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
static void InvSubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = invsbox[v->w8[i]];
   *v = r;
}

static const UChar ShiftRows_op[16] =
   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
static void ShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[ShiftRows_op[15-i]];
   *v = r;
}

static const UChar InvShiftRows_op[16] =
   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
static void InvShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   *v = r;
}

/* Multiplication of the finite field elements of AES.
   See "A Specification for The AES Algorithm Rijndael
   (by Joan Daemen & Vincent Rijmen)"
   Dr. Brian Gladman, v3.1, 3rd March 2001. */
/* N values so that (hex) xy = 0x03^N.
   0x00 cannot be used.  We put 0xff for this value. */
/* a 16 x 16 matrix */
static const UChar Nxy[256] = {                    // row nr
   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
};

/* E values so that E = 0x03^xy. */
static const UChar Exy[256] = {                    // row nr
   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01
};

static inline UChar ff_mul ( UChar u1, UChar u2 )
{
   if ((u1 > 0) && (u2 > 0)) {
      UInt ui = Nxy[u1] + Nxy[u2];
      if (ui >= 255)
         ui = ui - 255;
      return Exy[ui];
   } else {
      return 0;
   }
}
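
/* Worked example (illustrative, not from the original source):
   ff_mul(0x02, 0x03) = Exy[Nxy[0x02] + Nxy[0x03]]
   = Exy[0x19 + 0x01] = Exy[0x1a] = 0x06, matching the polynomial
   product x * (x + 1) = x^2 + x in GF(2^8). */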

static void MixColumns (V128* v)
{
   V128 r;
   Int  j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
                  ^ P(v,j,2) ^ P(v,j,3);
      P(&r,j,1) = P(v,j,0) ^ ff_mul(0x02, P(v,j,1))
                  ^ ff_mul(0x03, P(v,j,2)) ^ P(v,j,3);
      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul(0x02, P(v,j,2))
                  ^ ff_mul(0x03, P(v,j,3));
      P(&r,j,3) = ff_mul(0x03, P(v,j,0)) ^ P(v,j,1) ^ P(v,j,2)
                  ^ ff_mul(0x02, P(v,j,3));
   }
   *v = r;
#undef P
}

static void InvMixColumns (V128* v)
{
   V128 r;
   Int  j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x0e, P(v,j,0)) ^ ff_mul(0x0b, P(v,j,1))
                  ^ ff_mul(0x0d, P(v,j,2)) ^ ff_mul(0x09, P(v,j,3));
      P(&r,j,1) = ff_mul(0x09, P(v,j,0)) ^ ff_mul(0x0e, P(v,j,1))
                  ^ ff_mul(0x0b, P(v,j,2)) ^ ff_mul(0x0d, P(v,j,3));
      P(&r,j,2) = ff_mul(0x0d, P(v,j,0)) ^ ff_mul(0x09, P(v,j,1))
                  ^ ff_mul(0x0e, P(v,j,2)) ^ ff_mul(0x0b, P(v,j,3));
      P(&r,j,3) = ff_mul(0x0b, P(v,j,0)) ^ ff_mul(0x0d, P(v,j,1))
                  ^ ff_mul(0x09, P(v,j,2)) ^ ff_mul(0x0e, P(v,j,3));
   }
   *v = r;
#undef P
}

/* For description, see definition in guest_amd64_defs.h */
void amd64g_dirtyhelper_AES (
        VexGuestAMD64State* gst,
        HWord opc4, HWord gstOffD,
        HWord gstOffL, HWord gstOffR
     )
{
   // where the args are
   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   V128  r;

   switch (opc4) {
      case 0xDC: /* AESENC */
      case 0xDD: /* AESENCLAST */
         r = *argR;
         ShiftRows (&r);
         SubBytes  (&r);
         if (opc4 == 0xDC)
            MixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDE: /* AESDEC */
      case 0xDF: /* AESDECLAST */
         r = *argR;
         InvShiftRows (&r);
         InvSubBytes  (&r);
         if (opc4 == 0xDE)
            InvMixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDB: /* AESIMC */
         *argD = *argL;
         InvMixColumns (argD);
         break;
      default: vassert(0);
   }
}

static inline UInt RotWord ( UInt w32 )
{
   return ((w32 >> 8) | (w32 << 24));
}

static inline UInt SubWord ( UInt w32 )
{
   UChar *w8;
   UChar *r8;
   UInt   res;
   w8 = (UChar*) &w32;
   r8 = (UChar*) &res;
   r8[0] = sbox[w8[0]];
   r8[1] = sbox[w8[1]];
   r8[2] = sbox[w8[2]];
   r8[3] = sbox[w8[3]];
   return res;
}

/* For description, see definition in guest_amd64_defs.h */
extern void amd64g_dirtyhelper_AESKEYGENASSIST (
               VexGuestAMD64State* gst,
               HWord imm8,
               HWord gstOffL, HWord gstOffR
            )
{
   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   // We have to create the result in a temporary in the
   // case where the src and dst regs are the same.  See #341698.
   V128 tmp;

   tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   tmp.w32[2] = SubWord (argL->w32[3]);
   tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   tmp.w32[0] = SubWord (argL->w32[1]);

   argR->w32[3] = tmp.w32[3];
   argR->w32[2] = tmp.w32[2];
   argR->w32[1] = tmp.w32[1];
   argR->w32[0] = tmp.w32[0];
}
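
/* Note: this matches the architectural definition of
   AESKEYGENASSIST.  With X1 and X3 being the second and fourth
   dwords of the source and RCON = imm8, the result dwords, low to
   high, are SubWord(X1), RotWord(SubWord(X1)) ^ RCON, SubWord(X3),
   RotWord(SubWord(X3)) ^ RCON. */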


/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing,               ---*/
/*--- guest state as a whole.                                 ---*/
/*---------------------------------------------------------------*/

/* Initialise the entire amd64 guest state. */
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
{
   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER  = 0;
   vex_state->pad0 = 0;

   vex_state->guest_RAX = 0;
   vex_state->guest_RCX = 0;
   vex_state->guest_RDX = 0;
   vex_state->guest_RBX = 0;
   vex_state->guest_RSP = 0;
   vex_state->guest_RBP = 0;
   vex_state->guest_RSI = 0;
   vex_state->guest_RDI = 0;
   vex_state->guest_R8  = 0;
   vex_state->guest_R9  = 0;
   vex_state->guest_R10 = 0;
   vex_state->guest_R11 = 0;
   vex_state->guest_R12 = 0;
   vex_state->guest_R13 = 0;
   vex_state->guest_R14 = 0;
   vex_state->guest_R15 = 0;

   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = 0;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;

   vex_state->guest_DFLAG  = 1; /* forwards */
   vex_state->guest_IDFLAG = 0;
   vex_state->guest_ACFLAG = 0;

   /* HACK: represent the offset associated with a constant %fs.
      Typically, on linux, this assumes that %fs is only ever zero
      (main thread) or 0x63. */
   vex_state->guest_FS_CONST = 0;

   vex_state->guest_RIP = 0;

   /* Initialise the simulated FPU */
   amd64g_dirtyhelper_FINIT( vex_state );

   /* Initialise the AVX state. */
#  define AVXZERO(_ymm) \
      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
      } while (0)
   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   AVXZERO(vex_state->guest_YMM0);
   AVXZERO(vex_state->guest_YMM1);
   AVXZERO(vex_state->guest_YMM2);
   AVXZERO(vex_state->guest_YMM3);
   AVXZERO(vex_state->guest_YMM4);
   AVXZERO(vex_state->guest_YMM5);
   AVXZERO(vex_state->guest_YMM6);
   AVXZERO(vex_state->guest_YMM7);
   AVXZERO(vex_state->guest_YMM8);
   AVXZERO(vex_state->guest_YMM9);
   AVXZERO(vex_state->guest_YMM10);
   AVXZERO(vex_state->guest_YMM11);
   AVXZERO(vex_state->guest_YMM12);
   AVXZERO(vex_state->guest_YMM13);
   AVXZERO(vex_state->guest_YMM14);
   AVXZERO(vex_state->guest_YMM15);
   AVXZERO(vex_state->guest_YMM16);
#  undef AVXZERO

   vex_state->guest_EMNOTE = EmNote_NONE;

   /* These should not ever be either read or written, but we
      initialise them anyway. */
   vex_state->guest_CMSTART = 0;
   vex_state->guest_CMLEN   = 0;

   vex_state->guest_NRADDR   = 0;
   vex_state->guest_SC_CLASS = 0;
   vex_state->guest_GS_CONST = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->pad1 = 0;
}


/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest %RSP, %RBP and %RIP
   only.  These are the minimum needed to extract correct stack
   backtraces from amd64 code.

   Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
*/
Bool guest_amd64_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   Int rbp_max = rbp_min + 8 - 1;
   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   Int rsp_max = rsp_min + 8 - 1;
   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   Int rip_max = rip_min + 8 - 1;

   if (maxoff < rsp_min || minoff > rsp_max) {
      /* no overlap with rsp */
      if (pxControl == VexRegUpdSpAtMemAccess)
         return False; // We only need to check the stack pointer.
   } else {
      return True;
   }

   if (maxoff < rbp_min || minoff > rbp_max) {
      /* no overlap with rbp */
   } else {
      return True;
   }

   if (maxoff < rip_min || minoff > rip_max) {
      /* no overlap with rip */
   } else {
      return True;
   }

   return False;
}


#define ALWAYSDEFD(field)                            \
    { offsetof(VexGuestAMD64State, field),           \
      (sizeof ((VexGuestAMD64State*)0)->field) }

VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_CONST),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /*    */ ALWAYSDEFD(guest_CS),
                 // /*    */ ALWAYSDEFD(guest_DS),
                 // /*    */ ALWAYSDEFD(guest_ES),
                 // /*    */ ALWAYSDEFD(guest_FS),
                 // /*    */ ALWAYSDEFD(guest_GS),
                 // /*    */ ALWAYSDEFD(guest_SS),
                 // /*    */ ALWAYSDEFD(guest_LDT),
                 // /*    */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_CMSTART),
                 /* 13 */ ALWAYSDEFD(guest_CMLEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };


/*---------------------------------------------------------------*/
/*--- end                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/