1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                             guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2017 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 #include "libvex_basictypes.h"
37 #include "libvex_emnote.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
41 
42 #include "main_util.h"
43 #include "main_globals.h"
44 #include "guest_generic_bb_to_IR.h"
45 #include "guest_amd64_defs.h"
46 #include "guest_generic_x87.h"
47 
48 
49 /* This file contains helper functions for amd64 guest code.
50    Calls to these functions are generated by the back end.
51    These calls are of course in the host machine code and
52    this file will be compiled to host machine code, so that
53    all makes sense.
54 
55    Only change the signatures of these helper functions very
56    carefully.  If you change the signature here, you'll have to change
57    the parameters passed to it in the IR calls constructed by
58    guest-amd64/toIR.c.
59 
60    The convention used is that all functions called from generated
61    code are named amd64g_<something>, and any function whose name lacks
62    that prefix is not called from generated code.  Note that some
63    LibVEX_* functions can however be called by VEX's client, but that
64    is not the same as calling them from VEX-generated code.
65 */
66 
67 
68 /* Set to 1 to get detailed profiling info about use of the flag
69    machinery. */
70 #define PROFILE_RFLAGS 0
71 
72 
73 /*---------------------------------------------------------------*/
74 /*--- %rflags run-time helpers.                               ---*/
75 /*---------------------------------------------------------------*/
76 
77 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
78    after imulq/mulq. */
79 
80 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
81 {
82    const Long halfMask = 0xFFFFFFFFLL;
83    ULong u0, v0, w0;
84     Long u1, v1, w1, w2, t;
85    u0   = u & halfMask;
86    u1   = u >> 32;
87    v0   = v & halfMask;
88    v1   = v >> 32;
89    w0   = u0 * v0;
90    t    = u1 * v0 + (w0 >> 32);
91    w1   = t & halfMask;
92    w2   = t >> 32;
93    w1   = u0 * v1 + w1;
94    *rHi = u1 * v1 + w2 + (w1 >> 32);
95    *rLo = (Long)((ULong)u * (ULong)v);
96 }
97 
98 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
99 {
100    const ULong halfMask = 0xFFFFFFFFULL;
101    ULong u0, v0, w0;
102    ULong u1, v1, w1, w2, t;
103    u0   = u & halfMask;
104    u1   = u >> 32;
105    v0   = v & halfMask;
106    v1   = v >> 32;
107    w0   = u0 * v0;
108    t    = u1 * v0 + (w0 >> 32);
109    w1   = t & halfMask;
110    w2   = t >> 32;
111    w1   = u0 * v1 + w1;
112    *rHi = u1 * v1 + w2 + (w1 >> 32);
113    *rLo = u * v;
114 }
115 
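/* Illustrative cross-check, not part of the helper set: assuming the
   compiler provides the non-standard __uint128_t type, the 32-bit-halves
   decomposition above can be verified against a native 128-bit multiply:

      ULong hi, lo;
      mullU64(0xDEADBEEFCAFEBABEULL, 0x0123456789ABCDEFULL, &hi, &lo);
      __uint128_t ref = (__uint128_t)0xDEADBEEFCAFEBABEULL
                        * (__uint128_t)0x0123456789ABCDEFULL;
      // expect: hi == (ULong)(ref >> 64) && lo == (ULong)ref
*/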
116 
117 static const UChar parity_table[256] = {
118     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
124     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
126     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
140     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
142     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
148     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
149     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
150 };
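
/* The x86 parity flag (PF) is set when the least significant byte of a
   result contains an even number of 1 bits, so the table above maps a
   byte value to AMD64G_CC_MASK_P or 0 accordingly.  For example,
   parity_table[0x03] == AMD64G_CC_MASK_P (two bits set), whereas
   parity_table[0x07] == 0 (three bits set). */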
151 
152 /* generalised left-shifter */
153 static inline Long lshift ( Long x, Int n )
154 {
155    if (n >= 0)
156       return (ULong)x << n;
157    else
158       return x >> (-n);
159 }
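
/* For example, lshift(x, 3) is x << 3, while lshift(x, -3) is x >> 3 on
   the signed Long argument (in practice an arithmetic shift).  The flag
   thunks below use this as lshift(res, 8 - DATA_BITS) & 0x80 to move the
   sign bit of an N-bit result into bit 7, the SF position; for
   DATA_BITS == 64 that amounts to a right shift by 56. */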
160 
161 /* identity on ULong */
162 static inline ULong idULong ( ULong x )
163 {
164    return x;
165 }
166 
167 
168 #define PREAMBLE(__data_bits)					\
169    /* const */ ULong DATA_MASK 					\
170       = __data_bits==8                                          \
171            ? 0xFFULL 					        \
172            : (__data_bits==16                                   \
173                 ? 0xFFFFULL 		                        \
174                 : (__data_bits==32                              \
175                      ? 0xFFFFFFFFULL                            \
176                      : 0xFFFFFFFFFFFFFFFFULL));                 \
177    /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
178    /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
179    /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
180    /* const */ ULong CC_NDEP = cc_ndep_formal;			\
181    /* Four bogus assignments, which hopefully gcc can     */	\
182    /* optimise away, and which stop it complaining about  */	\
183    /* unused variables.                                   */	\
184    SIGN_MASK = SIGN_MASK;					\
185    DATA_MASK = DATA_MASK;					\
186    CC_DEP2 = CC_DEP2;						\
187    CC_NDEP = CC_NDEP;
188 
189 
190 /*-------------------------------------------------------------*/
191 
192 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
193 {								\
194    PREAMBLE(DATA_BITS);						\
195    { ULong cf, pf, af, zf, sf, of;				\
196      ULong argL, argR, res;					\
197      argL = CC_DEP1;						\
198      argR = CC_DEP2;						\
199      res  = argL + argR;					\
200      cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
201      pf = parity_table[(UChar)res];				\
202      af = (res ^ argL ^ argR) & 0x10;				\
203      zf = ((DATA_UTYPE)res == 0) << 6;				\
204      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
205      of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
206                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
207      return cf | pf | af | zf | sf | of;			\
208    }								\
209 }
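
/* Worked example for the 8-bit case (DATA_UTYPE == UChar): argL = 0x7F,
   argR = 0x01 gives res = 0x80.  Then cf = 0 (no unsigned wraparound),
   pf = 0 (0x80 has an odd number of set bits), af = 0x10 (carry out of
   bit 3), zf = 0, sf = 0x80 (bit 7 of the result), and of = 0x800
   (positive + positive produced a negative result).  The helper hence
   returns AMD64G_CC_MASK_A | AMD64G_CC_MASK_S | AMD64G_CC_MASK_O, which
   is what adding 1 to an 8-bit register holding 0x7F produces on real
   hardware. */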
210 
211 /*-------------------------------------------------------------*/
212 
213 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
214 {								\
215    PREAMBLE(DATA_BITS);						\
216    { ULong cf, pf, af, zf, sf, of;				\
217      ULong argL, argR, res;					\
218      argL = CC_DEP1;						\
219      argR = CC_DEP2;						\
220      res  = argL - argR;					\
221      cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
222      pf = parity_table[(UChar)res];				\
223      af = (res ^ argL ^ argR) & 0x10;				\
224      zf = ((DATA_UTYPE)res == 0) << 6;				\
225      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
226      of = lshift((argL ^ argR) & (argL ^ res),	 		\
227                  12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
228      return cf | pf | af | zf | sf | of;			\
229    }								\
230 }
231 
232 /*-------------------------------------------------------------*/
233 
234 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
235 {								\
236    PREAMBLE(DATA_BITS);						\
237    { ULong cf, pf, af, zf, sf, of;				\
238      ULong argL, argR, oldC, res;		 		\
239      oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
240      argL = CC_DEP1;						\
241      argR = CC_DEP2 ^ oldC;	       				\
242      res  = (argL + argR) + oldC;				\
243      if (oldC)							\
244         cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
245      else							\
246         cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
247      pf = parity_table[(UChar)res];				\
248      af = (res ^ argL ^ argR) & 0x10;				\
249      zf = ((DATA_UTYPE)res == 0) << 6;				\
250      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
251      of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
252                   12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
253      return cf | pf | af | zf | sf | of;			\
254    }								\
255 }
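
/* Note that in ACTIONS_ADC above and ACTIONS_SBB below, CC_DEP2 is not
   the second operand itself: the IR that sets up these thunks stores
   argR ^ oldC in DEP2 and parks the old carry in NDEP, so the helper
   XORs oldC back out to recover argR before recomputing the sum or
   difference. */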
256 
257 /*-------------------------------------------------------------*/
258 
259 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
260 {								\
261    PREAMBLE(DATA_BITS);						\
262    { ULong cf, pf, af, zf, sf, of;				\
263      ULong argL, argR, oldC, res;	       			\
264      oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
265      argL = CC_DEP1;						\
266      argR = CC_DEP2 ^ oldC;	       				\
267      res  = (argL - argR) - oldC;				\
268      if (oldC)							\
269         cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
270      else							\
271         cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
272      pf = parity_table[(UChar)res];				\
273      af = (res ^ argL ^ argR) & 0x10;				\
274      zf = ((DATA_UTYPE)res == 0) << 6;				\
275      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
276      of = lshift((argL ^ argR) & (argL ^ res), 			\
277                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
278      return cf | pf | af | zf | sf | of;			\
279    }								\
280 }
281 
282 /*-------------------------------------------------------------*/
283 
284 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
285 {								\
286    PREAMBLE(DATA_BITS);						\
287    { ULong cf, pf, af, zf, sf, of;				\
288      cf = 0;							\
289      pf = parity_table[(UChar)CC_DEP1];				\
290      af = 0;							\
291      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
292      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
293      of = 0;							\
294      return cf | pf | af | zf | sf | of;			\
295    }								\
296 }
297 
298 /*-------------------------------------------------------------*/
299 
300 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
301 {								\
302    PREAMBLE(DATA_BITS);						\
303    { ULong cf, pf, af, zf, sf, of;				\
304      ULong argL, argR, res;					\
305      res  = CC_DEP1;						\
306      argL = res - 1;						\
307      argR = 1;							\
308      cf = CC_NDEP & AMD64G_CC_MASK_C;				\
309      pf = parity_table[(UChar)res];				\
310      af = (res ^ argL ^ argR) & 0x10;				\
311      zf = ((DATA_UTYPE)res == 0) << 6;				\
312      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
313      of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
314      return cf | pf | af | zf | sf | of;			\
315    }								\
316 }
317 
318 /*-------------------------------------------------------------*/
319 
320 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
321 {								\
322    PREAMBLE(DATA_BITS);						\
323    { ULong cf, pf, af, zf, sf, of;				\
324      ULong argL, argR, res;					\
325      res  = CC_DEP1;						\
326      argL = res + 1;						\
327      argR = 1;							\
328      cf = CC_NDEP & AMD64G_CC_MASK_C;				\
329      pf = parity_table[(UChar)res];				\
330      af = (res ^ argL ^ argR) & 0x10;				\
331      zf = ((DATA_UTYPE)res == 0) << 6;				\
332      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
333      of = ((res & DATA_MASK) 					\
334           == ((ULong)SIGN_MASK - 1)) << 11;			\
335      return cf | pf | af | zf | sf | of;			\
336    }								\
337 }
338 
339 /*-------------------------------------------------------------*/
340 
341 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
342 {								\
343    PREAMBLE(DATA_BITS);						\
344    { ULong cf, pf, af, zf, sf, of;				\
345      cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
346      pf = parity_table[(UChar)CC_DEP1];				\
347      af = 0; /* undefined */					\
348      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
349      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
350      /* of is defined if shift count == 1 */			\
351      of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
352           & AMD64G_CC_MASK_O;					\
353      return cf | pf | af | zf | sf | of;			\
354    }								\
355 }
356 
357 /*-------------------------------------------------------------*/
358 
359 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
360 {								\
361    PREAMBLE(DATA_BITS);  					\
362    { ULong cf, pf, af, zf, sf, of;				\
363      cf = CC_DEP2 & 1;						\
364      pf = parity_table[(UChar)CC_DEP1];				\
365      af = 0; /* undefined */					\
366      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
367      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
368      /* of is defined if shift count == 1 */			\
369      of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
370           & AMD64G_CC_MASK_O;					\
371      return cf | pf | af | zf | sf | of;			\
372    }								\
373 }
374 
375 /*-------------------------------------------------------------*/
376 
377 /* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
378 /* DEP1 = result, NDEP = old flags */
379 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
380 {								\
381    PREAMBLE(DATA_BITS);						\
382    { ULong fl 							\
383         = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
384           | (AMD64G_CC_MASK_C & CC_DEP1)			\
385           | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
386                                       11-(DATA_BITS-1)) 	\
387                      ^ lshift(CC_DEP1, 11)));			\
388      return fl;							\
389    }								\
390 }
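
/* For example, an 8-bit ROL of 0x80 by 1 yields the result 0x01, so
   CC_DEP1 = 0x01: the expression above sets CF = lsb(0x01) = 1 and
   OF = msb(0x01) ^ lsb(0x01) = 0 ^ 1 = 1, while every other flag bit is
   taken unchanged from the old flags held in CC_NDEP. */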
391 
392 /*-------------------------------------------------------------*/
393 
394 /* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
395 /* DEP1 = result, NDEP = old flags */
396 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
397 {								\
398    PREAMBLE(DATA_BITS);						\
399    { ULong fl 							\
400         = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
401           | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
402           | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
403                                       11-(DATA_BITS-1)) 	\
404                      ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
405      return fl;							\
406    }								\
407 }
408 
409 /*-------------------------------------------------------------*/
410 
411 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
412                                 DATA_U2TYPE, NARROWto2U)        \
413 {                                                               \
414    PREAMBLE(DATA_BITS);                                         \
415    { ULong cf, pf, af, zf, sf, of;                              \
416      DATA_UTYPE  hi;                                            \
417      DATA_UTYPE  lo                                             \
418         = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
419                      * ((DATA_UTYPE)CC_DEP2) );                 \
420      DATA_U2TYPE rr                                             \
421         = NARROWto2U(                                           \
422              ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
423              * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
424      hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
425      cf = (hi != 0);                                            \
426      pf = parity_table[(UChar)lo];                              \
427      af = 0; /* undefined */                                    \
428      zf = (lo == 0) << 6;                                       \
429      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
430      of = cf << 11;                                             \
431      return cf | pf | af | zf | sf | of;                        \
432    }								\
433 }
434 
435 /*-------------------------------------------------------------*/
436 
437 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
438                                 DATA_S2TYPE, NARROWto2S)        \
439 {                                                               \
440    PREAMBLE(DATA_BITS);                                         \
441    { ULong cf, pf, af, zf, sf, of;                              \
442      DATA_STYPE  hi;                                            \
443      DATA_STYPE  lo                                             \
444         = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
445                      * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
446      DATA_S2TYPE rr                                             \
447         = NARROWto2S(                                           \
448              ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
449              * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
450      hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
451      cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
452      pf = parity_table[(UChar)lo];                              \
453      af = 0; /* undefined */                                    \
454      zf = (lo == 0) << 6;                                       \
455      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
456      of = cf << 11;                                             \
457      return cf | pf | af | zf | sf | of;                        \
458    }								\
459 }
460 
461 /*-------------------------------------------------------------*/
462 
463 #define ACTIONS_UMULQ                                           \
464 {                                                               \
465    PREAMBLE(64);                                                \
466    { ULong cf, pf, af, zf, sf, of;                              \
467      ULong lo, hi;                                              \
468      mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
469      cf = (hi != 0);                                            \
470      pf = parity_table[(UChar)lo];                              \
471      af = 0; /* undefined */                                    \
472      zf = (lo == 0) << 6;                                       \
473      sf = lshift(lo, 8 - 64) & 0x80;                            \
474      of = cf << 11;                                             \
475      return cf | pf | af | zf | sf | of;                        \
476    }								\
477 }
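
/* In the 64-bit unsigned case CF and OF are set exactly when the high
   half of the 128-bit product is nonzero, i.e. when the full product no
   longer fits in 64 bits: for instance 0x100000000 * 0x100000000 gives
   lo == 0 and hi == 1, hence CF == OF == 1. */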
478 
479 /*-------------------------------------------------------------*/
480 
481 #define ACTIONS_SMULQ                                           \
482 {                                                               \
483    PREAMBLE(64);                                                \
484    { ULong cf, pf, af, zf, sf, of;                              \
485      Long lo, hi;                                               \
486      mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
487      cf = (hi != (lo >>/*s*/ (64-1)));                          \
488      pf = parity_table[(UChar)lo];                              \
489      af = 0; /* undefined */                                    \
490      zf = (lo == 0) << 6;                                       \
491      sf = lshift(lo, 8 - 64) & 0x80;                            \
492      of = cf << 11;                                             \
493      return cf | pf | af | zf | sf | of;                        \
494    }								\
495 }
496 
497 /*-------------------------------------------------------------*/
498 
499 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
500 {								\
501    PREAMBLE(DATA_BITS);						\
502    { ULong cf, pf, af, zf, sf, of;				\
503      cf = 0;							\
504      pf = 0;							\
505      af = 0;							\
506      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
507      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
508      of = 0;							\
509      return cf | pf | af | zf | sf | of;			\
510    }								\
511 }
512 
513 /*-------------------------------------------------------------*/
514 
515 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
516 {								\
517    PREAMBLE(DATA_BITS);						\
518    { ULong cf, pf, af, zf, sf, of;				\
519      cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
520      pf = 0;							\
521      af = 0;							\
522      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
523      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
524      of = 0;							\
525      return cf | pf | af | zf | sf | of;			\
526    }								\
527 }
528 
529 /*-------------------------------------------------------------*/
530 
531 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
532 {								\
533    PREAMBLE(DATA_BITS);						\
534    { Long cf, pf, af, zf, sf, of;				\
535      cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
536      pf = 0;							\
537      af = 0;							\
538      zf = 0;							\
539      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
540      of = 0;							\
541      return cf | pf | af | zf | sf | of;			\
542    }								\
543 }
544 
545 /*-------------------------------------------------------------*/
546 
547 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
548 {								\
549    PREAMBLE(DATA_BITS);						\
550    { ULong cf, pf, af, zf, sf, of;				\
551      cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
552      pf = 0;							\
553      af = 0;							\
554      zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
555      sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
556      of = 0;							\
557      return cf | pf | af | zf | sf | of;			\
558    }								\
559 }
560 
561 /*-------------------------------------------------------------*/
562 
563 #define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME)		\
564 {								\
565    PREAMBLE(DATA_BITS);						\
566    { ULong ocf;	/* o or c */					\
567      ULong argL, argR, oldOC, res;				\
568      oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1;	\
569      argL  = CC_DEP1;						\
570      argR  = CC_DEP2 ^ oldOC;					\
571      res   = (argL + argR) + oldOC;				\
572      if (oldOC)							\
573         ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
574      else							\
575         ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
576      return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME)		\
577             | (ocf << AMD64G_CC_SHIFT_##FLAGNAME);		\
578    }								\
579 }
580 
581 /*-------------------------------------------------------------*/
582 
583 
584 #if PROFILE_RFLAGS
585 
586 static Bool initted     = False;
587 
588 /* C flag, fast route */
589 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
590 /* C flag, slow route */
591 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
592 /* table for calculate_cond */
593 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
594 /* total entry counts for calc_all, calc_c, calc_cond. */
595 static UInt n_calc_all  = 0;
596 static UInt n_calc_c    = 0;
597 static UInt n_calc_cond = 0;
598 
599 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
600 
601 
602 static void showCounts ( void )
603 {
604    Int op, co;
605    HChar ch;
606    vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
607               n_calc_all, n_calc_cond, n_calc_c);
608 
609    vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
610               "    S   NS    P   NP    L   NL   LE  NLE\n");
611    vex_printf("     -----------------------------------------------------"
612               "----------------------------------------\n");
613    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
614 
615       ch = ' ';
616       if (op > 0 && (op-1) % 4 == 0)
617          ch = 'B';
618       if (op > 0 && (op-1) % 4 == 1)
619          ch = 'W';
620       if (op > 0 && (op-1) % 4 == 2)
621          ch = 'L';
622       if (op > 0 && (op-1) % 4 == 3)
623          ch = 'Q';
624 
625       vex_printf("%2d%c: ", op, ch);
626       vex_printf("%6u ", tabc_slow[op]);
627       vex_printf("%6u ", tabc_fast[op]);
628       for (co = 0; co < 16; co++) {
629          Int n = tab_cond[op][co];
630          if (n >= 1000) {
631             vex_printf(" %3dK", n / 1000);
632          } else
633          if (n >= 0) {
634             vex_printf(" %3d ", n );
635          } else {
636             vex_printf("     ");
637          }
638       }
639       vex_printf("\n");
640    }
641    vex_printf("\n");
642 }
643 
644 static void initCounts ( void )
645 {
646    Int op, co;
647    initted = True;
648    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
649       tabc_fast[op] = tabc_slow[op] = 0;
650       for (co = 0; co < 16; co++)
651          tab_cond[op][co] = 0;
652    }
653 }
654 
655 #endif /* PROFILE_RFLAGS */
656 
657 
658 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
659 /* Calculate all the 6 flags from the supplied thunk parameters.
660    Worker function, not directly called from generated code. */
661 static
662 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
663                                         ULong cc_dep1_formal,
664                                         ULong cc_dep2_formal,
665                                         ULong cc_ndep_formal )
666 {
667    switch (cc_op) {
668       case AMD64G_CC_OP_COPY:
669          return cc_dep1_formal
670                 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
671                    | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
672 
673       case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
674       case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
675       case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
676       case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );
677 
678       case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
679       case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
680       case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
681       case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );
682 
683       case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
684       case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
685       case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
686       case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );
687 
688       case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
689       case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
690       case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
691       case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );
692 
693       case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
694       case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
695       case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
696       case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );
697 
698       case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
699       case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
700       case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
701       case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );
702 
703       case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
704       case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
705       case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
706       case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );
707 
708       case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
709       case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
710       case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
711       case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );
712 
713       case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
714       case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
715       case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
716       case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );
717 
718       case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
719       case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
720       case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
721       case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );
722 
723       case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
724       case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
725       case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
726       case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );
727 
728       case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
729                                                   UShort, toUShort );
730       case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
731                                                   UInt,   toUInt );
732       case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
733                                                   ULong,  idULong );
734 
735       case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;
736 
737       case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
738                                                   Short,  toUShort );
739       case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
740                                                   Int,    toUInt   );
741       case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
742                                                   Long,   idULong );
743 
744       case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;
745 
746       case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
747       case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );
748 
749       case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
750       case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );
751 
752       case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
753       case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );
754 
755       case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
756       case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );
757 
758       case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt,  C );
759       case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
760 
761       case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt,  O );
762       case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
763 
764       default:
765          /* shouldn't really make these calls from generated code */
766          vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
767                     "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
768                     cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
769          vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
770    }
771 }
772 
773 
774 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
775 /* Calculate all the 6 flags from the supplied thunk parameters. */
776 ULong amd64g_calculate_rflags_all ( ULong cc_op,
777                                     ULong cc_dep1,
778                                     ULong cc_dep2,
779                                     ULong cc_ndep )
780 {
781 #  if PROFILE_RFLAGS
782    if (!initted) initCounts();
783    n_calc_all++;
784    if (SHOW_COUNTS_NOW) showCounts();
785 #  endif
786    return
787       amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
788 }
789 
790 
791 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
792 /* Calculate just the carry flag from the supplied thunk parameters. */
793 ULong amd64g_calculate_rflags_c ( ULong cc_op,
794                                   ULong cc_dep1,
795                                   ULong cc_dep2,
796                                   ULong cc_ndep )
797 {
798 #  if PROFILE_RFLAGS
799    if (!initted) initCounts();
800    n_calc_c++;
801    tabc_fast[cc_op]++;
802    if (SHOW_COUNTS_NOW) showCounts();
803 #  endif
804 
805    /* Fast-case some common ones. */
806    switch (cc_op) {
807       case AMD64G_CC_OP_COPY:
808          return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
809       case AMD64G_CC_OP_LOGICQ:
810       case AMD64G_CC_OP_LOGICL:
811       case AMD64G_CC_OP_LOGICW:
812       case AMD64G_CC_OP_LOGICB:
813          return 0;
814 	 //      case AMD64G_CC_OP_SUBL:
815 	 //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
816 	 //                   ? AMD64G_CC_MASK_C : 0;
817 	 //      case AMD64G_CC_OP_SUBW:
818 	 //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
819 	 //                   ? AMD64G_CC_MASK_C : 0;
820 	 //      case AMD64G_CC_OP_SUBB:
821 	 //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
822 	 //                   ? AMD64G_CC_MASK_C : 0;
823 	 //      case AMD64G_CC_OP_INCL:
824 	 //      case AMD64G_CC_OP_DECL:
825 	 //         return cc_ndep & AMD64G_CC_MASK_C;
826       default:
827          break;
828    }
829 
830 #  if PROFILE_RFLAGS
831    tabc_fast[cc_op]--;
832    tabc_slow[cc_op]++;
833 #  endif
834 
835    return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
836           & AMD64G_CC_MASK_C;
837 }
838 
839 
840 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
841 /* returns 1 or 0 */
842 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
843                                    ULong cc_op,
844                                    ULong cc_dep1,
845                                    ULong cc_dep2,
846                                    ULong cc_ndep )
847 {
848    ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
849                                                   cc_dep2, cc_ndep);
850    ULong of,sf,zf,cf,pf;
851    ULong inv = cond & 1;
852 
853 #  if PROFILE_RFLAGS
854    if (!initted) initCounts();
855    tab_cond[cc_op][cond]++;
856    n_calc_cond++;
857    if (SHOW_COUNTS_NOW) showCounts();
858 #  endif
859 
860    switch (cond) {
861       case AMD64CondNO:
862       case AMD64CondO: /* OF == 1 */
863          of = rflags >> AMD64G_CC_SHIFT_O;
864          return 1 & (inv ^ of);
865 
866       case AMD64CondNZ:
867       case AMD64CondZ: /* ZF == 1 */
868          zf = rflags >> AMD64G_CC_SHIFT_Z;
869          return 1 & (inv ^ zf);
870 
871       case AMD64CondNB:
872       case AMD64CondB: /* CF == 1 */
873          cf = rflags >> AMD64G_CC_SHIFT_C;
874          return 1 & (inv ^ cf);
875          break;
876 
877       case AMD64CondNBE:
878       case AMD64CondBE: /* (CF or ZF) == 1 */
879          cf = rflags >> AMD64G_CC_SHIFT_C;
880          zf = rflags >> AMD64G_CC_SHIFT_Z;
881          return 1 & (inv ^ (cf | zf));
882          break;
883 
884       case AMD64CondNS:
885       case AMD64CondS: /* SF == 1 */
886          sf = rflags >> AMD64G_CC_SHIFT_S;
887          return 1 & (inv ^ sf);
888 
889       case AMD64CondNP:
890       case AMD64CondP: /* PF == 1 */
891          pf = rflags >> AMD64G_CC_SHIFT_P;
892          return 1 & (inv ^ pf);
893 
894       case AMD64CondNL:
895       case AMD64CondL: /* (SF xor OF) == 1 */
896          sf = rflags >> AMD64G_CC_SHIFT_S;
897          of = rflags >> AMD64G_CC_SHIFT_O;
898          return 1 & (inv ^ (sf ^ of));
899          break;
900 
901       case AMD64CondNLE:
902       case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
903          sf = rflags >> AMD64G_CC_SHIFT_S;
904          of = rflags >> AMD64G_CC_SHIFT_O;
905          zf = rflags >> AMD64G_CC_SHIFT_Z;
906          return 1 & (inv ^ ((sf ^ of) | zf));
907          break;
908 
909       default:
910          /* shouldn't really make these calls from generated code */
911          vex_printf("amd64g_calculate_condition"
912                     "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
913                     cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
914          vpanic("amd64g_calculate_condition");
915    }
916 }
917 
918 
919 /* VISIBLE TO LIBVEX CLIENT */
920 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
921 {
922    ULong rflags = amd64g_calculate_rflags_all_WRK(
923                      vex_state->guest_CC_OP,
924                      vex_state->guest_CC_DEP1,
925                      vex_state->guest_CC_DEP2,
926                      vex_state->guest_CC_NDEP
927                   );
928    Long dflag = vex_state->guest_DFLAG;
929    vassert(dflag == 1 || dflag == -1);
930    if (dflag == -1)
931       rflags |= (1<<10);
932    if (vex_state->guest_IDFLAG == 1)
933       rflags |= (1<<21);
934    if (vex_state->guest_ACFLAG == 1)
935       rflags |= (1<<18);
936 
937    return rflags;
938 }
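
/* A minimal usage sketch from the client side (hypothetical driver code,
   not part of this file): to flip the guest carry flag via the full
   %rflags image, assuming a VexGuestAMD64State 'gst' already set up with
   LibVEX_GuestAMD64_initialise():

      ULong fl = LibVEX_GuestAMD64_get_rflags(&gst);
      LibVEX_GuestAMD64_put_rflags(fl ^ AMD64G_CC_MASK_C, &gst);

   For just the carry bit, LibVEX_GuestAMD64_put_rflag_c below achieves
   the same effect without touching the D, ID and AC state. */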
939 
940 /* VISIBLE TO LIBVEX CLIENT */
941 void
942 LibVEX_GuestAMD64_put_rflags ( ULong rflags,
943                                /*MOD*/VexGuestAMD64State* vex_state )
944 {
945    /* D flag */
946    if (rflags & AMD64G_CC_MASK_D) {
947       vex_state->guest_DFLAG = -1;
948       rflags &= ~AMD64G_CC_MASK_D;
949    }
950    else
951       vex_state->guest_DFLAG = 1;
952 
953    /* ID flag */
954    if (rflags & AMD64G_CC_MASK_ID) {
955       vex_state->guest_IDFLAG = 1;
956       rflags &= ~AMD64G_CC_MASK_ID;
957    }
958    else
959       vex_state->guest_IDFLAG = 0;
960 
961    /* AC flag */
962    if (rflags & AMD64G_CC_MASK_AC) {
963       vex_state->guest_ACFLAG = 1;
964       rflags &= ~AMD64G_CC_MASK_AC;
965    }
966    else
967       vex_state->guest_ACFLAG = 0;
968 
969    UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
970                   AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
971    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
972    vex_state->guest_CC_DEP1 = rflags & cc_mask;
973    vex_state->guest_CC_DEP2 = 0;
974    vex_state->guest_CC_NDEP = 0;
975 }
976 
977 /* VISIBLE TO LIBVEX CLIENT */
978 void
979 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
980                                /*MOD*/VexGuestAMD64State* vex_state )
981 {
982    ULong oszacp = amd64g_calculate_rflags_all_WRK(
983                      vex_state->guest_CC_OP,
984                      vex_state->guest_CC_DEP1,
985                      vex_state->guest_CC_DEP2,
986                      vex_state->guest_CC_NDEP
987                   );
988    if (new_carry_flag & 1) {
989       oszacp |= AMD64G_CC_MASK_C;
990    } else {
991       oszacp &= ~AMD64G_CC_MASK_C;
992    }
993    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
994    vex_state->guest_CC_DEP1 = oszacp;
995    vex_state->guest_CC_DEP2 = 0;
996    vex_state->guest_CC_NDEP = 0;
997 }
998 
999 
1000 /*---------------------------------------------------------------*/
1001 /*--- %rflags translation-time function specialisers.         ---*/
1002 /*--- These help iropt specialise calls the above run-time    ---*/
1003 /*--- %rflags functions.                                      ---*/
1004 /*---------------------------------------------------------------*/
1005 
1006 /* Used by the optimiser to try specialisations.  Returns an
1007    equivalent expression, or NULL if none. */
1008 
1009 static inline Bool isU64 ( IRExpr* e, ULong n )
1010 {
1011    return e->tag == Iex_Const
1012           && e->Iex.Const.con->tag == Ico_U64
1013           && e->Iex.Const.con->Ico.U64 == n;
1014 }
1015 
1016 /* Returns N if W64 is a value of the form 1 << N for N in 1 to 31,
1017    and zero in any other case. */
1018 static Int isU64_1_shl_N_literal ( ULong w64 )
1019 {
1020    if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
1021       return 0;
1022    if ((w64 & (w64 - 1)) != 0)
1023       return 0;
1024    /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
1025       and we only need to find out which one it is. */
1026    for (Int n = 1; n <= 31; n++) {
1027       if (w64 == (1ULL << n))
1028          return n;
1029    }
1030    /* Consequently we should never get here. */
1031    /*UNREACHED*/
1032    vassert(0);
1033    return 0;
1034 }
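
/* For example, isU64_1_shl_N_literal(0x100) == 8 and
   isU64_1_shl_N_literal(0x80000000) == 31, whereas 0, 1, any value with
   more than one bit set, and any single bit above bit 31 all yield 0. */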
1035 
1036 /* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
1037    and zero in any other case. */
1038 static Int isU64_1_shl_N ( IRExpr* e )
1039 {
1040    if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1041       return 0;
1042    ULong w64 = e->Iex.Const.con->Ico.U64;
1043    return isU64_1_shl_N_literal(w64);
1044 }
1045 
1046 /* Returns N if E is an immediate of the form (1 << N) - 1 for N in 1 to 31,
1047    and zero in any other case. */
1048 static Int isU64_1_shl_N_minus_1 ( IRExpr* e )
1049 {
1050   if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1051     return 0;
1052   ULong w64 = e->Iex.Const.con->Ico.U64;
1053   // This isn't actually necessary since isU64_1_shl_N_literal will return
1054   // zero given a zero argument, but still ..
1055   if (w64 == 0xFFFFFFFFFFFFFFFFULL)
1056      return 0;
1057   return isU64_1_shl_N_literal(w64 + 1);
1058 }
1059 
1060 IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1061                                  IRExpr** args,
1062                                  IRStmt** precedingStmts,
1063                                  Int      n_precedingStmts )
1064 {
1065 #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1066 #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1067 #  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1068 #  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1069 #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
1070 
1071    Int i, arity = 0;
1072    for (i = 0; args[i]; i++)
1073       arity++;
1074 #  if 0
1075    vex_printf("spec request:\n");
1076    vex_printf("   %s  ", function_name);
1077    for (i = 0; i < arity; i++) {
1078       vex_printf("  ");
1079       ppIRExpr(args[i]);
1080    }
1081    vex_printf("\n");
1082 #  endif
1083 
1084    /* --------- specialising "amd64g_calculate_condition" --------- */
1085 
1086    if (vex_streq(function_name, "amd64g_calculate_condition")) {
1087       /* specialise calls to above "calculate condition" function */
1088       IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1089       vassert(arity == 5);
1090       cond    = args[0];
1091       cc_op   = args[1];
1092       cc_dep1 = args[2];
1093       cc_dep2 = args[3];
1094 
1095       /*---------------- ADDQ ----------------*/
1096 
1097       if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1098          /* long long add, then Z --> test (dst+src == 0) */
1099          return unop(Iop_1Uto64,
1100                      binop(Iop_CmpEQ64,
1101                            binop(Iop_Add64, cc_dep1, cc_dep2),
1102                            mkU64(0)));
1103       }
1104 
1105       /*---------------- ADDL ----------------*/
1106 
1107       if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1108          /* This is very commonly generated by Javascript JITs, for
1109             the idiom "do a 32-bit add and jump to out-of-line code if
1110             an overflow occurs". */
1111          /* long add, then O (overflow)
1112             --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1113             --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1114             --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1115          */
1116          vassert(isIRAtom(cc_dep1));
1117          vassert(isIRAtom(cc_dep2));
1118          return
1119             binop(Iop_And64,
1120                   binop(Iop_Shr64,
1121                         binop(Iop_And64,
1122                               unop(Iop_Not64,
1123                                    binop(Iop_Xor64, cc_dep1, cc_dep2)),
1124                               binop(Iop_Xor64,
1125                                     cc_dep1,
1126                                     binop(Iop_Add64, cc_dep1, cc_dep2))),
1127                         mkU8(31)),
1128                   mkU64(1));
1129 
1130       }
1131 
1132       /*---------------- SUBQ ----------------*/
1133 
1134       /* 0, */
1135       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1136          /* long long sub/cmp, then O (overflow)
1137             --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1138             --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1139          */
1140          vassert(isIRAtom(cc_dep1));
1141          vassert(isIRAtom(cc_dep2));
1142          return binop(Iop_Shr64,
1143                       binop(Iop_And64,
1144                             binop(Iop_Xor64, cc_dep1, cc_dep2),
1145                             binop(Iop_Xor64,
1146                                   cc_dep1,
1147                                   binop(Iop_Sub64, cc_dep1, cc_dep2))),
1148                       mkU8(63));
1149       }
1150       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1151          /* No action.  Never yet found a test case. */
1152       }
1153 
1154       /* 2, 3 */
1155       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1156          /* long long sub/cmp, then B (unsigned less than)
1157             --> test dst <u src */
1158          return unop(Iop_1Uto64,
1159                      binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1160       }
1161       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1162          /* long long sub/cmp, then NB (unsigned greater than or equal)
1163             --> test src <=u dst */
1164          /* Note, args are opposite way round from the usual */
1165          return unop(Iop_1Uto64,
1166                      binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1167       }
1168 
1169       /* 4, 5 */
1170       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1171          /* long long sub/cmp, then Z --> test dst==src */
1172          return unop(Iop_1Uto64,
1173                      binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1174       }
1175       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1176          /* long long sub/cmp, then NZ --> test dst!=src */
1177          return unop(Iop_1Uto64,
1178                      binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1179       }
1180 
1181       /* 6, 7 */
1182       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1183          /* long long sub/cmp, then BE (unsigned less than or equal)
1184             --> test dst <=u src */
1185          return unop(Iop_1Uto64,
1186                      binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1187       }
1188       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1189          /* long long sub/cmp, then NBE (unsigned greater than)
1190             --> test !(dst <=u src) */
1191          return binop(Iop_Xor64,
1192                       unop(Iop_1Uto64,
1193                            binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1194                       mkU64(1));
1195       }
1196 
1197       /* 8, 9 */
1198       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1199          /* long long sub/cmp, then S (negative)
1200             --> (dst-src)[63]
1201             --> (dst-src) >>u 63 */
1202          return binop(Iop_Shr64,
1203                       binop(Iop_Sub64, cc_dep1, cc_dep2),
1204                       mkU8(63));
1205       }
1206       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1207          /* long long sub/cmp, then NS (not negative)
1208             --> (dst-src)[63] ^ 1
1209             --> ((dst-src) >>u 63) ^ 1 */
1210          return binop(Iop_Xor64,
1211                       binop(Iop_Shr64,
1212                             binop(Iop_Sub64, cc_dep1, cc_dep2),
1213                             mkU8(63)),
1214                       mkU64(1));
1215       }
1216 
1217       /* 12, 13 */
1218       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1219          /* long long sub/cmp, then L (signed less than)
1220             --> test dst <s src */
1221          return unop(Iop_1Uto64,
1222                      binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1223       }
1224       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1225          /* long long sub/cmp, then NL (signed greater than or equal)
1226             --> test dst >=s src
1227             --> test src <=s dst */
1228          return unop(Iop_1Uto64,
1229                      binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1230       }
1231 
1232       /* 14, 15 */
1233       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1234          /* long long sub/cmp, then LE (signed less than or equal)
1235             --> test dst <=s src */
1236          return unop(Iop_1Uto64,
1237                      binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1238       }
1239       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1240          /* long sub/cmp, then NLE (signed greater than)
1241             --> test !(dst <=s src)
1242             --> test (dst >s src)
1243             --> test (src <s dst) */
1244          return unop(Iop_1Uto64,
1245                      binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1246 
1247       }
1248 
1249       /*---------------- SUBL ----------------*/
1250 
1251       /* 0, */
1252       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1253          /* This is very commonly generated by Javascript JITs, for
1254             the idiom "do a 32-bit subtract and jump to out-of-line
1255             code if an overflow occurs". */
1256          /* long sub/cmp, then O (overflow)
1257             --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1258             --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1259          */
1260          vassert(isIRAtom(cc_dep1));
1261          vassert(isIRAtom(cc_dep2));
1262          return
1263             binop(Iop_And64,
1264                   binop(Iop_Shr64,
1265                         binop(Iop_And64,
1266                               binop(Iop_Xor64, cc_dep1, cc_dep2),
1267                               binop(Iop_Xor64,
1268                                     cc_dep1,
1269                                     binop(Iop_Sub64, cc_dep1, cc_dep2))),
1270                         mkU8(31)),
1271                   mkU64(1));
1272       }
1273       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1274          /* No action.  Never yet found a test case. */
1275       }
1276 
1277       /* 2, 3 */
1278       {
1279         /* It appears that LLVM 5.0 and later have a new way to find out
1280            whether the top N bits of a word W are all zero, by computing
1281 
1282              W  <u   0---(N-1)---0 1 0---0  or
1283              W  <=u  0---(N-1)---0 0 1---1
1284 
1285            In particular, the result will be defined if the top N bits of W
1286            are defined, even if the trailing bits -- those corresponding to
1287            the rightmost 0---0 / 1---1 section -- are undefined.  Rather than
1288            make Memcheck more complex, we detect this case where we can and
1289            shift out the irrelevant and potentially undefined bits. */
1290         Int n = 0;
1291         Bool is_NB_or_NBE = False;
1292         if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1293            if (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB)) {
1294               /* long sub/cmp, then B (unsigned less than),
1295                  where dep2 is a power of 2:
1296                    -> CmpLT32U(dep1, 1 << N)
1297                    -> CmpEQ32(dep1 >>u N, 0)
1298                  and
1299                  long sub/cmp, then NB (unsigned greater than or equal),
1300                  where dep2 is a power of 2:
1301                    -> CmpGE32U(dep1, 1 << N)
1302                    -> CmpNE32(dep1 >>u N, 0)
1303                  This avoids CmpLT32U/CmpGE32U being applied to potentially
1304                  uninitialised bits in the area being shifted out. */
1305               n = isU64_1_shl_N(cc_dep2);
1306               is_NB_or_NBE = isU64(cond, AMD64CondNB);
1307            } else if (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE)) {
1308               /* long sub/cmp, then BE (unsigned less than or equal),
1309                  where dep2 is a power of 2 minus 1:
1310                   -> CmpLE32U(dep1, (1 << N) - 1)
1311                   -> CmpEQ32(dep1 >>u N, 0)
1312                  and
1313                  long sub/cmp, then NBE (unsigned greater than),
1314                  where dep2 is a power of 2 minus 1:
1315                    -> CmpGT32U(dep1, (1 << N) - 1)
1316                    -> CmpNE32(dep1 >>u N, 0)
1317                  This avoids CmpLE32U/CmpGT32U being applied to potentially
1318                  uninitialised bits in the area being shifted out. */
1319               n = isU64_1_shl_N_minus_1(cc_dep2);
1320               is_NB_or_NBE = isU64(cond, AMD64CondNBE);
1321            }
1322         }
1323         if (n > 0) {
1324            vassert(n >= 1 && n <= 31);
1325            return unop(Iop_1Uto64,
1326                        binop(is_NB_or_NBE ? Iop_CmpNE32 : Iop_CmpEQ32,
1327                              binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
1328                                               mkU8(n)),
1329                              mkU32(0)));
1330         }
1331       }
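
      /* Concrete instance of the idiom above: for a 32-bit compare such as
         "cmpl $0x10000, %eax ; jb ..." the thunk has cc_dep2 == 0x10000
         == 1 << 16, so n == 16 and the condition is rewritten as
         CmpEQ32(eax >>u 16, 0), i.e. "are the top 16 bits of %eax all
         zero", which is defined whenever those top 16 bits are defined,
         regardless of the low 16. */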
1332       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1333          /* long sub/cmp, then B (unsigned less than)
1334             --> test dst <u src */
1335          return unop(Iop_1Uto64,
1336                      binop(Iop_CmpLT32U,
1337                            unop(Iop_64to32, cc_dep1),
1338                            unop(Iop_64to32, cc_dep2)));
1339       }
1340       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1341          /* long sub/cmp, then NB (unsigned greater than or equal)
1342             --> test src <=u dst */
1343          /* Note, args are opposite way round from the usual */
1344          return unop(Iop_1Uto64,
1345                      binop(Iop_CmpLE32U,
1346                            unop(Iop_64to32, cc_dep2),
1347                            unop(Iop_64to32, cc_dep1)));
1348       }
1349 
1350       /* 4, 5 */
1351       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1352          /* long sub/cmp, then Z --> test dst==src */
1353          return unop(Iop_1Uto64,
1354                      binop(Iop_CmpEQ32,
1355                            unop(Iop_64to32, cc_dep1),
1356                            unop(Iop_64to32, cc_dep2)));
1357       }
1358       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1359          /* long sub/cmp, then NZ --> test dst!=src */
1360          return unop(Iop_1Uto64,
1361                      binop(Iop_CmpNE32,
1362                            unop(Iop_64to32, cc_dep1),
1363                            unop(Iop_64to32, cc_dep2)));
1364       }
1365 
1366       /* 6, 7 */
1367       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1368          /* long sub/cmp, then BE (unsigned less than or equal)
1369             --> test dst <=u src */
1370          return unop(Iop_1Uto64,
1371                      binop(Iop_CmpLE32U,
1372                            unop(Iop_64to32, cc_dep1),
1373                            unop(Iop_64to32, cc_dep2)));
1374       }
1375       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1376          /* long sub/cmp, then NBE (unsigned greater than)
1377             --> test src <u dst */
1378          /* Note, args are opposite way round from the usual */
1379          return unop(Iop_1Uto64,
1380                      binop(Iop_CmpLT32U,
1381                            unop(Iop_64to32, cc_dep2),
1382                            unop(Iop_64to32, cc_dep1)));
1383       }
1384 
1385       /* 8, 9 */
1386       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1387          /* long sub/cmp, then S (negative)
1388             --> (dst-src)[31]
1389             --> ((dst -64 src) >>u 31) & 1
1390             Pointless to narrow the args to 32 bit before the subtract. */
1391          return binop(Iop_And64,
1392                       binop(Iop_Shr64,
1393                             binop(Iop_Sub64, cc_dep1, cc_dep2),
1394                             mkU8(31)),
1395                       mkU64(1));
1396       }
1397       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1398          /* long sub/cmp, then NS (not negative)
1399             --> (dst-src)[31] ^ 1
1400             --> (((dst -64 src) >>u 31) & 1) ^ 1
1401             Pointless to narrow the args to 32 bit before the subtract. */
1402          return binop(Iop_Xor64,
1403                       binop(Iop_And64,
1404                             binop(Iop_Shr64,
1405                                   binop(Iop_Sub64, cc_dep1, cc_dep2),
1406                                   mkU8(31)),
1407                             mkU64(1)),
1408                       mkU64(1));
1409       }
1410 
1411       /* 12, 13 */
1412       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1413          /* long sub/cmp, then L (signed less than)
1414             --> test dst <s src */
1415          return unop(Iop_1Uto64,
1416                      binop(Iop_CmpLT32S,
1417                            unop(Iop_64to32, cc_dep1),
1418                            unop(Iop_64to32, cc_dep2)));
1419       }
1420       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1421          /* long sub/cmp, then NL (signed greater than or equal)
1422             --> test dst >=s src
1423             --> test src <=s dst */
1424          return unop(Iop_1Uto64,
1425                      binop(Iop_CmpLE32S,
1426                            unop(Iop_64to32, cc_dep2),
1427                            unop(Iop_64to32, cc_dep1)));
1428       }
1429 
1430       /* 14, 15 */
1431       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1432          /* long sub/cmp, then LE (signed less than or equal)
1433             --> test dst <=s src */
1434          return unop(Iop_1Uto64,
1435                      binop(Iop_CmpLE32S,
1436                            unop(Iop_64to32, cc_dep1),
1437                            unop(Iop_64to32, cc_dep2)));
1438 
1439       }
1440       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1441          /* long sub/cmp, then NLE (signed greater than)
1442             --> test !(dst <=s src)
1443             --> test (dst >s src)
1444             --> test (src <s dst) */
1445          return unop(Iop_1Uto64,
1446                      binop(Iop_CmpLT32S,
1447                            unop(Iop_64to32, cc_dep2),
1448                            unop(Iop_64to32, cc_dep1)));
1449 
1450       }
1451 
1452       /*---------------- SUBW ----------------*/
1453 
1454       /* 4, 5 */
1455       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1456          /* word sub/cmp, then Z --> test dst==src */
1457          return unop(Iop_1Uto64,
1458                      binop(Iop_CmpEQ16,
1459                            unop(Iop_64to16,cc_dep1),
1460                            unop(Iop_64to16,cc_dep2)));
1461       }
1462       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1463          /* word sub/cmp, then NZ --> test dst!=src */
1464          return unop(Iop_1Uto64,
1465                      binop(Iop_CmpNE16,
1466                            unop(Iop_64to16,cc_dep1),
1467                            unop(Iop_64to16,cc_dep2)));
1468       }
1469 
1470       /* 6, */
1471       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1472          /* word sub/cmp, then BE (unsigned less than or equal)
1473             --> test dst <=u src */
1474          return unop(Iop_1Uto64,
1475                      binop(Iop_CmpLE64U,
1476                            binop(Iop_Shl64, cc_dep1, mkU8(48)),
1477                            binop(Iop_Shl64, cc_dep2, mkU8(48))));
1478       }
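      /* The Shl64-by-48 trick used here moves the 16-bit operands into
         bits 63..48, so an unsigned 64-bit comparison orders them exactly
         as a 16-bit unsigned comparison would, while whatever sits in the
         (possibly undefined) upper bits of cc_dep1/cc_dep2 is shifted out
         entirely. */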
1479 
1480       /* 8, 9 */
1481       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1482                                           && isU64(cc_dep2, 0)) {
1483          /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1484                                          --> test dst <s 0
1485                                          --> (ULong)dst[15]
1486             This is yet another scheme by which clang figures out if the
1487             top bit of a word is 1 or 0.  See also LOGICB/CondS below. */
1488          /* Note: isU64(cc_dep2, 0) is correct, even though this is
1489            for a 16-bit comparison, since the args to the helper
1490             function are always U64s. */
1491          return binop(Iop_And64,
1492                       binop(Iop_Shr64,cc_dep1,mkU8(15)),
1493                       mkU64(1));
1494       }
1495       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1496                                           && isU64(cc_dep2, 0)) {
1497          /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1498                                           --> test !(dst <s 0)
1499                                           --> (ULong) !dst[15]
1500          */
1501          return binop(Iop_Xor64,
1502                       binop(Iop_And64,
1503                             binop(Iop_Shr64,cc_dep1,mkU8(15)),
1504                             mkU64(1)),
1505                       mkU64(1));
1506       }
1507 
1508       /* 14, */
1509       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1510          /* word sub/cmp, then LE (signed less than or equal)
1511             --> test dst <=s src */
1512          return unop(Iop_1Uto64,
1513                      binop(Iop_CmpLE64S,
1514                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
1515                            binop(Iop_Shl64,cc_dep2,mkU8(48))));
1516 
1517       }
1518 
1519       /*---------------- SUBB ----------------*/
1520 
1521       /* 2, 3 */
1522       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1523          /* byte sub/cmp, then B (unsigned less than)
1524             --> test dst <u src */
1525          return unop(Iop_1Uto64,
1526                      binop(Iop_CmpLT64U,
1527                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1528                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1529       }
1530       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1531          /* byte sub/cmp, then NB (unsigned greater than or equal)
1532             --> test src <=u dst */
1533          /* Note, args are opposite way round from the usual */
1534          return unop(Iop_1Uto64,
1535                      binop(Iop_CmpLE64U,
1536                            binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1537                            binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1538       }
1539 
1540       /* 4, 5 */
1541       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1542          /* byte sub/cmp, then Z --> test dst==src */
1543          return unop(Iop_1Uto64,
1544                      binop(Iop_CmpEQ8,
1545                            unop(Iop_64to8,cc_dep1),
1546                            unop(Iop_64to8,cc_dep2)));
1547       }
1548       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1549          /* byte sub/cmp, then NZ --> test dst!=src */
1550          return unop(Iop_1Uto64,
1551                      binop(Iop_CmpNE8,
1552                            unop(Iop_64to8,cc_dep1),
1553                            unop(Iop_64to8,cc_dep2)));
1554       }
1555 
1556       /* 6, */
1557       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1558          /* byte sub/cmp, then BE (unsigned less than or equal)
1559             --> test dst <=u src */
1560          return unop(Iop_1Uto64,
1561                      binop(Iop_CmpLE64U,
1562                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1563                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1564       }
1565 
1566       /* 8, 9 */
1567       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1568                                           && isU64(cc_dep2, 0)) {
1569          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1570                                          --> test dst <s 0
1571                                          --> (ULong)dst[7]
1572             This is yet another scheme by which gcc figures out if the
1573             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
1574          /* Note: isU64(cc_dep2, 0) is correct, even though this is
1575             for an 8-bit comparison, since the args to the helper
1576             function are always U64s. */
1577          return binop(Iop_And64,
1578                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
1579                       mkU64(1));
1580       }
1581       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1582                                           && isU64(cc_dep2, 0)) {
1583          /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1584                                           --> test !(dst <s 0)
1585                                           --> (ULong) !dst[7]
1586          */
1587          return binop(Iop_Xor64,
1588                       binop(Iop_And64,
1589                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
1590                             mkU64(1)),
1591                       mkU64(1));
1592       }
1593 
1594       /*---------------- LOGICQ ----------------*/
1595 
1596       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1597          /* long long and/or/xor, then Z --> test dst==0 */
1598          return unop(Iop_1Uto64,
1599                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1600       }
1601       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1602          /* long long and/or/xor, then NZ --> test dst!=0 */
1603          return unop(Iop_1Uto64,
1604                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1605       }
1606 
1607       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1608          /* long long and/or/xor, then L
1609             LOGIC sets SF and ZF according to the
1610             result and makes OF be zero.  L computes SF ^ OF, but
1611             OF is zero, so this reduces to SF -- which will be 1 iff
1612             the result is < signed 0.  Hence ...
1613          */
1614          return unop(Iop_1Uto64,
1615                      binop(Iop_CmpLT64S,
1616                            cc_dep1,
1617                            mkU64(0)));
1618       }
1619 
1620       /*---------------- LOGICL ----------------*/
1621 
1622       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1623          /* long and/or/xor, then Z --> test dst==0 */
1624          return unop(Iop_1Uto64,
1625                      binop(Iop_CmpEQ32,
1626                            unop(Iop_64to32, cc_dep1),
1627                            mkU32(0)));
1628       }
1629       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1630          /* long and/or/xor, then NZ --> test dst!=0 */
1631          return unop(Iop_1Uto64,
1632                      binop(Iop_CmpNE32,
1633                            unop(Iop_64to32, cc_dep1),
1634                            mkU32(0)));
1635       }
1636 
1637       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1638          /* long and/or/xor, then LE
1639             This is pretty subtle.  LOGIC sets SF and ZF according to the
1640             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
1641             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1642             the result is <=signed 0.  Hence ...
1643          */
1644          return unop(Iop_1Uto64,
1645                      binop(Iop_CmpLE32S,
1646                            unop(Iop_64to32, cc_dep1),
1647                            mkU32(0)));
1648       }
1649 
1650       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1651          /* long and/or/xor, then S --> (ULong)result[31] */
1652          return binop(Iop_And64,
1653                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
1654                       mkU64(1));
1655       }
1656       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1657          /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1658          return binop(Iop_Xor64,
1659                 binop(Iop_And64,
1660                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
1661                       mkU64(1)),
1662                 mkU64(1));
1663       }
1664 
1665       /*---------------- LOGICW ----------------*/
1666 
1667       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1668          /* word and/or/xor, then Z --> test dst==0 */
1669          // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1670          // it exactly at EdcAUTO.
1671          return unop(Iop_1Uto64,
1672                      binop(Iop_CmpEQ32,
1673                            unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1674                            mkU32(0)));
1675       }
1676       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1677          /* word and/or/xor, then NZ --> test dst!=0 */
1678          // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1679          // it exactly at EdcAUTO.
1680          return unop(Iop_1Uto64,
1681                      binop(Iop_CmpNE32,
1682                            unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1683                            mkU32(0)));
1684       }
1685 
1686       /*---------------- LOGICB ----------------*/
1687 
1688       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1689          /* byte and/or/xor, then Z --> test dst==0 */
1690          // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1691          // it exactly at EdcAUTO.
1692          return unop(Iop_1Uto64,
1693                      binop(Iop_CmpEQ32,
1694                            unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1695                            mkU32(0)));
1696       }
1697       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1698          /* byte and/or/xor, then NZ --> test dst!=0 */
1699          // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1700          // it exactly at EdcAUTO.
1701          return unop(Iop_1Uto64,
1702                      binop(Iop_CmpNE32,
1703                            unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1704                            mkU32(0)));
1705       }
1706 
1707       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1708          /* this is an idiom gcc sometimes uses to find out if the top
1709             bit of a byte register is set: eg testb %al,%al; js ..
1710             Since it just depends on the top bit of the byte, extract
1711             that bit and explicitly get rid of all the rest.  This
1712             helps memcheck avoid false positives in the case where any
1713             of the other bits in the byte are undefined. */
1714          /* byte and/or/xor, then S --> (UInt)result[7] */
1715          return binop(Iop_And64,
1716                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
1717                       mkU64(1));
1718       }
1719       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1720          /* byte and/or/xor, then NS --> (UInt)!result[7] */
1721          return binop(Iop_Xor64,
1722                       binop(Iop_And64,
1723                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
1724                             mkU64(1)),
1725                       mkU64(1));
1726       }
1727 
1728       /*---------------- INCB ----------------*/
1729 
1730       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1731          /* 8-bit inc, then LE --> sign bit of the arg */
1732          return binop(Iop_And64,
1733                       binop(Iop_Shr64,
1734                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
1735                             mkU8(7)),
1736                       mkU64(1));
1737       }
1738 
1739       /*---------------- INCW ----------------*/
1740 
1741       if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1742          /* 16-bit inc, then Z --> test dst == 0 */
1743          return unop(Iop_1Uto64,
1744                      binop(Iop_CmpEQ64,
1745                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
1746                            mkU64(0)));
1747       }
1748 
1749       /*---------------- DECL ----------------*/
1750 
1751       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1752          /* dec L, then Z --> test dst == 0 */
1753          return unop(Iop_1Uto64,
1754                      binop(Iop_CmpEQ32,
1755                            unop(Iop_64to32, cc_dep1),
1756                            mkU32(0)));
1757       }
1758 
1759       /*---------------- DECW ----------------*/
1760 
1761       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1762          /* 16-bit dec, then NZ --> test dst != 0 */
1763          return unop(Iop_1Uto64,
1764                      binop(Iop_CmpNE64,
1765                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
1766                            mkU64(0)));
1767       }
1768 
1769       /*---------------- SHRQ ----------------*/
1770 
1771       if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1772          /* SHRQ, then Z --> test dep1 == 0 */
1773          return unop(Iop_1Uto64,
1774                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1775       }
1776       if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1777          /* SHRQ, then NZ --> test dep1 != 0 */
1778          return unop(Iop_1Uto64,
1779                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1780       }
1781 
1782       /*---------------- SHRL ----------------*/
1783 
1784       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1785          /* SHRL, then Z --> test dep1 == 0 */
1786          return unop(Iop_1Uto64,
1787                      binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1788                            mkU32(0)));
1789       }
1790       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1791          /* SHRL, then NZ --> test dep1 != 0 */
1792          return unop(Iop_1Uto64,
1793                      binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1794                            mkU32(0)));
1795       }
1796 
1797       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1798          /* SHRL/SARL, then S --> (ULong)result[31] */
1799          return binop(Iop_And64,
1800                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
1801                       mkU64(1));
1802       }
1803       // The following looks correct to me, but never seems to happen because
1804       // the front end converts jns to js by switching the fallthrough vs
1805       // taken addresses.  See jcc_01().  But then why do other conditions
1806       // considered by this function show up in both variants (xx and Nxx) ?
1807       //if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1808       //   /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1809       //   vassert(0);
1810       //   return binop(Iop_Xor64,
1811       //                binop(Iop_And64,
1812       //                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
1813       //                      mkU64(1)),
1814       //                mkU64(1));
1815       //}
1816 
1817       /*---------------- COPY ----------------*/
1818       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1819          jbe" for example. */
1820 
1821       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1822           && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1823          /* COPY, then BE --> extract C and Z from dep1, and test (C
1824             or Z == 1). */
1825          /* COPY, then NBE --> extract C and Z from dep1, and test (C
1826             or Z == 0). */
1827          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1828          return
1829             unop(
1830                Iop_1Uto64,
1831                binop(
1832                   Iop_CmpEQ64,
1833                   binop(
1834                      Iop_And64,
1835                      binop(
1836                         Iop_Or64,
1837                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1838                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1839                      ),
1840                      mkU64(1)
1841                   ),
1842                   mkU64(nnn)
1843                )
1844             );
1845       }
1846 
1847       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1848           && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
1849          /* COPY, then B --> extract C from dep1, and test (C == 1). */
1850          /* COPY, then NB --> extract C from dep1, and test (C == 0). */
1851          ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
1852          return
1853             unop(
1854                Iop_1Uto64,
1855                binop(
1856                   Iop_CmpEQ64,
1857                   binop(
1858                      Iop_And64,
1859                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1860                      mkU64(1)
1861                   ),
1862                   mkU64(nnn)
1863                )
1864             );
1865       }
1866 
1867       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1868           && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1869          /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1870          /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1871          ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1872          return
1873             unop(
1874                Iop_1Uto64,
1875                binop(
1876                   Iop_CmpEQ64,
1877                   binop(
1878                      Iop_And64,
1879                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1880                      mkU64(1)
1881                   ),
1882                   mkU64(nnn)
1883                )
1884             );
1885       }
1886 
1887       if (isU64(cc_op, AMD64G_CC_OP_COPY)
1888           && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
1889          /* COPY, then P --> extract P from dep1, and test (P == 1). */
1890          /* COPY, then NP --> extract P from dep1, and test (P == 0). */
1891          ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
1892          return
1893             unop(
1894                Iop_1Uto64,
1895                binop(
1896                   Iop_CmpEQ64,
1897                   binop(
1898                      Iop_And64,
1899                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1900                      mkU64(1)
1901                   ),
1902                   mkU64(nnn)
1903                )
1904             );
1905       }
1906 
1907       return NULL;
1908    }
1909 
1910    /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1911 
1912    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1913       /* specialise calls to above "calculate_rflags_c" function */
1914       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1915       vassert(arity == 4);
1916       cc_op   = args[0];
1917       cc_dep1 = args[1];
1918       cc_dep2 = args[2];
1919       cc_ndep = args[3];
1920 
1921       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1922          /* C after sub denotes unsigned less than */
1923          return unop(Iop_1Uto64,
1924                      binop(Iop_CmpLT64U,
1925                            cc_dep1,
1926                            cc_dep2));
1927       }
1928       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1929          /* C after sub denotes unsigned less than */
1930          return unop(Iop_1Uto64,
1931                      binop(Iop_CmpLT32U,
1932                            unop(Iop_64to32, cc_dep1),
1933                            unop(Iop_64to32, cc_dep2)));
1934       }
1935       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1936          /* C after sub denotes unsigned less than */
1937          return unop(Iop_1Uto64,
1938                      binop(Iop_CmpLT64U,
1939                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1940                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1941       }
1942       if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
1943          /* C after add denotes sum <u either arg */
1944          return unop(Iop_1Uto64,
1945                      binop(Iop_CmpLT64U,
1946                            binop(Iop_Add64, cc_dep1, cc_dep2),
1947                            cc_dep1));
1948       }
1949       if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
1950          /* C after add denotes sum <u either arg */
1951          return unop(Iop_1Uto64,
1952                      binop(Iop_CmpLT32U,
1953                            unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1954                            unop(Iop_64to32, cc_dep1)));
1955       }
1956       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1957           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1958           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1959           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1960          /* cflag after logic is zero */
1961          return mkU64(0);
1962       }
1963       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1964           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1965          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1966          return cc_ndep;
1967       }
1968 
1969 #     if 0
1970       if (cc_op->tag == Iex_Const) {
1971          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1972       }
1973 #     endif
1974 
1975       return NULL;
1976    }
1977 
1978 #  undef unop
1979 #  undef binop
1980 #  undef mkU64
1981 #  undef mkU32
1982 #  undef mkU8
1983 
1984    return NULL;
1985 }
1986 
1987 
1988 /*---------------------------------------------------------------*/
1989 /*--- Supporting functions for x87 FPU activities.            ---*/
1990 /*---------------------------------------------------------------*/
1991 
1992 static inline Bool host_is_little_endian ( void )
1993 {
1994    UInt x = 0x76543210;
1995    UChar* p = (UChar*)(&x);
1996    return toBool(*p == 0x10);
1997 }
1998 
1999 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
2000 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
2001 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
2002 {
2003    Bool   mantissaIsZero;
2004    Int    bexp;
2005    UChar  sign;
2006    UChar* f64;
2007 
2008    vassert(host_is_little_endian());
2009 
2010    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
2011 
2012    f64  = (UChar*)(&dbl);
2013    sign = toUChar( (f64[7] >> 7) & 1 );
2014 
2015    /* First off, if the tag indicates the register was empty,
2016       return 1,0,sign,1 */
2017    if (tag == 0) {
2018       /* vex_printf("Empty\n"); */
2019       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
2020                                    | AMD64G_FC_MASK_C0;
2021    }
2022 
2023    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
2024    bexp &= 0x7FF;
2025 
2026    mantissaIsZero
2027       = toBool(
2028            (f64[6] & 0x0F) == 0
2029            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
2030         );
2031 
2032    /* If both exponent and mantissa are zero, the value is zero.
2033       Return 1,0,sign,0. */
2034    if (bexp == 0 && mantissaIsZero) {
2035       /* vex_printf("Zero\n"); */
2036       return AMD64G_FC_MASK_C3 | 0
2037                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
2038    }
2039 
2040    /* If exponent is zero but mantissa isn't, it's a denormal.
2041       Return 1,1,sign,0. */
2042    if (bexp == 0 && !mantissaIsZero) {
2043       /* vex_printf("Denormal\n"); */
2044       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
2045                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
2046    }
2047 
2048    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
2049       Return 0,1,sign,1. */
2050    if (bexp == 0x7FF && mantissaIsZero) {
2051       /* vex_printf("Inf\n"); */
2052       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2053                                    | AMD64G_FC_MASK_C0;
2054    }
2055 
2056    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2057       Return 0,0,sign,1. */
2058    if (bexp == 0x7FF && !mantissaIsZero) {
2059       /* vex_printf("NaN\n"); */
2060       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2061    }
2062 
2063    /* Uh, ok, we give up.  It must be a normal finite number.
2064       Return 0,1,sign,0.
2065    */
2066    /* vex_printf("normal\n"); */
2067    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2068 }
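/* Worked examples for the classification above; a sketch that is not
   part of the build.  The hex constants are the IEEE754 double encodings
   of the values named in the comments. */
#if 0
static void show_FXAM_examples ( void )
{
   /* Empty register (tag == 0): C3=1, C1=sign, C0=1. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(0, 0));
   /* +0.0 -- exponent and mantissa both zero: C3=1 only. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(1, 0x0000000000000000ULL));
   /* 1.0 == 0x3FF0000000000000 -- normal finite number: C2=1 only. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(1, 0x3FF0000000000000ULL));
   /* +Inf == 0x7FF0000000000000: C2=1 and C0=1. */
   vex_printf("%llx\n", amd64g_calculate_FXAM(1, 0x7FF0000000000000ULL));
}
#endif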
2069 
2070 
2071 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
2072    appears to differ from the former only in that the 8 FP registers
2073    themselves are not transferred into the guest state. */
2074 static
2075 VexEmNote do_put_x87 ( Bool moveRegs,
2076                        /*IN*/Fpu_State* x87_state,
2077                        /*OUT*/VexGuestAMD64State* vex_state )
2078 {
2079    Int        stno, preg;
2080    UInt       tag;
2081    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2082    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2083    UInt       ftop    = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2084    UInt       tagw    = x87_state->env[FP_ENV_TAG];
2085    UInt       fpucw   = x87_state->env[FP_ENV_CTRL];
2086    UInt       c3210   = x87_state->env[FP_ENV_STAT] & 0x4700;
2087    VexEmNote  ew;
2088    UInt       fpround;
2089    ULong      pair;
2090 
2091    /* Copy registers and tags */
2092    for (stno = 0; stno < 8; stno++) {
2093       preg = (stno + ftop) & 7;
2094       tag = (tagw >> (2*preg)) & 3;
2095       if (tag == 3) {
2096          /* register is empty */
2097          /* hmm, if it's empty, does it still get written?  Probably
2098             safer to say it does.  If we don't, memcheck could get out
2099             of sync, in that it thinks all FP registers are defined by
2100             this helper, but in reality some have not been updated. */
2101          if (moveRegs)
2102             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2103          vexTags[preg] = 0;
2104       } else {
2105          /* register is non-empty */
2106          if (moveRegs)
2107             convert_f80le_to_f64le( &x87_state->reg[10*stno],
2108                                     (UChar*)&vexRegs[preg] );
2109          vexTags[preg] = 1;
2110       }
2111    }
2112 
2113    /* stack pointer */
2114    vex_state->guest_FTOP = ftop;
2115 
2116    /* status word */
2117    vex_state->guest_FC3210 = c3210;
2118 
2119    /* handle the control word, setting FPROUND and detecting any
2120       emulation warnings. */
2121    pair    = amd64g_check_fldcw ( (ULong)fpucw );
2122    fpround = (UInt)pair & 0xFFFFFFFFULL;
2123    ew      = (VexEmNote)(pair >> 32);
2124 
2125    vex_state->guest_FPROUND = fpround & 3;
2126 
2127    /* emulation warnings --> caller */
2128    return ew;
2129 }
2130 
2131 
2132 /* Create an x87 FPU state from the guest state, as close as
2133    we can approximate it. */
2134 static
2135 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2136                   /*OUT*/Fpu_State* x87_state )
2137 {
2138    Int        i, stno, preg;
2139    UInt       tagw;
2140    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2141    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2142    UInt       ftop    = vex_state->guest_FTOP;
2143    UInt       c3210   = vex_state->guest_FC3210;
2144 
2145    for (i = 0; i < 14; i++)
2146       x87_state->env[i] = 0;
2147 
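   /* env[] is the 28-byte protected-mode FSTENV/FLDENV image viewed as
      14 UShorts.  The control, status and tag words occupy only the low
      half of their 4-byte slots, so the unused pad halves (env[1],
      env[3], env[5]) and the pad above the data selector (env[13]) are
      filled with 0xFFFF below; the FP instruction/data pointer fields
      are left at zero since VEX does not track them. */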
2148    x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2149       = x87_state->env[13] = 0xFFFF;
2150    x87_state->env[FP_ENV_STAT]
2151       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2152    x87_state->env[FP_ENV_CTRL]
2153       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2154 
2155    /* Dump the register stack in ST order. */
2156    tagw = 0;
2157    for (stno = 0; stno < 8; stno++) {
2158       preg = (stno + ftop) & 7;
2159       if (vexTags[preg] == 0) {
2160          /* register is empty */
2161          tagw |= (3 << (2*preg));
2162          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2163                                  &x87_state->reg[10*stno] );
2164       } else {
2165          /* register is full. */
2166          tagw |= (0 << (2*preg));
2167          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2168                                  &x87_state->reg[10*stno] );
2169       }
2170    }
2171    x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2172 }
2173 
2174 
2175 /*---------------------------------------------------------------*/
2176 /*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
2177 /*---------------------------------------------------------------*/
2178 
2179 /* CALLED FROM GENERATED CODE */
2180 /* DIRTY HELPER (reads guest state, writes guest mem) */
2181 /* XSAVE component 0 is the x87 FPU state. */
2182 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2183         ( VexGuestAMD64State* gst, HWord addr )
2184 {
2185    /* Derived from values obtained from
2186       vendor_id       : AuthenticAMD
2187       cpu family      : 15
2188       model           : 12
2189       model name      : AMD Athlon(tm) 64 Processor 3200+
2190       stepping        : 0
2191       cpu MHz         : 2200.000
2192       cache size      : 512 KB
2193    */
2194    /* Somewhat roundabout, but at least it's simple. */
2195    Fpu_State tmp;
2196    UShort*   addrS = (UShort*)addr;
2197    UChar*    addrC = (UChar*)addr;
2198    UShort    fp_tags;
2199    UInt      summary_tags;
2200    Int       r, stno;
2201    UShort    *srcS, *dstS;
2202 
2203    do_get_x87( gst, &tmp );
2204 
2205    /* Now build the proper fxsave x87 image from the fsave x87 image
2206       we just made. */
2207 
2208    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2209    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2210 
2211    /* set addrS[2] in an endian-independent way */
2212    summary_tags = 0;
2213    fp_tags = tmp.env[FP_ENV_TAG];
2214    for (r = 0; r < 8; r++) {
2215       if ( ((fp_tags >> (2*r)) & 3) != 3 )
2216          summary_tags |= (1 << r);
2217    }
2218    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
2219    addrC[5]  = 0; /* pad */
2220 
2221    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
2222       does not write this field. (?!) */
2223    addrS[3]  = 0; /* BOGUS */
2224 
2225    /* RIP (Last x87 instruction pointer).  From experimentation, the
2226       real CPU does not write this field. (?!) */
2227    addrS[4]  = 0; /* BOGUS */
2228    addrS[5]  = 0; /* BOGUS */
2229    addrS[6]  = 0; /* BOGUS */
2230    addrS[7]  = 0; /* BOGUS */
2231 
2232    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
2233       does not write this field. (?!) */
2234    addrS[8]  = 0; /* BOGUS */
2235    addrS[9]  = 0; /* BOGUS */
2236    addrS[10] = 0; /* BOGUS */
2237    addrS[11] = 0; /* BOGUS */
2238 
2239    /* addrS[13,12] are MXCSR -- not written */
2240    /* addrS[15,14] are MXCSR_MASK -- not written */
2241 
2242    /* Copy in the FP registers, in ST order. */
2243    for (stno = 0; stno < 8; stno++) {
2244       srcS = (UShort*)(&tmp.reg[10*stno]);
2245       dstS = (UShort*)(&addrS[16 + 8*stno]);
2246       dstS[0] = srcS[0];
2247       dstS[1] = srcS[1];
2248       dstS[2] = srcS[2];
2249       dstS[3] = srcS[3];
2250       dstS[4] = srcS[4];
2251       dstS[5] = 0;
2252       dstS[6] = 0;
2253       dstS[7] = 0;
2254    }
2255 }
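/* For reference, the byte layout produced above (the first 160 bytes of
   the FXSAVE/XSAVE image), as implied by the stores in this helper:
      bytes   0..1     FCW
      bytes   2..3     FSW
      byte    4        FTW summary byte (bit r set <=> FP reg r non-empty)
      byte    5        pad
      bytes   6..7     FOP               (left as zero)
      bytes   8..15    last FP insn ptr  (left as zero)
      bytes  16..23    last FP data ptr  (left as zero)
      bytes  24..31    MXCSR / MXCSR_MASK (written by COMPONENT_1 below)
      bytes  32..159   ST0..ST7 in stack order, 10 bytes each, padded to 16
*/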
2256 
2257 
2258 /* CALLED FROM GENERATED CODE */
2259 /* DIRTY HELPER (reads guest state, writes guest mem) */
2260 /* XSAVE component 1 is the SSE state. */
2261 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2262         ( VexGuestAMD64State* gst, HWord addr )
2263 {
2264    UShort* addrS = (UShort*)addr;
2265    UInt    mxcsr;
2266 
2267    /* The only non-register parts of the SSE state are MXCSR and
2268       MXCSR_MASK. */
2269    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2270 
2271    addrS[12] = toUShort(mxcsr);  /* MXCSR */
2272    addrS[13] = toUShort(mxcsr >> 16);
2273 
2274    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2275    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2276 }
2277 
2278 
2279 /* VISIBLE TO LIBVEX CLIENT */
2280 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2281    the result at the given address which represents a buffer of at
2282    least 416 bytes.
2283 
2284    This function is not called from generated code.  FXSAVE is dealt
2285    with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2286    functions above plus some in-line IR.  This function is merely a
2287    convenience function for VEX's users.
2288 */
2289 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2290                                 /*OUT*/HWord fp_state )
2291 {
2292    /* Do the x87 part */
2293    amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2294 
2295    /* And now the SSE part, except for the registers themselves. */
2296    amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2297 
2298    /* That's the first 160 bytes of the image done. */
2299    /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
2300       big-endian, these need to be byte-swapped. */
2301    U128 *xmm = (U128 *)(fp_state + 160);
2302    vassert(host_is_little_endian());
2303 
2304 #  define COPY_U128(_dst,_src)                       \
2305       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
2306            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
2307       while (0)
2308 
2309    COPY_U128( xmm[0],  gst->guest_YMM0 );
2310    COPY_U128( xmm[1],  gst->guest_YMM1 );
2311    COPY_U128( xmm[2],  gst->guest_YMM2 );
2312    COPY_U128( xmm[3],  gst->guest_YMM3 );
2313    COPY_U128( xmm[4],  gst->guest_YMM4 );
2314    COPY_U128( xmm[5],  gst->guest_YMM5 );
2315    COPY_U128( xmm[6],  gst->guest_YMM6 );
2316    COPY_U128( xmm[7],  gst->guest_YMM7 );
2317    COPY_U128( xmm[8],  gst->guest_YMM8 );
2318    COPY_U128( xmm[9],  gst->guest_YMM9 );
2319    COPY_U128( xmm[10], gst->guest_YMM10 );
2320    COPY_U128( xmm[11], gst->guest_YMM11 );
2321    COPY_U128( xmm[12], gst->guest_YMM12 );
2322    COPY_U128( xmm[13], gst->guest_YMM13 );
2323    COPY_U128( xmm[14], gst->guest_YMM14 );
2324    COPY_U128( xmm[15], gst->guest_YMM15 );
2325 #  undef COPY_U128
2326 }
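/* Sketch of client-side use (illustrative only; 'gst' stands for the
   caller's VexGuestAMD64State, and the 512-byte 16-aligned buffer size
   follows the architectural FXSAVE area definition rather than anything
   enforced here):

      UChar fpimage[512] __attribute__((aligned(16)));
      LibVEX_GuestAMD64_fxsave(&gst, (HWord)&fpimage[0]);
*/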
2327 
2328 
2329 /*---------------------------------------------------------------*/
2330 /*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
2331 /*---------------------------------------------------------------*/
2332 
2333 /* CALLED FROM GENERATED CODE */
2334 /* DIRTY HELPER (writes guest state, reads guest mem) */
2335 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2336              ( VexGuestAMD64State* gst, HWord addr )
2337 {
2338    Fpu_State tmp;
2339    UShort*   addrS   = (UShort*)addr;
2340    UChar*    addrC   = (UChar*)addr;
2341    UShort    fp_tags;
2342    Int       r, stno, i;
2343 
2344    /* Copy the x87 registers out of the image, into a temporary
2345       Fpu_State struct. */
2346    for (i = 0; i < 14; i++) tmp.env[i] = 0;
2347    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2348    /* fill in tmp.reg[0..7] */
2349    for (stno = 0; stno < 8; stno++) {
2350       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2351       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2352       dstS[0] = srcS[0];
2353       dstS[1] = srcS[1];
2354       dstS[2] = srcS[2];
2355       dstS[3] = srcS[3];
2356       dstS[4] = srcS[4];
2357    }
2358    /* fill in tmp.env[0..13] */
2359    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2360    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2361 
2362    fp_tags = 0;
2363    for (r = 0; r < 8; r++) {
2364       if (addrC[4] & (1<<r))
2365          fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
2366       else
2367          fp_tags |= (3 << (2*r)); /* EMPTY */
2368    }
2369    tmp.env[FP_ENV_TAG] = fp_tags;
2370 
2371    /* Now write 'tmp' into the guest state. */
2372    VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2373 
2374    return warnX87;
2375 }
2376 
2377 
2378 /* CALLED FROM GENERATED CODE */
2379 /* DIRTY HELPER (writes guest state, reads guest mem) */
2380 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2381              ( VexGuestAMD64State* gst, HWord addr )
2382 {
2383    UShort* addrS = (UShort*)addr;
2384    UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
2385                    | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2386    ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
2387 
2388    VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2389 
2390    gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2391    return warnXMM;
2392 }
2393 
2394 
2395 /* VISIBLE TO LIBVEX CLIENT */
2396 /* Do FXRSTOR from the supplied address and store read values to the given
2397    VexGuestAMD64State structure.
2398 
2399    This function is not called from generated code.  FXRSTOR is dealt
2400    with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2401    functions above plus some in-line IR.  This function is merely a
2402    convenience function for VEX's users.
2403 */
2404 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2405                                       /*MOD*/VexGuestAMD64State* gst )
2406 {
2407    /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
2408       to be byte-swapped. */
2409    U128 *xmm = (U128 *)(fp_state + 160);
2410 
2411    vassert(host_is_little_endian());
2412 
2413 #  define COPY_U128(_dst,_src)                       \
2414       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
2415            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
2416       while (0)
2417 
2418    COPY_U128( gst->guest_YMM0, xmm[0] );
2419    COPY_U128( gst->guest_YMM1, xmm[1] );
2420    COPY_U128( gst->guest_YMM2, xmm[2] );
2421    COPY_U128( gst->guest_YMM3, xmm[3] );
2422    COPY_U128( gst->guest_YMM4, xmm[4] );
2423    COPY_U128( gst->guest_YMM5, xmm[5] );
2424    COPY_U128( gst->guest_YMM6, xmm[6] );
2425    COPY_U128( gst->guest_YMM7, xmm[7] );
2426    COPY_U128( gst->guest_YMM8, xmm[8] );
2427    COPY_U128( gst->guest_YMM9, xmm[9] );
2428    COPY_U128( gst->guest_YMM10, xmm[10] );
2429    COPY_U128( gst->guest_YMM11, xmm[11] );
2430    COPY_U128( gst->guest_YMM12, xmm[12] );
2431    COPY_U128( gst->guest_YMM13, xmm[13] );
2432    COPY_U128( gst->guest_YMM14, xmm[14] );
2433    COPY_U128( gst->guest_YMM15, xmm[15] );
2434 
2435 #  undef COPY_U128
2436 
2437    VexEmNote warnXMM
2438       = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2439    VexEmNote warnX87
2440       = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2441 
2442    /* Prefer an X87 emwarn over an XMM one, if both exist. */
2443    if (warnX87 != EmNote_NONE)
2444       return warnX87;
2445    else
2446       return warnXMM;
2447 }
2448 
2449 
2450 /*---------------------------------------------------------------*/
2451 /*--- Supporting functions for FSAVE/FRSTOR                   ---*/
2452 /*---------------------------------------------------------------*/
2453 
2454 /* DIRTY HELPER (writes guest state) */
2455 /* Initialise the x87 FPU state as per 'finit'. */
2456 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2457 {
2458    Int i;
2459    gst->guest_FTOP = 0;
2460    for (i = 0; i < 8; i++) {
2461       gst->guest_FPTAG[i] = 0; /* empty */
2462       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2463    }
2464    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2465    gst->guest_FC3210  = 0;
2466 }
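/* For comparison, the architectural state 'finit' establishes is
   FCW = 0x037F (all exceptions masked, round-to-nearest, 64-bit
   precision), FSW = 0 and all eight tags empty -- which is what the
   FTOP/FPTAG/FPREG/FPROUND/FC3210 assignments above encode in VEX's
   decomposed representation. */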
2467 
2468 
2469 /* CALLED FROM GENERATED CODE */
2470 /* DIRTY HELPER (reads guest memory) */
2471 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2472 {
2473    ULong f64;
2474    convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2475    return f64;
2476 }
2477 
2478 /* CALLED FROM GENERATED CODE */
2479 /* DIRTY HELPER (writes guest memory) */
2480 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2481 {
2482    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2483 }
2484 
2485 
2486 /* CALLED FROM GENERATED CODE */
2487 /* CLEAN HELPER */
2488 /* mxcsr[15:0] contains a SSE native format MXCSR value.
2489    Extract from it the required SSEROUND value and any resulting
2490    emulation warning, and return (warn << 32) | sseround value.
2491 */
2492 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2493 {
2494    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
2495    /* NOTE, encoded exactly as per enum IRRoundingMode. */
2496    ULong rmode = (mxcsr >> 13) & 3;
2497 
2498    /* Detect any required emulation warnings. */
2499    VexEmNote ew = EmNote_NONE;
2500 
2501    if ((mxcsr & 0x1F80) != 0x1F80) {
2502       /* unmasked exceptions! */
2503       ew = EmWarn_X86_sseExns;
2504    }
2505    else
2506    if (mxcsr & (1<<15)) {
2507       /* FZ is set */
2508       ew = EmWarn_X86_fz;
2509    }
2510    else
2511    if (mxcsr & (1<<6)) {
2512       /* DAZ is set */
2513       ew = EmWarn_X86_daz;
2514    }
2515 
2516    return (((ULong)ew) << 32) | ((ULong)rmode);
2517 }
2518 
2519 
2520 /* CALLED FROM GENERATED CODE */
2521 /* CLEAN HELPER */
2522 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2523    native format MXCSR value. */
2524 ULong amd64g_create_mxcsr ( ULong sseround )
2525 {
2526    sseround &= 3;
2527    return 0x1F80 | (sseround << 13);
2528 }
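/* Worked example: amd64g_create_mxcsr(Irrm_NEAREST == 0) yields 0x1F80,
   i.e. all exception bits masked and a zero RC field.  Feeding that back
   into amd64g_check_ldmxcsr gives rmode == (0x1F80 >> 13) & 3 == 0 and
   no emulation warning, so the two helpers round-trip for each of the
   four IRRoundingMode values. */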
2529 
2530 
2531 /* CLEAN HELPER */
2532 /* fpucw[15:0] contains a x87 native format FPU control word.
2533    Extract from it the required FPROUND value and any resulting
2534    emulation warning, and return (warn << 32) | fpround value.
2535 */
2536 ULong amd64g_check_fldcw ( ULong fpucw )
2537 {
2538    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
2539    /* NOTE, encoded exactly as per enum IRRoundingMode. */
2540    ULong rmode = (fpucw >> 10) & 3;
2541 
2542    /* Detect any required emulation warnings. */
2543    VexEmNote ew = EmNote_NONE;
2544 
2545    if ((fpucw & 0x3F) != 0x3F) {
2546       /* unmasked exceptions! */
2547       ew = EmWarn_X86_x87exns;
2548    }
2549    else
2550    if (((fpucw >> 8) & 3) != 3) {
2551       /* unsupported precision */
2552       ew = EmWarn_X86_x87precision;
2553    }
2554 
2555    return (((ULong)ew) << 32) | ((ULong)rmode);
2556 }
2557 
2558 
2559 /* CLEAN HELPER */
2560 /* Given fpround as an IRRoundingMode value, create a suitable x87
2561    native format FPU control word. */
2562 ULong amd64g_create_fpucw ( ULong fpround )
2563 {
2564    fpround &= 3;
2565    return 0x037F | (fpround << 10);
2566 }
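/* Worked example: amd64g_create_fpucw(Irrm_NEAREST == 0) yields 0x037F.
   Passed back through amd64g_check_fldcw, the exception mask bits (0x3F)
   are all set and the precision control field is 3 (64-bit precision),
   so no emulation warning arises and rmode == (0x037F >> 10) & 3 == 0 --
   the pair round-trips, just like the MXCSR helpers above. */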
2567 
2568 
2569 /* This is used to implement 'fldenv'.
2570    Reads 28 bytes at x87_state[0 .. 27]. */
2571 /* CALLED FROM GENERATED CODE */
2572 /* DIRTY HELPER */
2573 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2574                                       /*IN*/HWord x87_state)
2575 {
2576    return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2577 }
2578 
2579 
2580 /* CALLED FROM GENERATED CODE */
2581 /* DIRTY HELPER */
2582 /* Create an x87 FPU env from the guest state, as close as we can
2583    approximate it.  Writes 28 bytes at x87_state[0..27]. */
2584 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2585                                  /*OUT*/HWord x87_state )
2586 {
2587    Int        i, stno, preg;
2588    UInt       tagw;
2589    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2590    Fpu_State* x87     = (Fpu_State*)x87_state;
2591    UInt       ftop    = vex_state->guest_FTOP;
2592    ULong      c3210   = vex_state->guest_FC3210;
2593 
2594    for (i = 0; i < 14; i++)
2595       x87->env[i] = 0;
2596 
2597    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2598    x87->env[FP_ENV_STAT]
2599       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2600    x87->env[FP_ENV_CTRL]
2601       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2602 
2603    /* Compute the x87 tag word. */
2604    tagw = 0;
2605    for (stno = 0; stno < 8; stno++) {
2606       preg = (stno + ftop) & 7;
2607       if (vexTags[preg] == 0) {
2608          /* register is empty */
2609          tagw |= (3 << (2*preg));
2610       } else {
2611          /* register is full. */
2612          tagw |= (0 << (2*preg));
2613       }
2614    }
2615    x87->env[FP_ENV_TAG] = toUShort(tagw);
2616 
2617    /* We don't dump the x87 registers, tho. */
2618 }
2619 
2620 
2621 /* This is used to implement 'fnsave'.
2622    Writes 108 bytes at x87_state[0 .. 107]. */
2623 /* CALLED FROM GENERATED CODE */
2624 /* DIRTY HELPER */
2625 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2626                                  /*OUT*/HWord x87_state)
2627 {
2628    do_get_x87( vex_state, (Fpu_State*)x87_state );
2629 }
2630 
2631 
2632 /* This is used to implement 'fnsaves'.
2633    Writes 94 bytes at x87_state[0 .. 93]. */
2634 /* CALLED FROM GENERATED CODE */
2635 /* DIRTY HELPER */
2636 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2637                                   /*OUT*/HWord x87_state)
2638 {
2639    Int           i, stno, preg;
2640    UInt          tagw;
2641    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2642    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2643    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2644    UInt          ftop    = vex_state->guest_FTOP;
2645    UInt          c3210   = vex_state->guest_FC3210;
2646 
2647    for (i = 0; i < 7; i++)
2648       x87->env[i] = 0;
2649 
2650    x87->env[FPS_ENV_STAT]
2651       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2652    x87->env[FPS_ENV_CTRL]
2653       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2654 
2655    /* Dump the register stack in ST order. */
2656    tagw = 0;
2657    for (stno = 0; stno < 8; stno++) {
2658       preg = (stno + ftop) & 7;
2659       if (vexTags[preg] == 0) {
2660          /* register is empty */
2661          tagw |= (3 << (2*preg));
2662          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2663                                  &x87->reg[10*stno] );
2664       } else {
2665          /* register is full. */
2666          tagw |= (0 << (2*preg));
2667          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2668                                  &x87->reg[10*stno] );
2669       }
2670    }
2671    x87->env[FPS_ENV_TAG] = toUShort(tagw);
2672 }
2673 
2674 
2675 /* This is used to implement 'frstor'.
2676    Reads 108 bytes at x87_state[0 .. 107]. */
2677 /* CALLED FROM GENERATED CODE */
2678 /* DIRTY HELPER */
2679 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2680                                       /*IN*/HWord x87_state)
2681 {
2682    return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2683 }
2684 
2685 
2686 /* This is used to implement 'frstors'.
2687    Reads 94 bytes at x87_state[0 .. 93]. */
2688 /* CALLED FROM GENERATED CODE */
2689 /* DIRTY HELPER */
2690 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2691                                        /*IN*/HWord x87_state)
2692 {
2693    Int           stno, preg;
2694    UInt          tag;
2695    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2696    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2697    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2698    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2699    UInt          tagw    = x87->env[FPS_ENV_TAG];
2700    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
2701    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
2702    VexEmNote     ew;
2703    UInt          fpround;
2704    ULong         pair;
2705 
2706    /* Copy registers and tags */
2707    for (stno = 0; stno < 8; stno++) {
2708       preg = (stno + ftop) & 7;
2709       tag = (tagw >> (2*preg)) & 3;
2710       if (tag == 3) {
2711          /* register is empty */
2712          /* hmm, if it's empty, does it still get written?  Probably
2713             safer to say it does.  If we don't, memcheck could get out
2714             of sync, in that it thinks all FP registers are defined by
2715             this helper, but in reality some have not been updated. */
2716          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2717          vexTags[preg] = 0;
2718       } else {
2719          /* register is non-empty */
2720          convert_f80le_to_f64le( &x87->reg[10*stno],
2721                                  (UChar*)&vexRegs[preg] );
2722          vexTags[preg] = 1;
2723       }
2724    }
2725 
2726    /* stack pointer */
2727    vex_state->guest_FTOP = ftop;
2728 
2729    /* status word */
2730    vex_state->guest_FC3210 = c3210;
2731 
2732    /* handle the control word, setting FPROUND and detecting any
2733       emulation warnings. */
2734    pair    = amd64g_check_fldcw ( (ULong)fpucw );
2735    fpround = (UInt)pair & 0xFFFFFFFFULL;
2736    ew      = (VexEmNote)(pair >> 32);
2737 
2738    vex_state->guest_FPROUND = fpround & 3;
2739 
2740    /* emulation warnings --> caller */
2741    return ew;
2742 }
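
/* Illustrative sketch: splitting a 16-bit x87 status word into the
   FTOP and C3210 pieces exactly as the loads above do (FTOP lives in
   bits 13..11, the condition codes under mask 0x4700).  Hypothetical
   standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
   uint16_t stat  = 0x4F00;              /* hypothetical status word */
   uint32_t ftop  = (stat >> 11) & 7;    /* top-of-stack pointer: 1 */
   uint32_t c3210 = stat & 0x4700;       /* condition bits: 0x4700 */
   printf("ftop=%u c3210=0x%04x\n", (unsigned)ftop, (unsigned)c3210);
   return 0;
}
#endif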
2743 
2744 
2745 /*---------------------------------------------------------------*/
2746 /*--- CPUID helpers.                                          ---*/
2747 /*---------------------------------------------------------------*/
2748 
2749 /* Claim to be the following CPU, which is probably representative of
2750    the lowliest (earliest) amd64 offerings.  It can do neither sse3
2751    nor cx16.
2752 
2753    vendor_id       : AuthenticAMD
2754    cpu family      : 15
2755    model           : 5
2756    model name      : AMD Opteron (tm) Processor 848
2757    stepping        : 10
2758    cpu MHz         : 1797.682
2759    cache size      : 1024 KB
2760    fpu             : yes
2761    fpu_exception   : yes
2762    cpuid level     : 1
2763    wp              : yes
2764    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2765                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
2766                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2767    bogomips        : 3600.62
2768    TLB size        : 1088 4K pages
2769    clflush size    : 64
2770    cache_alignment : 64
2771    address sizes   : 40 bits physical, 48 bits virtual
2772    power management: ts fid vid ttp
2773 
2774    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2775    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
2776    and 3dnowext is 80000001.EDX.30.
2777 */
2778 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2779 {
2780 #  define SET_ABCD(_a,_b,_c,_d)                \
2781       do { st->guest_RAX = (ULong)(_a);        \
2782            st->guest_RBX = (ULong)(_b);        \
2783            st->guest_RCX = (ULong)(_c);        \
2784            st->guest_RDX = (ULong)(_d);        \
2785       } while (0)
2786 
2787    switch (0xFFFFFFFF & st->guest_RAX) {
2788       case 0x00000000:
2789          SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2790          break;
2791       case 0x00000001:
2792          SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2793          break;
2794       case 0x80000000:
2795          SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2796          break;
2797       case 0x80000001:
2798          /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
2799             the original it-is-supported value that the h/w provides.
2800             See #291568. */
2801          SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2802                                                       0x21d3fbff);
2803          break;
2804       case 0x80000002:
2805          SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2806          break;
2807       case 0x80000003:
2808          SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2809          break;
2810       case 0x80000004:
2811          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2812          break;
2813       case 0x80000005:
2814          SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2815          break;
2816       case 0x80000006:
2817          SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2818          break;
2819       case 0x80000007:
2820          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2821          break;
2822       case 0x80000008:
2823          SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2824          break;
2825       default:
2826          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2827          break;
2828    }
2829 #  undef SET_ABCD
2830 }
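
/* Illustrative sketch of how a guest decodes the leaf-0 values
   returned above: the vendor string is formed from the little-endian
   bytes of EBX, EDX and ECX in that order, so 0x68747541, 0x69746e65
   and 0x444d4163 spell "AuthenticAMD".  Hypothetical standalone
   example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main ( void )
{
   uint32_t ebx = 0x68747541, edx = 0x69746e65, ecx = 0x444d4163;
   char vendor[13];
   memcpy(vendor + 0, &ebx, 4);
   memcpy(vendor + 4, &edx, 4);
   memcpy(vendor + 8, &ecx, 4);
   vendor[12] = 0;
   printf("%s\n", vendor);  /* "AuthenticAMD" on a little-endian host */
   return 0;
}
#endif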
2831 
2832 
2833 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2834    capable.
2835 
2836    vendor_id       : GenuineIntel
2837    cpu family      : 6
2838    model           : 15
2839    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2840    stepping        : 6
2841    cpu MHz         : 2394.000
2842    cache size      : 4096 KB
2843    physical id     : 0
2844    siblings        : 2
2845    core id         : 0
2846    cpu cores       : 2
2847    fpu             : yes
2848    fpu_exception   : yes
2849    cpuid level     : 10
2850    wp              : yes
2851    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2852                      mtrr pge mca cmov pat pse36 clflush dts acpi
2853                      mmx fxsr sse sse2 ss ht tm syscall nx lm
2854                      constant_tsc pni monitor ds_cpl vmx est tm2
2855                      cx16 xtpr lahf_lm
2856    bogomips        : 4798.78
2857    clflush size    : 64
2858    cache_alignment : 64
2859    address sizes   : 36 bits physical, 48 bits virtual
2860    power management:
2861 */
2862 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2863 {
2864 #  define SET_ABCD(_a,_b,_c,_d)                \
2865       do { st->guest_RAX = (ULong)(_a);        \
2866            st->guest_RBX = (ULong)(_b);        \
2867            st->guest_RCX = (ULong)(_c);        \
2868            st->guest_RDX = (ULong)(_d);        \
2869       } while (0)
2870 
2871    switch (0xFFFFFFFF & st->guest_RAX) {
2872       case 0x00000000:
2873          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2874          break;
2875       case 0x00000001:
2876          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2877          break;
2878       case 0x00000002:
2879          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2880          break;
2881       case 0x00000003:
2882          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2883          break;
2884       case 0x00000004: {
2885          switch (0xFFFFFFFF & st->guest_RCX) {
2886             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2887                                       0x0000003f, 0x00000001); break;
2888             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2889                                       0x0000003f, 0x00000001); break;
2890             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2891                                       0x00000fff, 0x00000001); break;
2892             default:         SET_ABCD(0x00000000, 0x00000000,
2893                                       0x00000000, 0x00000000); break;
2894          }
2895          break;
2896       }
2897       case 0x00000005:
2898          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2899          break;
2900       case 0x00000006:
2901          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2902          break;
2903       case 0x00000007:
2904          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2905          break;
2906       case 0x00000008:
2907          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2908          break;
2909       case 0x00000009:
2910          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2911          break;
2912       case 0x0000000a:
2913       unhandled_eax_value:
2914          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2915          break;
2916       case 0x80000000:
2917          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2918          break;
2919       case 0x80000001:
2920          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2921          break;
2922       case 0x80000002:
2923          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2924          break;
2925       case 0x80000003:
2926          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2927          break;
2928       case 0x80000004:
2929          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2930          break;
2931       case 0x80000005:
2932          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2933          break;
2934       case 0x80000006:
2935          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2936          break;
2937       case 0x80000007:
2938          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2939          break;
2940       case 0x80000008:
2941          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2942          break;
2943       default:
2944          goto unhandled_eax_value;
2945    }
2946 #  undef SET_ABCD
2947 }
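
/* Illustrative sketch: the two capabilities this personality is named
   after are reported in leaf 1 ECX -- bit 0 (pni/sse3) and bit 13
   (cx16) -- and the value 0x0000e3bd returned above has both set.
   Hypothetical standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
   uint32_t leaf1_ecx = 0x0000e3bdu;          /* value claimed above */
   printf("sse3=%d cx16=%d\n",
          (int)((leaf1_ecx >> 0)  & 1),       /* 1 */
          (int)((leaf1_ecx >> 13) & 1));      /* 1 */
   return 0;
}
#endif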
2948 
2949 
2950 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2951    capable.
2952 
2953    vendor_id       : GenuineIntel
2954    cpu family      : 6
2955    model           : 37
2956    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2957    stepping        : 2
2958    cpu MHz         : 3334.000
2959    cache size      : 4096 KB
2960    physical id     : 0
2961    siblings        : 4
2962    core id         : 0
2963    cpu cores       : 2
2964    apicid          : 0
2965    initial apicid  : 0
2966    fpu             : yes
2967    fpu_exception   : yes
2968    cpuid level     : 11
2969    wp              : yes
2970    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2971                      mtrr pge mca cmov pat pse36 clflush dts acpi
2972                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2973                      lm constant_tsc arch_perfmon pebs bts rep_good
2974                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
2975                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2976                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2977                      arat tpr_shadow vnmi flexpriority ept vpid
2978    bogomips        : 6957.57
2979    clflush size    : 64
2980    cache_alignment : 64
2981    address sizes   : 36 bits physical, 48 bits virtual
2982    power management:
2983 */
2984 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2985 {
2986 #  define SET_ABCD(_a,_b,_c,_d)                \
2987       do { st->guest_RAX = (ULong)(_a);        \
2988            st->guest_RBX = (ULong)(_b);        \
2989            st->guest_RCX = (ULong)(_c);        \
2990            st->guest_RDX = (ULong)(_d);        \
2991       } while (0)
2992 
2993    UInt old_eax = (UInt)st->guest_RAX;
2994    UInt old_ecx = (UInt)st->guest_RCX;
2995 
2996    switch (old_eax) {
2997       case 0x00000000:
2998          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2999          break;
3000       case 0x00000001:
3001          SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
3002          break;
3003       case 0x00000002:
3004          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
3005          break;
3006       case 0x00000003:
3007          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3008          break;
3009       case 0x00000004:
3010          switch (old_ecx) {
3011             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3012                                       0x0000003f, 0x00000000); break;
3013             case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
3014                                       0x0000007f, 0x00000000); break;
3015             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3016                                       0x000001ff, 0x00000000); break;
3017             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3018                                       0x00000fff, 0x00000002); break;
3019             default:         SET_ABCD(0x00000000, 0x00000000,
3020                                       0x00000000, 0x00000000); break;
3021          }
3022          break;
3023       case 0x00000005:
3024          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3025          break;
3026       case 0x00000006:
3027          SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
3028          break;
3029       case 0x00000007:
3030          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3031          break;
3032       case 0x00000008:
3033          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3034          break;
3035       case 0x00000009:
3036          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3037          break;
3038       case 0x0000000a:
3039          SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
3040          break;
3041       case 0x0000000b:
3042          switch (old_ecx) {
3043             case 0x00000000:
3044                SET_ABCD(0x00000001, 0x00000002,
3045                         0x00000100, 0x00000000); break;
3046             case 0x00000001:
3047                SET_ABCD(0x00000004, 0x00000004,
3048                         0x00000201, 0x00000000); break;
3049             default:
3050                SET_ABCD(0x00000000, 0x00000000,
3051                         old_ecx,    0x00000000); break;
3052          }
3053          break;
3054       case 0x0000000c:
3055          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3056          break;
3057       case 0x0000000d:
3058          switch (old_ecx) {
3059             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3060                                       0x00000100, 0x00000000); break;
3061             case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3062                                       0x00000201, 0x00000000); break;
3063             default:         SET_ABCD(0x00000000, 0x00000000,
3064                                       old_ecx,    0x00000000); break;
3065          }
3066          break;
3067       case 0x80000000:
3068          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3069          break;
3070       case 0x80000001:
3071          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3072          break;
3073       case 0x80000002:
3074          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3075          break;
3076       case 0x80000003:
3077          SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3078          break;
3079       case 0x80000004:
3080          SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3081          break;
3082       case 0x80000005:
3083          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3084          break;
3085       case 0x80000006:
3086          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3087          break;
3088       case 0x80000007:
3089          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3090          break;
3091       case 0x80000008:
3092          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3093          break;
3094       default:
3095          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3096          break;
3097    }
3098 #  undef SET_ABCD
3099 }
3100 
3101 
3102 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3103    capable.  Plus (kludge!) it "supports" HTM.
3104 
3105    Also with the following change: claim that XSaveOpt is not
3106    available, by having cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1
3107    on the real CPU.  Consequently, programs that correctly observe
3108    these CPUID values should only try to use 3 of the 8 XSave-family
3109    instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
3110    having to implement the compacted or optimised save/restore
3111    variants.
3112 
3113    vendor_id       : GenuineIntel
3114    cpu family      : 6
3115    model           : 42
3116    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3117    stepping        : 7
3118    cpu MHz         : 1600.000
3119    cache size      : 6144 KB
3120    physical id     : 0
3121    siblings        : 4
3122    core id         : 3
3123    cpu cores       : 4
3124    apicid          : 6
3125    initial apicid  : 6
3126    fpu             : yes
3127    fpu_exception   : yes
3128    cpuid level     : 13
3129    wp              : yes
3130    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
3131                      mtrr pge mca cmov pat pse36 clflush dts acpi
3132                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3133                      lm constant_tsc arch_perfmon pebs bts rep_good
3134                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3135                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3136                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3137                      lahf_lm ida arat epb xsaveopt pln pts dts
3138                      tpr_shadow vnmi flexpriority ept vpid
3139 
3140    bogomips        : 5768.94
3141    clflush size    : 64
3142    cache_alignment : 64
3143    address sizes   : 36 bits physical, 48 bits virtual
3144    power management:
3145 */
3146 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
3147 {
3148 #  define SET_ABCD(_a,_b,_c,_d)                \
3149       do { st->guest_RAX = (ULong)(_a);        \
3150            st->guest_RBX = (ULong)(_b);        \
3151            st->guest_RCX = (ULong)(_c);        \
3152            st->guest_RDX = (ULong)(_d);        \
3153       } while (0)
3154 
3155    UInt old_eax = (UInt)st->guest_RAX;
3156    UInt old_ecx = (UInt)st->guest_RCX;
3157 
3158    switch (old_eax) {
3159       case 0x00000000:
3160          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3161          break;
3162       case 0x00000001:
3163          SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3164          break;
3165       case 0x00000002:
3166          SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3167          break;
3168       case 0x00000003:
3169          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3170          break;
3171       case 0x00000004:
3172          switch (old_ecx) {
3173             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3174                                       0x0000003f, 0x00000000); break;
3175             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3176                                       0x0000003f, 0x00000000); break;
3177             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3178                                       0x000001ff, 0x00000000); break;
3179             case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3180                                       0x00001fff, 0x00000006); break;
3181             default:         SET_ABCD(0x00000000, 0x00000000,
3182                                       0x00000000, 0x00000000); break;
3183          }
3184          break;
3185       case 0x00000005:
3186          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3187          break;
3188       case 0x00000006:
3189          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3190          break;
3191       case 0x00000007:
3192          SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3193          break;
3194       case 0x00000008:
3195          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3196          break;
3197       case 0x00000009:
3198          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3199          break;
3200       case 0x0000000a:
3201          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3202          break;
3203       case 0x0000000b:
3204          switch (old_ecx) {
3205             case 0x00000000:
3206                SET_ABCD(0x00000001, 0x00000001,
3207                         0x00000100, 0x00000000); break;
3208             case 0x00000001:
3209                SET_ABCD(0x00000004, 0x00000004,
3210                         0x00000201, 0x00000000); break;
3211             default:
3212                SET_ABCD(0x00000000, 0x00000000,
3213                         old_ecx,    0x00000000); break;
3214          }
3215          break;
3216       case 0x0000000c:
3217          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3218          break;
3219       case 0x0000000d:
3220          switch (old_ecx) {
3221             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3222                                       0x00000340, 0x00000000); break;
3223             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3224                                       0x00000000, 0x00000000); break;
3225             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3226                                       0x00000000, 0x00000000); break;
3227             default:         SET_ABCD(0x00000000, 0x00000000,
3228                                       0x00000000, 0x00000000); break;
3229          }
3230          break;
3231       case 0x0000000e:
3232          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3233          break;
3234       case 0x0000000f:
3235          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3236          break;
3237       case 0x80000000:
3238          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3239          break;
3240       case 0x80000001:
3241          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3242          break;
3243       case 0x80000002:
3244          SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3245          break;
3246       case 0x80000003:
3247          SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3248          break;
3249       case 0x80000004:
3250          SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3251          break;
3252       case 0x80000005:
3253          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3254          break;
3255       case 0x80000006:
3256          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3257          break;
3258       case 0x80000007:
3259          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3260          break;
3261       case 0x80000008:
3262          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3263          break;
3264       default:
3265          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3266          break;
3267    }
3268 #  undef SET_ABCD
3269 }
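
/* Illustrative sketch of the probe described in the comment above: a
   guest that wants XSaveOpt must check CPUID leaf 0xD, sub-leaf 1,
   EAX bit 0.  With the values returned above that bit is 0, so a
   well-behaved guest falls back to plain XSAVE/XRSTOR.  Hypothetical
   standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
#  if defined(__x86_64__)
   uint32_t a, b, c, d;
   __asm__ __volatile__("cpuid"
                        : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                        : "a" (0xD), "c" (1));
   if (a & 1)
      printf("XSAVEOPT available\n");
   else
      printf("XSAVEOPT not available; use plain XSAVE/XRSTOR\n");
#  endif
   return 0;
}
#endif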
3270 
3271 
3272 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3273 
3274    With the following change: claim that XSaveOpt is not available, by
3275    having cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1 on the real
3276    CPU.  Consequently, programs that correctly observe these CPUID
3277    values should only try to use 3 of the 8 XSave-family instructions:
3278    XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
3279    implement the compacted or optimised save/restore variants.
3280 
3281    vendor_id       : GenuineIntel
3282    cpu family      : 6
3283    model           : 60
3284    model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3285    stepping        : 3
3286    microcode       : 0x1c
3287    cpu MHz         : 919.957
3288    cache size      : 8192 KB
3289    physical id     : 0
3290    siblings        : 4
3291    core id         : 3
3292    cpu cores       : 4
3293    apicid          : 6
3294    initial apicid  : 6
3295    fpu             : yes
3296    fpu_exception   : yes
3297    cpuid level     : 13
3298    wp              : yes
3299    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3300                      cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3301                      tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3302                      arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3303                      aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3304                      vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3305                      sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3306                      avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3307                      tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3308                      bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3309    bugs            :
3310    bogomips        : 5786.68
3311    clflush size    : 64
3312    cache_alignment : 64
3313    address sizes   : 39 bits physical, 48 bits virtual
3314    power management:
3315 */
3316 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
3317 {
3318 #  define SET_ABCD(_a,_b,_c,_d)                \
3319       do { st->guest_RAX = (ULong)(_a);        \
3320            st->guest_RBX = (ULong)(_b);        \
3321            st->guest_RCX = (ULong)(_c);        \
3322            st->guest_RDX = (ULong)(_d);        \
3323       } while (0)
3324 
3325    UInt old_eax = (UInt)st->guest_RAX;
3326    UInt old_ecx = (UInt)st->guest_RCX;
3327 
3328    switch (old_eax) {
3329       case 0x00000000:
3330          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3331          break;
3332       case 0x00000001:
3333          /* Don't advertise RDRAND support, bit 30 in ECX.  */
3334          SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3335          break;
3336       case 0x00000002:
3337          SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3338          break;
3339       case 0x00000003:
3340          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3341          break;
3342       case 0x00000004:
3343          switch (old_ecx) {
3344             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3345                                       0x0000003f, 0x00000000); break;
3346             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3347                                       0x0000003f, 0x00000000); break;
3348             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3349                                       0x000001ff, 0x00000000); break;
3350             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3351                                       0x00001fff, 0x00000006); break;
3352             default:         SET_ABCD(0x00000000, 0x00000000,
3353                                       0x00000000, 0x00000000); break;
3354          }
3355          break;
3356       case 0x00000005:
3357          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3358          break;
3359       case 0x00000006:
3360          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3361          break;
3362       case 0x00000007:
3363          switch (old_ecx) {
3364             case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3365                                       0x00000000, 0x00000000); break;
3366             default:         SET_ABCD(0x00000000, 0x00000000,
3367                                       0x00000000, 0x00000000); break;
3368          }
3369          break;
3370       case 0x00000008:
3371          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3372          break;
3373       case 0x00000009:
3374          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3375          break;
3376       case 0x0000000a:
3377          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3378          break;
3379       case 0x0000000b:
3380          switch (old_ecx) {
3381             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3382                                       0x00000100, 0x00000002); break;
3383             case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3384                                       0x00000201, 0x00000002); break;
3385             default:         SET_ABCD(0x00000000, 0x00000000,
3386                                       old_ecx,    0x00000002); break;
3387          }
3388          break;
3389       case 0x0000000c:
3390          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3391          break;
3392       case 0x0000000d:
3393          switch (old_ecx) {
3394             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3395                                       0x00000340, 0x00000000); break;
3396             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3397                                       0x00000000, 0x00000000); break;
3398             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3399                                       0x00000000, 0x00000000); break;
3400             default:         SET_ABCD(0x00000000, 0x00000000,
3401                                       0x00000000, 0x00000000); break;
3402          }
3403          break;
3404       case 0x80000000:
3405          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3406          break;
3407       case 0x80000001:
3408          SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3409          break;
3410       case 0x80000002:
3411          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3412          break;
3413       case 0x80000003:
3414          SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3415          break;
3416       case 0x80000004:
3417          SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3418          break;
3419       case 0x80000005:
3420          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3421          break;
3422       case 0x80000006:
3423          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3424          break;
3425       case 0x80000007:
3426          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3427          break;
3428       case 0x80000008:
3429          SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3430          break;
3431       default:
3432          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3433          break;
3434    }
3435 #  undef SET_ABCD
3436 }
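
/* Illustrative sketch: guests detect AVX2 from CPUID leaf 7
   (sub-leaf 0) EBX bit 5; bits 3 and 8 are BMI1 and BMI2.  The value
   0x000027ab returned above has all three set.  Hypothetical
   standalone example, kept out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <stdio.h>

int main ( void )
{
   uint32_t leaf7_ebx = 0x000027abu;          /* value claimed above */
   printf("bmi1=%d avx2=%d bmi2=%d\n",
          (int)((leaf7_ebx >> 3) & 1),
          (int)((leaf7_ebx >> 5) & 1),
          (int)((leaf7_ebx >> 8) & 1));
   return 0;
}
#endif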
3437 
3438 
3439 /*---------------------------------------------------------------*/
3440 /*--- Misc integer helpers, including rotates and crypto.     ---*/
3441 /*---------------------------------------------------------------*/
3442 
3443 ULong amd64g_calculate_RCR ( ULong arg,
3444                              ULong rot_amt,
3445                              ULong rflags_in,
3446                              Long  szIN )
3447 {
3448    Bool  wantRflags = toBool(szIN < 0);
3449    ULong sz         = wantRflags ? (-szIN) : szIN;
3450    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3451    ULong cf=0, of=0, tempcf;
3452 
3453    switch (sz) {
3454       case 8:
3455          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3456          of        = ((arg >> 63) ^ cf) & 1;
3457          while (tempCOUNT > 0) {
3458             tempcf = arg & 1;
3459             arg    = (arg >> 1) | (cf << 63);
3460             cf     = tempcf;
3461             tempCOUNT--;
3462          }
3463          break;
3464       case 4:
3465          while (tempCOUNT >= 33) tempCOUNT -= 33;
3466          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3467          of        = ((arg >> 31) ^ cf) & 1;
3468          while (tempCOUNT > 0) {
3469             tempcf = arg & 1;
3470             arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3471             cf     = tempcf;
3472             tempCOUNT--;
3473          }
3474          break;
3475       case 2:
3476          while (tempCOUNT >= 17) tempCOUNT -= 17;
3477          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3478          of        = ((arg >> 15) ^ cf) & 1;
3479          while (tempCOUNT > 0) {
3480             tempcf = arg & 1;
3481             arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3482             cf     = tempcf;
3483             tempCOUNT--;
3484          }
3485          break;
3486       case 1:
3487          while (tempCOUNT >= 9) tempCOUNT -= 9;
3488          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3489          of        = ((arg >> 7) ^ cf) & 1;
3490          while (tempCOUNT > 0) {
3491             tempcf = arg & 1;
3492             arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
3493             cf     = tempcf;
3494             tempCOUNT--;
3495          }
3496          break;
3497       default:
3498          vpanic("calculate_RCR(amd64g): invalid size");
3499    }
3500 
3501    cf &= 1;
3502    of &= 1;
3503    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3504    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3505 
3506    /* caller can ask to have back either the resulting flags or
3507       resulting value, but not both */
3508    return wantRflags ? rflags_in : arg;
3509 }
3510 
3511 ULong amd64g_calculate_RCL ( ULong arg,
3512                              ULong rot_amt,
3513                              ULong rflags_in,
3514                              Long  szIN )
3515 {
3516    Bool  wantRflags = toBool(szIN < 0);
3517    ULong sz         = wantRflags ? (-szIN) : szIN;
3518    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3519    ULong cf=0, of=0, tempcf;
3520 
3521    switch (sz) {
3522       case 8:
3523          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3524          while (tempCOUNT > 0) {
3525             tempcf = (arg >> 63) & 1;
3526             arg    = (arg << 1) | (cf & 1);
3527             cf     = tempcf;
3528             tempCOUNT--;
3529          }
3530          of = ((arg >> 63) ^ cf) & 1;
3531          break;
3532       case 4:
3533          while (tempCOUNT >= 33) tempCOUNT -= 33;
3534          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3535          while (tempCOUNT > 0) {
3536             tempcf = (arg >> 31) & 1;
3537             arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3538             cf     = tempcf;
3539             tempCOUNT--;
3540          }
3541          of = ((arg >> 31) ^ cf) & 1;
3542          break;
3543       case 2:
3544          while (tempCOUNT >= 17) tempCOUNT -= 17;
3545          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3546          while (tempCOUNT > 0) {
3547             tempcf = (arg >> 15) & 1;
3548             arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
3549             cf     = tempcf;
3550             tempCOUNT--;
3551          }
3552          of = ((arg >> 15) ^ cf) & 1;
3553          break;
3554       case 1:
3555          while (tempCOUNT >= 9) tempCOUNT -= 9;
3556          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3557          while (tempCOUNT > 0) {
3558             tempcf = (arg >> 7) & 1;
3559             arg    = 0xFFULL & ((arg << 1) | (cf & 1));
3560             cf     = tempcf;
3561             tempCOUNT--;
3562          }
3563          of = ((arg >> 7) ^ cf) & 1;
3564          break;
3565       default:
3566          vpanic("calculate_RCL(amd64g): invalid size");
3567    }
3568 
3569    cf &= 1;
3570    of &= 1;
3571    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3572    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3573 
3574    return wantRflags ? rflags_in : arg;
3575 }
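
/* Illustrative sketch of the calling convention shared by
   amd64g_calculate_RCL/RCR above: a positive szIN asks for the
   rotated value, a negative szIN asks for the updated rflags, so a
   caller wanting both makes two calls.  For example, an 8-bit RCL of
   0x81 by 1 with CF clear yields value 0x02 with CF=1 and OF=1.
   Hypothetical usage only, kept out of the build with #if 0; this is
   not how toIR.c is actually structured. */
#if 0
static void demo_rcl_usage ( void )
{
   ULong rflags_in = 0;  /* carry flag clear on entry */
   ULong value  = amd64g_calculate_RCL(0x81, 1, rflags_in,  1); /* value  */
   ULong rflags = amd64g_calculate_RCL(0x81, 1, rflags_in, -1); /* rflags */
   vassert(value == 0x02);
   vassert((rflags & AMD64G_CC_MASK_C) != 0);
   vassert((rflags & AMD64G_CC_MASK_O) != 0);
}
#endif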
3576 
3577 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3578  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3579  */
3580 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3581 {
3582     ULong hi, lo, tmp, A[16];
3583 
3584    A[0] = 0;            A[1] = a;
3585    A[2] = A[1] << 1;    A[3] = A[2] ^ a;
3586    A[4] = A[2] << 1;    A[5] = A[4] ^ a;
3587    A[6] = A[3] << 1;    A[7] = A[6] ^ a;
3588    A[8] = A[4] << 1;    A[9] = A[8] ^ a;
3589    A[10] = A[5] << 1;   A[11] = A[10] ^ a;
3590    A[12] = A[6] << 1;   A[13] = A[12] ^ a;
3591    A[14] = A[7] << 1;   A[15] = A[14] ^ a;
3592 
3593    lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3594    hi = lo >> 56;
3595    lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3596    hi = (hi << 8) | (lo >> 56);
3597    lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3598    hi = (hi << 8) | (lo >> 56);
3599    lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3600    hi = (hi << 8) | (lo >> 56);
3601    lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3602    hi = (hi << 8) | (lo >> 56);
3603    lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3604    hi = (hi << 8) | (lo >> 56);
3605    lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3606    hi = (hi << 8) | (lo >> 56);
3607    lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3608 
3609    ULong m0 = -1;
3610    m0 /= 255;
3611    tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3612    tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3613    tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3614    tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3615    tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3616    tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3617    tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3618 
3619    return which ? hi : lo;
3620 }
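
/* Illustrative reference for the table-driven code above: carry-less
   multiplication is polynomial multiplication over GF(2), so partial
   products are combined with XOR instead of ADD.  For example
   0b11 * 0b11 = 0b101, since (x+1)*(x+1) = x^2 + 1 over GF(2).  A
   minimal bit-at-a-time sketch (not the algorithm used above), kept
   out of the build with #if 0. */
#if 0
#include <stdint.h>
#include <assert.h>

/* Low 64 bits of the 128-bit carry-less product of a and b. */
static uint64_t clmul_lo_ref ( uint64_t a, uint64_t b )
{
   uint64_t acc = 0;
   for (int i = 0; i < 64; i++)
      if ((b >> i) & 1)
         acc ^= a << i;
   return acc;
}

int main ( void )
{
   assert(clmul_lo_ref(3, 3) == 5);   /* (x+1)^2 = x^2 + 1 */
   return 0;
}
#endif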
3621 
3622 
3623 /* CALLED FROM GENERATED CODE */
3624 /* DIRTY HELPER (non-referentially-transparent) */
3625 /* Horrible hack.  On non-amd64 platforms, return 1. */
3626 ULong amd64g_dirtyhelper_RDTSC ( void )
3627 {
3628 #  if defined(__x86_64__)
3629    UInt  eax, edx;
3630    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3631    return (((ULong)edx) << 32) | ((ULong)eax);
3632 #  else
3633    return 1ULL;
3634 #  endif
3635 }
3636 
3637 /* CALLED FROM GENERATED CODE */
3638 /* DIRTY HELPER (non-referentially-transparent) */
3639 /* Horrible hack.  On non-amd64 platforms, return 1. */
3640 /* This uses a different calling convention from _RDTSC just above
3641    only because of the difficulty of returning 96 bits from a C
3642    function -- RDTSC returns 64 bits and so is simple by comparison,
3643    on amd64. */
3644 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3645 {
3646 #  if defined(__x86_64__)
3647    UInt eax, ecx, edx;
3648    __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3649    st->guest_RAX = (ULong)eax;
3650    st->guest_RCX = (ULong)ecx;
3651    st->guest_RDX = (ULong)edx;
3652 #  else
3653    /* Do nothing. */
3654 #  endif
3655 }
3656 
3657 /* CALLED FROM GENERATED CODE */
3658 /* DIRTY HELPER (non-referentially-transparent) */
3659 /* Horrible hack.  On non-amd64 platforms, return 0. */
3660 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3661 {
3662 #  if defined(__x86_64__)
3663    ULong r = 0;
3664    portno &= 0xFFFF;
3665    switch (sz) {
3666       case 4:
3667          __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3668                               : "=a" (r) : "Nd" (portno));
3669          break;
3670       case 2:
3671          __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3672                               : "=a" (r) : "Nd" (portno));
3673          break;
3674       case 1:
3675          __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3676                               : "=a" (r) : "Nd" (portno));
3677          break;
3678       default:
3679          break; /* note: no 64-bit version of insn exists */
3680    }
3681    return r;
3682 #  else
3683    return 0;
3684 #  endif
3685 }
3686 
3687 
3688 /* CALLED FROM GENERATED CODE */
3689 /* DIRTY HELPER (non-referentially-transparent) */
3690 /* Horrible hack.  On non-amd64 platforms, do nothing. */
3691 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3692 {
3693 #  if defined(__x86_64__)
3694    portno &= 0xFFFF;
3695    switch (sz) {
3696       case 4:
3697          __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3698                               : : "a" (data), "Nd" (portno));
3699          break;
3700       case 2:
3701          __asm__ __volatile__("outw %w0, %w1"
3702                               : : "a" (data), "Nd" (portno));
3703          break;
3704       case 1:
3705          __asm__ __volatile__("outb %b0, %w1"
3706                               : : "a" (data), "Nd" (portno));
3707          break;
3708       default:
3709          break; /* note: no 64-bit version of insn exists */
3710    }
3711 #  else
3712    /* do nothing */
3713 #  endif
3714 }
3715 
3716 /* CALLED FROM GENERATED CODE */
3717 /* DIRTY HELPER (non-referentially-transparent) */
3718 /* Horrible hack.  On non-amd64 platforms, just zero out the result. */
3719 /* op = 0: call the native SGDT instruction.
3720    op = 1: call the native SIDT instruction.
3721 */
3722 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3723 #  if defined(__x86_64__)
3724    switch (op) {
3725       case 0:
3726          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3727          break;
3728       case 1:
3729          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3730          break;
3731       default:
3732          vpanic("amd64g_dirtyhelper_SxDT");
3733    }
3734 #  else
3735    /* Can't execute the insn here; just zero the 10-byte result area. */
3736    UChar* p = (UChar*)address;
3737    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3738    p[6] = p[7] = p[8] = p[9] = 0;
3739 #  endif
3740 }
3741 
3742 /*---------------------------------------------------------------*/
3743 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
3744 /*---------------------------------------------------------------*/
3745 
3746 static inline UChar abdU8 ( UChar xx, UChar yy ) {
3747    return toUChar(xx>yy ? xx-yy : yy-xx);
3748 }
3749 
3750 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3751    return (((ULong)w1) << 32) | ((ULong)w0);
3752 }
3753 
3754 static inline UShort sel16x4_3 ( ULong w64 ) {
3755    UInt hi32 = toUInt(w64 >> 32);
3756    return toUShort(hi32 >> 16);
3757 }
3758 static inline UShort sel16x4_2 ( ULong w64 ) {
3759    UInt hi32 = toUInt(w64 >> 32);
3760    return toUShort(hi32);
3761 }
3762 static inline UShort sel16x4_1 ( ULong w64 ) {
3763    UInt lo32 = toUInt(w64);
3764    return toUShort(lo32 >> 16);
3765 }
3766 static inline UShort sel16x4_0 ( ULong w64 ) {
3767    UInt lo32 = toUInt(w64);
3768    return toUShort(lo32);
3769 }
3770 
3771 static inline UChar sel8x8_7 ( ULong w64 ) {
3772    UInt hi32 = toUInt(w64 >> 32);
3773    return toUChar(hi32 >> 24);
3774 }
3775 static inline UChar sel8x8_6 ( ULong w64 ) {
3776    UInt hi32 = toUInt(w64 >> 32);
3777    return toUChar(hi32 >> 16);
3778 }
3779 static inline UChar sel8x8_5 ( ULong w64 ) {
3780    UInt hi32 = toUInt(w64 >> 32);
3781    return toUChar(hi32 >> 8);
3782 }
3783 static inline UChar sel8x8_4 ( ULong w64 ) {
3784    UInt hi32 = toUInt(w64 >> 32);
3785    return toUChar(hi32 >> 0);
3786 }
3787 static inline UChar sel8x8_3 ( ULong w64 ) {
3788    UInt lo32 = toUInt(w64);
3789    return toUChar(lo32 >> 24);
3790 }
3791 static inline UChar sel8x8_2 ( ULong w64 ) {
3792    UInt lo32 = toUInt(w64);
3793    return toUChar(lo32 >> 16);
3794 }
3795 static inline UChar sel8x8_1 ( ULong w64 ) {
3796    UInt lo32 = toUInt(w64);
3797    return toUChar(lo32 >> 8);
3798 }
3799 static inline UChar sel8x8_0 ( ULong w64 ) {
3800    UInt lo32 = toUInt(w64);
3801    return toUChar(lo32 >> 0);
3802 }
3803 
3804 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3805 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3806 {
3807    return
3808       mk32x2(
3809          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3810             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3811          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3812             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3813       );
3814 }
3815 
3816 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3817 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3818 {
3819    UInt t = 0;
3820    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3821    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3822    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3823    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3824    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3825    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3826    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3827    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3828    t &= 0xFFFF;
3829    return (ULong)t;
3830 }
3831 
3832 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3833 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3834 {
3835    UShort t, min;
3836    UInt   idx;
3837    t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
3838    t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3839    t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3840    t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3841    t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3842    t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3843    t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3844    t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3845    return ((ULong)(idx << 16)) | ((ULong)min);
3846 }
3847 
3848 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3849 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3850 {
3851    UInt  i;
3852    ULong crc = (b & 0xFFULL) ^ crcIn;
3853    for (i = 0; i < 8; i++)
3854       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3855    return crc;
3856 }
3857 
3858 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3859 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3860 {
3861    UInt  i;
3862    ULong crc = (w & 0xFFFFULL) ^ crcIn;
3863    for (i = 0; i < 16; i++)
3864       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3865    return crc;
3866 }
3867 
3868 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3869 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3870 {
3871    UInt i;
3872    ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3873    for (i = 0; i < 32; i++)
3874       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3875    return crc;
3876 }
3877 
3878 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3879 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3880 {
3881    ULong crc = amd64g_calc_crc32l(crcIn, q);
3882    return amd64g_calc_crc32l(crc, q >> 32);
3883 }
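
/* Illustrative note: the four helpers above all perform the bit-at-a-
   time update used by the SSE4.2 CRC32 instruction, i.e. CRC-32C
   (Castagnoli) with the reflected polynomial 0x82F63B78.  A caller
   checksumming a byte buffer would fold each byte into the running
   value, as sketched below (hypothetical usage, kept out of the build
   with #if 0; the initial/final XOR is the usual software convention,
   not something the instruction itself does). */
#if 0
static ULong demo_crc32c_buffer ( const UChar* buf, ULong len )
{
   ULong crc = 0xFFFFFFFFULL;            /* conventional initial value */
   for (ULong i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return crc ^ 0xFFFFFFFFULL;           /* conventional final XOR */
}
#endif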
3884 
3885 
3886 /* .. helper for next fn .. */
3887 static inline ULong sad_8x4 ( ULong xx, ULong yy )
3888 {
3889    UInt t = 0;
3890    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3891    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3892    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3893    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3894    return (ULong)t;
3895 }
3896 
3897 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3898 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3899                             ULong dHi, ULong dLo,
3900                             ULong imm_and_return_control_bit )
3901 {
3902    UInt imm8     = imm_and_return_control_bit & 7;
3903    Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
3904    UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3905    UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3906    /* For src we only need 32 bits, so get them into the
3907       lower half of a 64 bit word. */
3908    ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3909    /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3910       11 bytes.  If calculating the low part of the result, need bytes
3911       dstOffsL * 4 + (0 .. 6); if calculating the high part,
3912       dstOffsL * 4 + (4 .. 10). */
3913    ULong dst;
3914    /* dstOffL = 0, Lo  ->  0 .. 6
3915       dstOffL = 1, Lo  ->  4 .. 10
3916       dstOffL = 0, Hi  ->  4 .. 10
3917       dstOffL = 1, Hi  ->  8 .. 14
3918    */
3919    if (calcHi && dstOffsL) {
3920       /* 8 .. 14 */
3921       dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3922    }
3923    else if (!calcHi && !dstOffsL) {
3924       /* 0 .. 6 */
3925       dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3926    }
3927    else {
3928       /* 4 .. 10 */
3929       dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3930    }
3931    ULong r0  = sad_8x4( dst >>  0, src );
3932    ULong r1  = sad_8x4( dst >>  8, src );
3933    ULong r2  = sad_8x4( dst >> 16, src );
3934    ULong r3  = sad_8x4( dst >> 24, src );
3935    ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3936    return res;
3937 }
3938 
3939 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3940 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3941 {
3942    ULong dst = 0;
3943    ULong src_bit;
3944    ULong dst_bit = 1;
3945    for (src_bit = 1; src_bit; src_bit <<= 1) {
3946       if (mask & src_bit) {
3947          if (src_masked & src_bit) dst |= dst_bit;
3948          dst_bit <<= 1;
3949       }
3950    }
3951    return dst;
3952 }
3953 
3954 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3955 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3956 {
3957    ULong dst = 0;
3958    ULong dst_bit;
3959    ULong src_bit = 1;
3960    for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3961       if (mask & dst_bit) {
3962          if (src & src_bit) dst |= dst_bit;
3963          src_bit <<= 1;
3964       }
3965    }
3966    return dst;
3967 }
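
/* Illustrative worked example for the two helpers above: PEXT gathers
   the source bits selected by the mask into the low end of the
   result, and PDEP scatters low-order source bits back into the mask
   positions, so the two are inverses on the selected bits.  With mask
   0xF0: pext(0xA0, 0xF0) == 0x0A and pdep(0x0A, 0xF0) == 0xA0.  Kept
   out of the build with #if 0. */
#if 0
static void demo_pext_pdep ( void )
{
   vassert(amd64g_calculate_pext(0xA0, 0xF0) == 0x0A);
   vassert(amd64g_calculate_pdep(0x0A, 0xF0) == 0xA0);
}
#endif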
3968 
3969 /*---------------------------------------------------------------*/
3970 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
3971 /*---------------------------------------------------------------*/
3972 
3973 static UInt zmask_from_V128 ( V128* arg )
3974 {
3975    UInt i, res = 0;
3976    for (i = 0; i < 16; i++) {
3977       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
3978    }
3979    return res;
3980 }
3981 
3982 static UInt zmask_from_V128_wide ( V128* arg )
3983 {
3984    UInt i, res = 0;
3985    for (i = 0; i < 8; i++) {
3986       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
3987    }
3988    return res;
3989 }
3990 
3991 /* Helps with PCMP{I,E}STR{I,M}.
3992 
3993    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
3994    actually it could be a clean helper, but for the fact that we can't
3995    pass by value 2 x V128 to a clean helper, nor have one returned.)
3996    Reads guest state, writes to guest state for the xSTRM cases, no
3997    accesses of memory, is a pure function.
3998 
3999    opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
4000    the callee knows which I/E and I/M variant it is dealing with and
4001    what the specific operation is.  4th byte of opcode is in the range
4002    0x60 to 0x63:
4003        istri  66 0F 3A 63
4004        istrm  66 0F 3A 62
4005        estri  66 0F 3A 61
4006        estrm  66 0F 3A 60
4007 
4008    gstOffL and gstOffR are the guest state offsets for the two XMM
4009    register inputs.  We never have to deal with the memory case since
4010    that is handled by pre-loading the relevant value into the fake
4011    XMM16 register.
4012 
4013    For ESTRx variants, edxIN and eaxIN hold the values of those two
4014    registers.
4015 
4016    In all cases, the bottom 16 bits of the result contain the new
4017    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
4018    result hold the new %ecx value.  For xSTRM variants, the helper
4019    writes the result directly to the guest XMM0.
4020 
4021    Declarable side effects: in all cases, reads guest state at
4022    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
4023    guest_XMM0.
4024 
4025    Is expected to be called with opc_and_imm combinations which have
4026    actually been validated, and will assert if otherwise.  The front
4027    end should ensure we're only called with verified values.
4028 */
4029 ULong amd64g_dirtyhelper_PCMPxSTRx (
4030           VexGuestAMD64State* gst,
4031           HWord opc4_and_imm,
4032           HWord gstOffL, HWord gstOffR,
4033           HWord edxIN, HWord eaxIN
4034        )
4035 {
4036    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
4037    HWord imm8 = opc4_and_imm & 0xFF;
4038    HWord isISTRx = opc4 & 2;
4039    HWord isxSTRM = (opc4 & 1) ^ 1;
4040    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
4041    HWord wide = (imm8 & 1);
4042 
4043    // where the args are
4044    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4045    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4046 
4047    /* Create the arg validity masks, either from the vectors
4048       themselves or from the supplied edx/eax values. */
4049    // FIXME: this is only right for the 8-bit data cases.
4050    // At least that is asserted above.
4051    UInt zmaskL, zmaskR;
4052 
4053    // temp spot for the resulting flags and vector.
4054    V128 resV;
4055    UInt resOSZACP;
4056 
4057    // for checking whether case was handled
4058    Bool ok = False;
4059 
4060    if (wide) {
4061       if (isISTRx) {
4062          zmaskL = zmask_from_V128_wide(argL);
4063          zmaskR = zmask_from_V128_wide(argR);
4064       } else {
4065          Int tmp;
4066          tmp = edxIN & 0xFFFFFFFF;
4067          if (tmp < -8) tmp = -8;
4068          if (tmp > 8)  tmp = 8;
4069          if (tmp < 0)  tmp = -tmp;
4070          vassert(tmp >= 0 && tmp <= 8);
4071          zmaskL = (1 << tmp) & 0xFF;
4072          tmp = eaxIN & 0xFFFFFFFF;
4073          if (tmp < -8) tmp = -8;
4074          if (tmp > 8)  tmp = 8;
4075          if (tmp < 0)  tmp = -tmp;
4076          vassert(tmp >= 0 && tmp <= 8);
4077          zmaskR = (1 << tmp) & 0xFF;
4078       }
4079       // do the meyaath
4080       ok = compute_PCMPxSTRx_wide (
4081               &resV, &resOSZACP, argL, argR,
4082               zmaskL, zmaskR, imm8, (Bool)isxSTRM
4083            );
4084    } else {
4085       if (isISTRx) {
4086          zmaskL = zmask_from_V128(argL);
4087          zmaskR = zmask_from_V128(argR);
4088       } else {
4089          Int tmp;
4090          tmp = edxIN & 0xFFFFFFFF;
4091          if (tmp < -16) tmp = -16;
4092          if (tmp > 16)  tmp = 16;
4093          if (tmp < 0)   tmp = -tmp;
4094          vassert(tmp >= 0 && tmp <= 16);
4095          zmaskL = (1 << tmp) & 0xFFFF;
4096          tmp = eaxIN & 0xFFFFFFFF;
4097          if (tmp < -16) tmp = -16;
4098          if (tmp > 16)  tmp = 16;
4099          if (tmp < 0)   tmp = -tmp;
4100          vassert(tmp >= 0 && tmp <= 16);
4101          zmaskR = (1 << tmp) & 0xFFFF;
4102       }
4103       // do the meyaath
4104       ok = compute_PCMPxSTRx (
4105               &resV, &resOSZACP, argL, argR,
4106               zmaskL, zmaskR, imm8, (Bool)isxSTRM
4107            );
4108    }
4109 
4110    // front end shouldn't pass us any imm8 variants we can't
4111    // handle.  Hence:
4112    vassert(ok);
4113 
4114    // Finally, return the results to the caller.  In all cases the new
4115    // OSZACP value is in the lowest 16 bits of the return value; for the
4116    // xSTRI variants the new ECX value is returned in bits 31:16.
4117    if (isxSTRM) {
4118       gst->guest_YMM0[0] = resV.w32[0];
4119       gst->guest_YMM0[1] = resV.w32[1];
4120       gst->guest_YMM0[2] = resV.w32[2];
4121       gst->guest_YMM0[3] = resV.w32[3];
4122       return resOSZACP & 0x8D5;
4123    } else {
4124       UInt newECX = resV.w32[0] & 0xFFFF;
4125       return (newECX << 16) | (resOSZACP & 0x8D5);
4126    }
4127 }
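
/* A minimal sketch of how a caller might unpack the value this helper
   returns (the variable names here are purely illustrative):

      ULong packed    = amd64g_dirtyhelper_PCMPxSTRx( gst, opc4_and_imm,
                                                      gstOffL, gstOffR,
                                                      edxIN, eaxIN );
      UInt  newOSZACP = (UInt)(packed & 0x8D5);          // new flag bits
      UInt  newECX    = (UInt)((packed >> 16) & 0xFFFF); // xSTRI forms only

   For the xSTRM forms the vector result has already been written to
   guest_YMM0 by the helper, so only the flag bits are returned. */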
4128 
4129 /*---------------------------------------------------------------*/
4130 /*--- AES primitives and helpers                              ---*/
4131 /*---------------------------------------------------------------*/
4132 /* a 16 x 16 matrix */
4133 static const UChar sbox[256] = {                   // row nr
4134    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4135    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4136    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4137    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4138    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4139    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4140    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4141    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4142    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4143    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4144    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4145    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4146    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4147    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4148    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4149    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4150    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4151    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4152    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4153    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4154    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4155    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4156    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4157    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4158    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4159    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4160    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4161    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4162    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4163    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4164    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4165    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4166 };
4167 static void SubBytes (V128* v)
4168 {
4169    V128 r;
4170    UInt i;
4171    for (i = 0; i < 16; i++)
4172       r.w8[i] = sbox[v->w8[i]];
4173    *v = r;
4174 }
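
/* Sanity check for the table above: SubBytes maps 0x00 to 0x63 and 0x53 to
   0xED, matching the standard FIPS-197 S-box values. */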
4175 
4176 /* a 16 x 16 matrix */
4177 static const UChar invsbox[256] = {                // row nr
4178    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4179    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4180    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4181    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4182    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4183    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4184    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4185    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4186    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4187    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4188    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4189    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4190    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4191    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4192    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4193    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4194    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4195    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4196    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4197    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4198    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4199    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4200    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4201    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4202    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4203    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4204    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4205    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4206    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4207    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4208    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4209    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4210 };
4211 static void InvSubBytes (V128* v)
4212 {
4213    V128 r;
4214    UInt i;
4215    for (i = 0; i < 16; i++)
4216       r.w8[i] = invsbox[v->w8[i]];
4217    *v = r;
4218 }
4219 
4220 static const UChar ShiftRows_op[16] =
4221    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4222 static void ShiftRows (V128* v)
4223 {
4224    V128 r;
4225    UInt i;
4226    for (i = 0; i < 16; i++)
4227       r.w8[i] = v->w8[ShiftRows_op[15-i]];
4228    *v = r;
4229 }
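
/* Note on the permutation above: the state is held column-major, as in the
   AESENC data path, so output bytes 0..3 (the first column after ShiftRows)
   are taken from input bytes 0, 5, 10 and 15 respectively. */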
4230 
4231 static const UChar InvShiftRows_op[16] =
4232    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4233 static void InvShiftRows (V128* v)
4234 {
4235    V128 r;
4236    UInt i;
4237    for (i = 0; i < 16; i++)
4238       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4239    *v = r;
4240 }
4241 
4242 /* Multiplication of the finite fields elements of AES.
4243    See "A Specification for The AES Algorithm Rijndael
4244         (by Joan Daemen & Vincent Rijmen)"
4245         Dr. Brian Gladman, v3.1, 3rd March 2001. */
4246 /* N values so that (hex) xy = 0x03^N.  N is undefined for 0x00, so
4247    0xff is used as a placeholder for that entry. */
4248 /* a 16 x 16 matrix */
4249 static const UChar Nxy[256] = {                    // row nr
4250    0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4251    0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4252    0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4253    0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4254    0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4255    0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4256    0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4257    0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4258    0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4259    0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4260    0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4261    0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4262    0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4263    0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4264    0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4265    0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4266    0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4267    0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4268    0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4269    0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4270    0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4271    0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4272    0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4273    0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4274    0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4275    0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4276    0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4277    0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4278    0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4279    0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4280    0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4281    0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4282 };
4283 
4284 /* E values so that E = 0x03^xy. */
4285 static const UChar Exy[256] = {                    // row nr
4286    0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4287    0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4288    0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4289    0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4290    0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4291    0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4292    0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4293    0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4294    0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4295    0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4296    0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4297    0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4298    0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4299    0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4300    0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4301    0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4302    0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4303    0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4304    0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4305    0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4306    0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4307    0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4308    0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4309    0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4310    0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4311    0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4312    0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4313    0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4314    0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4315    0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4316    0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4317    0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4318 
4319 static inline UChar ff_mul(UChar u1, UChar u2)
4320 {
4321    if ((u1 > 0) && (u2 > 0)) {
4322       UInt ui = Nxy[u1] + Nxy[u2];
4323       if (ui >= 255)
4324          ui = ui - 255;
4325       return Exy[ui];
4326    } else {
4327       return 0;
4328    };
4329 }
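
/* Worked example: ff_mul(0x02, 0x03) looks up Nxy[0x02] == 0x19 and
   Nxy[0x03] == 0x01, sums them to 0x1a, and returns Exy[0x1a] == 0x06,
   which is indeed 0x02 * 0x03 in GF(2^8). */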
4330 
4331 static void MixColumns (V128* v)
4332 {
4333    V128 r;
4334    Int j;
4335 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4336    for (j = 0; j < 4; j++) {
4337       P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4338          ^ P(v,j,2) ^ P(v,j,3);
4339       P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4340          ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4341       P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4342          ^ ff_mul(0x03, P(v,j,3) );
4343       P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4344          ^ ff_mul( 0x02, P(v,j,3) );
4345    }
4346    *v = r;
4347 #undef P
4348 }
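
/* In the P() indexing above, each group of four consecutive bytes of the
   V128 is one AES state column, and each output byte is the usual
   {02,03,01,01} MixColumns combination of the four bytes in its column. */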
4349 
4350 static void InvMixColumns (V128* v)
4351 {
4352    V128 r;
4353    Int j;
4354 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4355    for (j = 0; j < 4; j++) {
4356       P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4357          ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4358       P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4359          ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4360       P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4361          ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4362       P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4363          ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4364    }
4365    *v = r;
4366 #undef P
4367 
4368 }
4369 
4370 /* For description, see definition in guest_amd64_defs.h */
4371 void amd64g_dirtyhelper_AES (
4372           VexGuestAMD64State* gst,
4373           HWord opc4, HWord gstOffD,
4374           HWord gstOffL, HWord gstOffR
4375        )
4376 {
4377    // where the args are
4378    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4379    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4380    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4381    V128  r;
4382 
4383    switch (opc4) {
4384       case 0xDC: /* AESENC */
4385       case 0xDD: /* AESENCLAST */
4386          r = *argR;
4387          ShiftRows (&r);
4388          SubBytes  (&r);
4389          if (opc4 == 0xDC)
4390             MixColumns (&r);
4391          argD->w64[0] = r.w64[0] ^ argL->w64[0];
4392          argD->w64[1] = r.w64[1] ^ argL->w64[1];
4393          break;
4394 
4395       case 0xDE: /* AESDEC */
4396       case 0xDF: /* AESDECLAST */
4397          r = *argR;
4398          InvShiftRows (&r);
4399          InvSubBytes (&r);
4400          if (opc4 == 0xDE)
4401             InvMixColumns (&r);
4402          argD->w64[0] = r.w64[0] ^ argL->w64[0];
4403          argD->w64[1] = r.w64[1] ^ argL->w64[1];
4404          break;
4405 
4406       case 0xDB: /* AESIMC */
4407          *argD = *argL;
4408          InvMixColumns (argD);
4409          break;
4410       default: vassert(0);
4411    }
4412 }
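
/* Observed data flow for the ENC/DEC cases above: *argR is the state that
   has (Inv)ShiftRows, (Inv)SubBytes and (optionally) (Inv)MixColumns applied
   to it, *argL is XORed in as the round key, and the result lands in *argD,
   matching the architected AESENC/AESENCLAST/AESDEC/AESDECLAST definitions. */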
4413 
4414 static inline UInt RotWord (UInt   w32)
4415 {
4416    return ((w32 >> 8) | (w32 << 24));
4417 }
4418 
4419 static inline UInt SubWord (UInt   w32)
4420 {
4421    UChar *w8;
4422    UChar *r8;
4423    UInt res;
4424    w8 = (UChar*) &w32;
4425    r8 = (UChar*) &res;
4426    r8[0] = sbox[w8[0]];
4427    r8[1] = sbox[w8[1]];
4428    r8[2] = sbox[w8[2]];
4429    r8[3] = sbox[w8[3]];
4430    return res;
4431 }
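
/* Worked examples: RotWord rotates the 32-bit word right by 8 bits, so
   RotWord(0xDDCCBBAA) == 0xAADDCCBB.  SubWord applies the S-box to each of
   the four bytes in place, so on a little-endian host
   SubWord(0x00000053) == 0x636363ED. */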
4432 
4433 /* For description, see definition in guest_amd64_defs.h */
4434 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4435           VexGuestAMD64State* gst,
4436           HWord imm8,
4437           HWord gstOffL, HWord gstOffR
4438        )
4439 {
4440    // where the args are
4441    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4442    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4443 
4444    // We have to create the result in a temporary in the
4445    // case where the src and dst regs are the same.  See #341698.
4446    V128 tmp;
4447 
4448    tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4449    tmp.w32[2] = SubWord (argL->w32[3]);
4450    tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4451    tmp.w32[0] = SubWord (argL->w32[1]);
4452 
4453    argR->w32[3] = tmp.w32[3];
4454    argR->w32[2] = tmp.w32[2];
4455    argR->w32[1] = tmp.w32[1];
4456    argR->w32[0] = tmp.w32[0];
4457 }
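
/* The four assignments above follow the architected AESKEYGENASSIST layout:
   with X3..X0 being the 32-bit words of the source (X3 most significant),
   the result is
      [ RotWord(SubWord(X3)) ^ RCON, SubWord(X3),
        RotWord(SubWord(X1)) ^ RCON, SubWord(X1) ]
   where RCON is the zero-extended imm8. */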
4458 
4459 
4460 
4461 /*---------------------------------------------------------------*/
4462 /*--- Helpers for dealing with, and describing,               ---*/
4463 /*--- guest state as a whole.                                 ---*/
4464 /*---------------------------------------------------------------*/
4465 
4466 /* Initialise the entire amd64 guest state. */
4467 /* VISIBLE TO LIBVEX CLIENT */
4468 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4469 {
4470    vex_state->host_EvC_FAILADDR = 0;
4471    vex_state->host_EvC_COUNTER = 0;
4472    vex_state->pad0 = 0;
4473 
4474    vex_state->guest_RAX = 0;
4475    vex_state->guest_RCX = 0;
4476    vex_state->guest_RDX = 0;
4477    vex_state->guest_RBX = 0;
4478    vex_state->guest_RSP = 0;
4479    vex_state->guest_RBP = 0;
4480    vex_state->guest_RSI = 0;
4481    vex_state->guest_RDI = 0;
4482    vex_state->guest_R8  = 0;
4483    vex_state->guest_R9  = 0;
4484    vex_state->guest_R10 = 0;
4485    vex_state->guest_R11 = 0;
4486    vex_state->guest_R12 = 0;
4487    vex_state->guest_R13 = 0;
4488    vex_state->guest_R14 = 0;
4489    vex_state->guest_R15 = 0;
4490 
4491    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
4492    vex_state->guest_CC_DEP1 = 0;
4493    vex_state->guest_CC_DEP2 = 0;
4494    vex_state->guest_CC_NDEP = 0;
4495 
4496    vex_state->guest_DFLAG   = 1; /* forwards */
4497    vex_state->guest_IDFLAG  = 0;
4498    vex_state->guest_ACFLAG  = 0;
4499 
4500    /* HACK: represent the offset associated with a constant %fs.
4501       Typically, on linux, this assumes that %fs is only ever zero (main
4502       thread) or 0x63. */
4503    vex_state->guest_FS_CONST = 0;
4504 
4505    vex_state->guest_RIP = 0;
4506 
4507    /* Initialise the simulated FPU */
4508    amd64g_dirtyhelper_FINIT( vex_state );
4509 
4510    /* Initialise the AVX state. */
4511 #  define AVXZERO(_ymm) \
4512       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4513            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4514       } while (0)
4515    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4516    AVXZERO(vex_state->guest_YMM0);
4517    AVXZERO(vex_state->guest_YMM1);
4518    AVXZERO(vex_state->guest_YMM2);
4519    AVXZERO(vex_state->guest_YMM3);
4520    AVXZERO(vex_state->guest_YMM4);
4521    AVXZERO(vex_state->guest_YMM5);
4522    AVXZERO(vex_state->guest_YMM6);
4523    AVXZERO(vex_state->guest_YMM7);
4524    AVXZERO(vex_state->guest_YMM8);
4525    AVXZERO(vex_state->guest_YMM9);
4526    AVXZERO(vex_state->guest_YMM10);
4527    AVXZERO(vex_state->guest_YMM11);
4528    AVXZERO(vex_state->guest_YMM12);
4529    AVXZERO(vex_state->guest_YMM13);
4530    AVXZERO(vex_state->guest_YMM14);
4531    AVXZERO(vex_state->guest_YMM15);
4532    AVXZERO(vex_state->guest_YMM16);
4533 
4534 #  undef AVXZERO
4535 
4536    vex_state->guest_EMNOTE = EmNote_NONE;
4537 
4538    /* These should not ever be either read or written, but we
4539       initialise them anyway. */
4540    vex_state->guest_CMSTART = 0;
4541    vex_state->guest_CMLEN   = 0;
4542 
4543    vex_state->guest_NRADDR   = 0;
4544    vex_state->guest_SC_CLASS = 0;
4545    vex_state->guest_GS_CONST = 0;
4546 
4547    vex_state->guest_IP_AT_SYSCALL = 0;
4548    vex_state->pad1 = 0;
4549 }
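
/* A minimal sketch of typical client-side use (the register values shown
   are purely illustrative):

      VexGuestAMD64State gst;
      LibVEX_GuestAMD64_initialise(&gst);
      gst.guest_RIP = 0x400000;      // entry point of the code to run
      gst.guest_RSP = 0x7fff0000;    // top of the stack the client set up

   i.e. the client first defaults everything with this function and then
   fills in just the registers it actually cares about. */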
4550 
4551 
4552 /* Figure out if any part of the guest state contained in minoff
4553    .. maxoff requires precise memory exceptions.  If in doubt return
4554    True (but this generates significantly slower code).
4555 
4556    By default we enforce precise exns for guest %RSP, %RBP and %RIP
4557    only.  These are the minimum needed to extract correct stack
4558    backtraces from amd64 code.
4559 
4560    Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
4561 */
4562 Bool guest_amd64_state_requires_precise_mem_exns (
4563         Int minoff, Int maxoff, VexRegisterUpdates pxControl
4564      )
4565 {
4566    Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4567    Int rbp_max = rbp_min + 8 - 1;
4568    Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4569    Int rsp_max = rsp_min + 8 - 1;
4570    Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4571    Int rip_max = rip_min + 8 - 1;
4572 
4573    if (maxoff < rsp_min || minoff > rsp_max) {
4574       /* no overlap with rsp */
4575       if (pxControl == VexRegUpdSpAtMemAccess)
4576          return False; // We only need to check the stack pointer.
4577    } else {
4578       return True;
4579    }
4580 
4581    if (maxoff < rbp_min || minoff > rbp_max) {
4582       /* no overlap with rbp */
4583    } else {
4584       return True;
4585    }
4586 
4587    if (maxoff < rip_min || minoff > rip_max) {
4588       /* no overlap with rip */
4589    } else {
4590       return True;
4591    }
4592 
4593    return False;
4594 }
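
/* For example: a [minoff, maxoff] range covering only guest_RAX yields
   False, whereas any range overlapping guest_RSP, guest_RBP or guest_RIP
   yields True -- except that in mode VexRegUpdSpAtMemAccess only the
   guest_RSP overlap matters. */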
4595 
4596 
4597 #define ALWAYSDEFD(field)                             \
4598     { offsetof(VexGuestAMD64State, field),            \
4599       (sizeof ((VexGuestAMD64State*)0)->field) }
4600 
4601 VexGuestLayout
4602    amd64guest_layout
4603       = {
4604           /* Total size of the guest state, in bytes. */
4605           .total_sizeB = sizeof(VexGuestAMD64State),
4606 
4607           /* Describe the stack pointer. */
4608           .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4609           .sizeof_SP = 8,
4610 
4611           /* Describe the frame pointer. */
4612           .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4613           .sizeof_FP = 8,
4614 
4615           /* Describe the instruction pointer. */
4616           .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4617           .sizeof_IP = 8,
4618 
4619           /* Describe any sections to be regarded by Memcheck as
4620              'always-defined'. */
4621           .n_alwaysDefd = 16,
4622 
4623           /* flags thunk: OP and NDEP are always defd, whereas DEP1
4624              and DEP2 have to be tracked.  See detailed comment in
4625              gdefs.h on meaning of thunk fields. */
4626           .alwaysDefd
4627              = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
4628                  /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
4629                  /*  2 */ ALWAYSDEFD(guest_DFLAG),
4630                  /*  3 */ ALWAYSDEFD(guest_IDFLAG),
4631                  /*  4 */ ALWAYSDEFD(guest_RIP),
4632                  /*  5 */ ALWAYSDEFD(guest_FS_CONST),
4633                  /*  6 */ ALWAYSDEFD(guest_FTOP),
4634                  /*  7 */ ALWAYSDEFD(guest_FPTAG),
4635                  /*  8 */ ALWAYSDEFD(guest_FPROUND),
4636                  /*  9 */ ALWAYSDEFD(guest_FC3210),
4637                  // /* */ ALWAYSDEFD(guest_CS),
4638                  // /* */ ALWAYSDEFD(guest_DS),
4639                  // /* */ ALWAYSDEFD(guest_ES),
4640                  // /* */ ALWAYSDEFD(guest_FS),
4641                  // /* */ ALWAYSDEFD(guest_GS),
4642                  // /* */ ALWAYSDEFD(guest_SS),
4643                  // /* */ ALWAYSDEFD(guest_LDT),
4644                  // /* */ ALWAYSDEFD(guest_GDT),
4645                  /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4646                  /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4647                  /* 12 */ ALWAYSDEFD(guest_CMSTART),
4648                  /* 13 */ ALWAYSDEFD(guest_CMLEN),
4649                  /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4650                  /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4651                }
4652         };
4653 
4654 
4655 /*---------------------------------------------------------------*/
4656 /*--- end                               guest_amd64_helpers.c ---*/
4657 /*---------------------------------------------------------------*/
4658