1 
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <assert.h>
5 #include "tests/asm.h"
6 #include "tests/malloc.h"
7 #include <string.h>
8 
/* Size in bytes of every XSAVE/XRSTOR image buffer used in this file.
   NOTE(review): presumably 512 (legacy x87/SSE region) + 64 (XSAVE
   header, which the code below addresses at offset 512) + 256 (AVX
   component) -- confirm against the Intel SDM. */
#define XSAVE_AREA_SIZE 832
10 
/* Short aliases for the basic unsigned integer types used below. */
typedef  unsigned char           UChar;
typedef  unsigned int            UInt;
typedef  unsigned long long int  ULong;

/* Not referenced by the code visible in this file. */
typedef  unsigned long int       UWord;

/* Minimal boolean type (one byte) and its two values. */
typedef  unsigned char  Bool;
#define  True   ((Bool)1)
#define  False  ((Bool)0)
20 
/* 32-byte test pattern; do_setup_then_xsave loads it into %ymm0 and
   copies it from there into several other YMM registers. */
const unsigned int vec0[8]
   = { 0x12345678, 0x11223344, 0x55667788, 0x87654321,
       0x15263748, 0x91929394, 0x19293949, 0x48372615 };

/* Second 32-byte test pattern; loaded into %ymm1 by
   do_setup_then_xsave. */
const unsigned int vec1[8]
   = { 0xABCDEF01, 0xAABBCCDD, 0xEEFF0011, 0x10FEDCBA,
       0xBADCFE10, 0xFFEE9988, 0x11667722, 0x01EFCDAB };

/* All-zero 32-byte pattern.  Not referenced by the code visible in
   this file. */
const unsigned int vecZ[8]
   = { 0, 0, 0, 0, 0, 0, 0, 0 };
31 
/* Byte-at-a-time stand-in for memset, intended not to use XMM or YMM
   registers.  Stores the low 8 bits of |c| into each of the first |n|
   bytes starting at |s|, and returns |s|. */
static __attribute__((noinline))
void* my_memset(void* s, int c, size_t n)
{
   unsigned char* out = (unsigned char*)s;
   const unsigned char fill = (unsigned char)(unsigned int)c;
   size_t remaining;
   for (remaining = n; remaining != 0; remaining--) {
      *out++ = fill;
      /* Empty asm with a "memory" clobber after every store, to defeat
         any attempt at autovectorisation. */
      __asm__ __volatile__("" ::: "cc","memory");
   }
   return s;
}
44 
/* Byte-at-a-time stand-in for memcpy, intended not to use XMM or YMM
   registers.  Copies |n| bytes from |src| to |dest| in ascending
   address order, and returns |dest|. */
static __attribute__((noinline))
void* my_memcpy(void *dest, const void *src, size_t n)
{
   unsigned char* out = (unsigned char*)dest;
   const unsigned char* in = (const unsigned char*)src;
   size_t remaining;
   for (remaining = n; remaining != 0; remaining--) {
      *out++ = *in++;
      /* Empty asm with a "memory" clobber after every byte, to defeat
         any attempt at autovectorisation. */
      __asm__ __volatile__("" ::: "cc","memory");
   }
   return dest;
}
56 
/* Allocate |size| bytes with memalign64 (declared outside this file's
   visible code; presumably yields 64-byte-aligned memory) and clear
   them with my_memset.  Returns whatever memalign64 returned; a null
   result or a zero |size| is passed back without being written to.
   The callers in this file release the block with free(). */
static void* memalign_zeroed64(size_t size)
{
   char* block = memalign64(size);
   if (!block || size == 0) {
      return block;
   }
   my_memset(block, 0, size);
   return block;
}
65 
66 __attribute__((noinline))
do_xsave(void * p,UInt rfbm)67 static void do_xsave ( void* p, UInt rfbm )
68 {
69    assert(rfbm <= 7);
70    __asm__ __volatile__(
71       "movq %0, %%rax;  xorq %%rdx, %%rdx;  xsave (%1)"
72          : /*OUT*/ : /*IN*/ "r"((ULong)rfbm), "r"(p)
73          : /*TRASH*/ "memory", "rax", "rdx"
74    );
75 }
76 
77 __attribute__((noinline))
do_xrstor(void * p,UInt rfbm)78 static void do_xrstor ( void* p, UInt rfbm )
79 {
80    assert(rfbm <= 7);
81    __asm__ __volatile__(
82       "movq %0, %%rax;  xorq %%rdx, %%rdx;  xrstor (%1)"
83          : /*OUT*/ : /*IN*/ "r"((ULong)rfbm), "r"(p)
84          : /*TRASH*/ "rax", "rdx" /* FIXME plus all X87,SSE,AVX regs */
85    );
86 }
87 
88 /* set up the FP, SSE and AVX state, and then dump it. */
do_setup_then_xsave(void * p,UInt rfbm)89 static void do_setup_then_xsave ( void* p, UInt rfbm )
90 {
91    __asm__ __volatile__("finit");
92    __asm__ __volatile__("fldpi");
93    __asm__ __volatile__("fld1");
94    __asm__ __volatile__("fldln2");
95    __asm__ __volatile__("fldlg2");
96    __asm__ __volatile__("fld %st(3)");
97    __asm__ __volatile__("fld %st(3)");
98    __asm__ __volatile__("fld1");
99    __asm__ __volatile__("vmovups (%0), %%ymm0" : : "r"(&vec0[0]) : "xmm0" );
100    __asm__ __volatile__("vmovups (%0), %%ymm1" : : "r"(&vec1[0]) : "xmm1" );
101    __asm__ __volatile__("vxorps  %ymm2, %ymm2, %ymm2");
102    __asm__ __volatile__("vmovaps %ymm0, %ymm3");
103    __asm__ __volatile__("vmovaps %ymm1, %ymm4");
104    __asm__ __volatile__("vmovaps %ymm2, %ymm5");
105    __asm__ __volatile__("vmovaps %ymm0, %ymm6");
106    __asm__ __volatile__("vmovaps %ymm1, %ymm7");
107    __asm__ __volatile__("vmovaps %ymm1, %ymm8");
108    __asm__ __volatile__("vmovaps %ymm2, %ymm9");
109    __asm__ __volatile__("vmovaps %ymm0, %ymm10");
110    __asm__ __volatile__("vmovaps %ymm1, %ymm11");
111    __asm__ __volatile__("vmovaps %ymm1, %ymm12");
112    __asm__ __volatile__("vmovaps %ymm2, %ymm13");
113    __asm__ __volatile__("vmovaps %ymm0, %ymm14");
114    __asm__ __volatile__("vmovaps %ymm1, %ymm15");
115    do_xsave(p, rfbm);
116 }
117 
/* Return 1 if |i| is the first or second byte of one of the eight
   16-byte slots that start at image offsets 32, 48, .., 144 (the
   slots test_xrstor fills with x87 register values); otherwise
   return 0. */
static int isFPLsbs ( int i )
{
   int base;
   for (base = 32; base <= 144; base += 16) {
      if (i == base || i == base + 1) {
         return 1;
      }
   }
   return 0;
}
131 
/* Hex-dump the first XSAVE_AREA_SIZE bytes of |buf| to stderr, 16
   bytes per row, each row prefixed with its decimal starting offset.
   When |hideBits64to79| is set, the bytes for which isFPLsbs() returns
   1 are printed as "xx" instead of their value. */
static void show ( unsigned char* buf, Bool hideBits64to79 )
{
   int off;
   for (off = 0; off < XSAVE_AREA_SIZE; off++) {
      const int col = off % 16;

      if (col == 0) {
         fprintf(stderr, "%3d   ", off);
      }

      if (!hideBits64to79 || !isFPLsbs(off)) {
         fprintf(stderr, "%02x ", buf[off]);
      } else {
         fprintf(stderr, "xx ");
      }

      /* Last column of the row. */
      if (col == 15) {
         fprintf(stderr, "\n");
      }
   }
}
146 
/* Execute CPUID with EAX = |index| and ECX = |ecx_in|, and store the
   resulting EAX, EBX, ECX and EDX values through the four output
   pointers. */
static void cpuid ( UInt* eax, UInt* ebx, UInt* ecx, UInt* edx,
                    UInt index, UInt ecx_in )
{
   UInt ra, rb, rc, rd;
   __asm__ __volatile__("cpuid"
                        : "=a" (ra), "=b" (rb), "=c" (rc), "=d" (rd)
                        : "0" (index), "2" (ecx_in) );
   *eax = ra;
   *ebx = rb;
   *ecx = rc;
   *edx = rd;
}
158 
/* Execute XGETBV with ECX = |ecx_in| and store the resulting EAX and
   EDX values through |eax| and |edx|. */
static void xgetbv ( UInt* eax, UInt* edx, UInt ecx_in )
{
   UInt lo, hi;
   __asm__ __volatile__("xgetbv"
                        : "=a" (lo), "=d" (hi)
                        : "c" (ecx_in) );
   *eax = lo;
   *edx = hi;
}
167 
check_for_xsave(void)168 static void check_for_xsave ( void )
169 {
170    UInt eax, ebx, ecx, edx;
171    Bool ok = True;
172 
173    eax = ebx = ecx = edx = 0;
174    cpuid(&eax, &ebx, &ecx, &edx, 1,0);
175    //fprintf(stderr, "cpuid(1).ecx[26=xsave]   = %u\n", (ecx >> 26) & 1);
176    ok = ok && (((ecx >> 26) & 1) == 1);
177 
178    eax = ebx = ecx = edx = 0;
179    cpuid(&eax, &ebx, &ecx, &edx, 1,0);
180    //fprintf(stderr, "cpuid(1).ecx[27=osxsave] = %u\n", (ecx >> 27) & 1);
181    ok = ok && (((ecx >> 27) & 1) == 1);
182 
183    eax = ebx = ecx = edx = 0;
184    xgetbv(&eax, &edx, 0);
185    //fprintf(stderr, "xgetbv(0) = %u:%u\n", edx, eax);
186    ok = ok && (edx == 0) && (eax == 7);
187 
188    if (ok) return;
189 
190    fprintf(stderr,
191            "This program must be run on a CPU that supports AVX and XSAVE.\n");
192    exit(1);
193 }
194 
195 
/* Testing XSAVE:

   For RFBM in 0 .. 7 (that is, all combinations): set the x87, SSE
   and AVX registers with some values, do XSAVE to dump it, and print
   the resulting buffer.  |hideBits64to79| is forwarded to show(). */
void test_xsave ( Bool hideBits64to79 )
{
   UInt mask;
   for (mask = 0; mask < 8; mask++) {
      UChar* image = memalign_zeroed64(XSAVE_AREA_SIZE);

      /* Pre-fill with 0xAA so that bytes the instruction leaves alone
         are recognisable in the dump; byte 512 (first byte of the
         header area) starts out as zero. */
      my_memset(image, 0xAA, XSAVE_AREA_SIZE);
      image[512] = 0;

      do_setup_then_xsave(image, mask);

      fprintf(stderr,
              "------------------ XSAVE, rfbm = %u ------------------\n", mask);
      show(image, hideBits64to79);
      fprintf(stderr, "\n");

      free(image);
   }
}
220 
221 
/* Testing XRSTOR is more complex than testing XSAVE, because the
   loaded value(s) depend not only on what bits are requested (by
   RFBM) but also on what bits are actually present in the image
   (defined by XSTATE_BV).  So we have to test all 64 (8 x 8)
   combinations.

   The approach is to fill a memory buffer with data, do XRSTOR from
   the buffer, then dump all components with XSAVE in a new buffer,
   and print the result.  This is complicated by the fact that we need
   to be able to see which parts of the state (in registers) are
   neither overwritten nor zeroed by the restore.  Hence the registers
   must be pre-filled with values which are neither zero nor the data
   to be loaded.  We choose to use 0x55 where possible.

   |hideBits64to79| is forwarded to show().  All buffers allocated
   here are released before returning. */
void test_xrstor ( Bool hideBits64to79 )
{
   UChar* fives = memalign_zeroed64(XSAVE_AREA_SIZE);
   my_memset(fives, 0x55, XSAVE_AREA_SIZE);
   /* Set MXCSR so that the insn doesn't fault */
   fives[24] = 0x80;
   fives[25] = 0x1f;
   fives[26] = 0;
   fives[27] = 0;
   /* Ditto for the XSAVE header area.  Also set XSTATE_BV. */
   fives[512] = 7;
   UInt i;
   for (i = 1; i <= 23; i++) fives[512+i] = 0;
   /* Fill the x87 register values with something that VEX's
      80-vs-64-bit kludging won't mess up -- an 80 bit number which is
      representable also as 64 bit: 123456789.0123 */
   for (i = 0; i <= 7; i++) {
      UChar* p = &fives[32 + 16 * i];
      p[0]=0x00; p[1]=0xf8; p[2]=0xc2; p[3]=0x64; p[4]=0xa0;
      p[5]=0xa2; p[6]=0x79; p[7]=0xeb; p[8]=0x19; p[9]=0x40;
   }
   /* And mark the tags for all 8 dumped regs as "valid". */
   fives[4/*FTW*/] = 0xFF;

   /* (1) (see comment in loop below) */
   UChar* standard_test_data = memalign_zeroed64(XSAVE_AREA_SIZE);
   do_setup_then_xsave(standard_test_data, 7);

   UInt xstate_bv, rfbm;
   for (xstate_bv = 0; xstate_bv <= 7; xstate_bv++) {
      for (rfbm = 0; rfbm <= 7; rfbm++) {
         /* 1.  Copy the "standard test data" into registers, and dump
                it with XSAVE.  This gives us an image we can try
                restoring from.

            2.  Set the register state to all-0x55s (as far as is
                possible), so we can see which parts get overwritten
                and which parts get zeroed on the test restore.

            3.  Do the restore from the image prepared in (1).

            4.  Dump the state with XSAVE and print it.
         */

         /* (3a).  We can't use |standard_test_data| directly, since we
            need to put in the required |xstate_bv| value.  So make a
            copy and modify that instead. */
         UChar* img_to_restore_from = memalign_zeroed64(XSAVE_AREA_SIZE);
         my_memcpy(img_to_restore_from, standard_test_data, XSAVE_AREA_SIZE);
         img_to_restore_from[512] = xstate_bv;

         /* (4a) */
         UChar* saved_img = memalign_zeroed64(XSAVE_AREA_SIZE);
         my_memset(saved_img, 0xAA, XSAVE_AREA_SIZE);
         saved_img[512] = 0;

         /* (2) */
         do_xrstor(fives, 7);

         // X87, SSE, AVX state LIVE

         /* (3b) */
         /* and this is what we're actually trying to test */
         do_xrstor(img_to_restore_from, rfbm);

         // X87, SSE, AVX state LIVE

         /* (4b) */
         do_xsave(saved_img, 7);

         fprintf(stderr,
                 "---------- XRSTOR, xstate_bv = %u, rfbm = %u ---------\n",
                xstate_bv, rfbm);
         show(saved_img, hideBits64to79);
         fprintf(stderr, "\n");

         free(saved_img);
         free(img_to_restore_from);
      }
   }

   /* Release the two buffers that live across the whole test. */
   free(standard_test_data);
   free(fives);
}
320 
321 
main(int argc,char ** argv)322 int main ( int argc, char** argv )
323 {
324    Bool hideBits64to79 = argc > 1;
325    fprintf(stderr, "Re-run with any arg to suppress least-significant\n"
326                    "   16 bits of 80-bit FP numbers\n");
327 
328    check_for_xsave();
329 
330    if (1)
331    test_xsave(hideBits64to79);
332 
333    if (1)
334    test_xrstor(hideBits64to79);
335 
336    return 0;
337 }
338