1 /* Runtime detection of optional processor characteristics.
2  *
3  * Contents:
4  *   1. Checking for support of x86 vector code
5  *   2. Internal code used in those checks
6  *   3. Unit tests
7  *   4. Test driver
8  *   5. Example
9  *
10  * References:
11  *   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
12  *   https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
13  *   https://en.wikipedia.org/wiki/CPUID
14  */
15 #include "esl_config.h"
16 
17 #include <stdlib.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #if defined(_MSC_VER)
21 #include <intrin.h>
22 #endif
23 
24 #include "easel.h"
25 #include "esl_cpu.h"
26 
27 /* declarations of static functions that come in section (2)  */
28 #if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
29 static void cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd);
30 #endif
31 #ifdef eslENABLE_SSE
32 static int  cpu_has_sse(void);
33 #endif
34 #ifdef eslENABLE_SSE4
35 static int  cpu_has_sse4(void);
36 #endif
37 #ifdef eslENABLE_AVX
38 static int  cpu_check_xcr0_ymm(void);
39 static int  cpu_has_avx(void);
40 #endif
41 #ifdef eslENABLE_AVX512
42 static int  cpu_check_xcr0_zmm(void);
43 static int  cpu_has_avx512(void);
44 #endif
45 
46 /*****************************************************************
47  * 1. Checking for support of x86 vector code
48  *****************************************************************/
49 
/* Function:  esl_cpu_has_sse()
 * Synopsis:  Check if processor supports x86 SSE/SSE2
 * Incept:    SRE, Wed Feb  1 09:19:11 2017
 *
 * Purpose:   Returns TRUE if our code has an available SSE vector
 *            implementation compiled in, and the processor we're
 *            running on can support it (i.e. has SSE+SSE2).
 *            Else returns FALSE.
 *
 * Note:      The cached static flag is thread-safe despite not being
 *            atomic: it only ever moves from the unset state (-1) to
 *            the set state (0 or 1). The worst a race can do is have
 *            two threads store the same value.
 */
int
esl_cpu_has_sse(void)
{
#ifdef eslENABLE_SSE
  static int cached = -1;      // -1 = not determined yet; else 0|1
  if (cached == -1) cached = cpu_has_sse();
  return cached;
#else
  return 0;                    // no SSE implementation compiled in
#endif
}
77 
78 
/* Function:  esl_cpu_has_sse4()
 * Synopsis:  Check if processor supports x86 <= SSE4.1
 * Incept:    SRE, Wed Jun  6 11:49:46 2018 [OdjBox, Otto Croy]
 *
 * Purpose:   Returns TRUE if our code has an available SSE4 vector
 *            implementation compiled in, and the processor we're
 *            running on can support it (i.e. has SSE+SSE2+SSE4.1).
 *            Else returns FALSE.
 *
 * Note:      Thread-safe static caching; see esl_cpu_has_sse() note.
 */
int
esl_cpu_has_sse4(void)
{
#ifdef eslENABLE_SSE4
  static int cached = -1;      // -1 = not determined yet; else 0|1
  if (cached == -1) cached = cpu_has_sse4();
  return cached;
#else
  return 0;                    // no SSE4 implementation compiled in
#endif
}
100 
101 
102 
/* Function:  esl_cpu_has_avx()
 * Synopsis:  Check if processor supports x86 AVX/AVX2.
 * Incept:    SRE, Wed Feb  1 09:46:36 2017
 *
 * Purpose:   Returns TRUE if our code has an available AVX vector
 *            implementation compiled in, and the processor we're
 *            running on can support it (i.e. has AVX+AVX2).  Else
 *            returns FALSE.
 *
 * Note:      Thread-safe static caching; see esl_cpu_has_sse() note.
 */
int
esl_cpu_has_avx(void)
{
#ifdef eslENABLE_AVX
  static int cached = -1;      // -1 = not determined yet; else 0|1
  if (cached == -1) cached = cpu_has_avx();
  return cached;
#else
  return 0;                    // no AVX implementation compiled in
#endif
}
124 
/* Function:  esl_cpu_has_avx512()
 * Synopsis:  Check if processor supports x86 AVX-512.
 * Incept:    SRE, Wed Feb  1 09:47:24 2017
 *
 * Purpose:   Returns TRUE if our code has an available AVX512 vector
 *            implementation compiled in, and the processor we're
 *            running on can support it (i.e. has the AVX-512 F, DQ,
 *            and BW subsets that our code requires; see
 *            cpu_has_avx512()). Else returns FALSE.
 *
 * Note:      Thread-safe static caching; see esl_cpu_has_sse() note.
 */
int
esl_cpu_has_avx512(void)
{
#ifdef eslENABLE_AVX512
  static int cached = -1;      // -1 = not determined yet; else 0|1
  if (cached == -1) cached = cpu_has_avx512();
  return cached;
#else
  return 0;                    // no AVX-512 implementation compiled in
#endif
}
146 
147 
148 
/* Function:  esl_cpu_Get()
 * Synopsis:  Returns a string showing which implementation our dispatchers choose.
 * Incept:    SRE, Tue May 23 12:30:37 2017 [Handsome Family, Winnebago Skeletons]
 *
 * Purpose:   Return a string indicating which vector implementation is
 *            chosen by our dispatchers, assuming they follow our
 *            standard pattern: test compiled-in implementations from
 *            widest/fastest vectors down, and take the first one the
 *            running processor supports.
 *
 * Returns:   a static constant string ("AVX512", "AVX", "SSE4",
 *            "SSE", "NEON", or "none"); caller does not free it.
 */
char *
esl_cpu_Get(void)
{
#ifdef eslENABLE_AVX512  // Fastest first.
  if (esl_cpu_has_avx512()) return "AVX512";
#endif
#ifdef eslENABLE_AVX
  if (esl_cpu_has_avx())    return "AVX";
#endif
#ifdef eslENABLE_SSE4
  if (esl_cpu_has_sse4())   return "SSE4";
#endif
#ifdef eslENABLE_SSE
  if (esl_cpu_has_sse())    return "SSE";
#endif
#ifdef eslENABLE_NEON
  return "NEON";   // ARM NEON needs no runtime check here.
#endif
//#ifdef eslENABLE_VMX
//  return "VMX";
//#endif
  return "none";   // no vector implementation compiled in (or none supported)
}
180 /*---------- end, API for x86 vector instruction checks ---------*/
181 
182 
183 
184 /*****************************************************************
185  * 2. Internal code used in x86 vector code checks
186  *****************************************************************/
187 
188 #if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
/* cpu_run_id()
 *
 * Execute the x86 `cpuid` instruction. Bit flags in the EAX (and
 * maybe ECX) registers specify the information you want to query from
 * the x86 processor ("leaf" and "subleaf"). The cpuid opcode returns
 * results by setting bits in the EAX, EBX, ECX, EDX registers, which
 * we return in abcd[0..3], respectively.
 *
 * [What all the bits mean](https://en.wikipedia.org/wiki/CPUID)
 *
 * Adapted from run_cpuid() in:
 * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
 */
static void
cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd)
{
#if defined(_MSC_VER)
  __cpuidex(abcd, eax, ecx);   /* MSVC intrinsic fills abcd[0..3] with EAX,EBX,ECX,EDX */
#else
  uint32_t ebx = 0;
  uint32_t edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )   /* in case of PIC under 32-bit EBX cannot be clobbered */
  __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx) );
#else
  __asm__ ( "cpuid" : "+b" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx) );
#endif
  abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif // ! _MSC_VER
}
217 #endif // eslENABLE_SSE | eslENABLE_SSE4 | eslENABLE_AVX | eslENABLE_AVX512
218 
219 
220 
221 #ifdef eslENABLE_AVX
/* cpu_check_xcr0_ymm()
 *
 * Check for OS support of AVX. AVX uses the YMM registers, and the
 * operating system must support saving YMM state on a context switch.
 * The check depends on the `xgetbv` instruction (reading extended
 * control register XCR0) on x86 processors.
 *
 * xgetbv's result has set:
 *   bits 7<<5 = zmm (AVX-512)
 *   bit  1<<2 = ymm (AVX)
 *   bit  1<<1 = xmm
 *
 * Some Mac OS/X assemblers do not recognize the xgetbv instruction,
 * but you can still emit the raw byte codes for it. So instead of
 *   __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
 * we have
 *   __asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );
 *
 * Returns 1 if the OS saves/restores both XMM and YMM state; else 0.
 */
static int
cpu_check_xcr0_ymm(void)
{
  uint32_t xcr0;
  uint32_t ymm_xmm = (1 << 2) | (1 << 1);   /* XCR0: YMM state | XMM state */
#if defined(_MSC_VER)
  xcr0 = (uint32_t)_xgetbv(0);  /* min VS2010 SP1 compiler is required */
#else
  __asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );  /* xgetbv(0), as raw opcode bytes */
#endif
  return ((xcr0 & ymm_xmm) == ymm_xmm);     /* both bits must be set */
}
251 #endif
252 
253 
254 #ifdef eslENABLE_AVX512
/* cpu_check_xcr0_zmm()
 *
 * Similarly, check for OS support of AVX-512, which uses ZMM and YMM
 * registers. The OS must save/restore opmask and ZMM state (XCR0
 * bits 5-7) as well as YMM and XMM state on a context switch.
 *
 * Returns 1 if XMM, YMM, and all ZMM-related state are OS-enabled; else 0.
 */
static int
cpu_check_xcr0_zmm(void)
{
  uint32_t xcr0;
  uint32_t zmm_ymm_xmm = (7 << 5) | (1 << 2) | (1 << 1);  /* XCR0: opmask|ZMM_Hi256|Hi16_ZMM, YMM, XMM */
#if defined(_MSC_VER)
  xcr0 = (uint32_t)_xgetbv(0);  /* min VS2010 SP1 compiler is required */
#else
  __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );  /* xgetbv(0), as raw opcode bytes */
#endif
  return ((xcr0 & zmm_ymm_xmm) == zmm_ymm_xmm);  /* all required bits must be set */
}
271 #endif
272 
273 
274 #ifdef eslENABLE_SSE
275 /* cpu_has_sse()
276  *
277  * Test whether processor supports SSE/SSE2 instructions.
278  * Note that Easel's "SSE" vector code means SSE+SSE2.
279  */
280 static int
cpu_has_sse(void)281 cpu_has_sse(void)
282 {
283   uint32_t abcd[4];
284   uint32_t sse2_mask =  (1 << 25) |  // edx: SSE
285                         (1 << 26);   //      SSE2
286 
287   cpu_run_id( 1, 0, abcd );
288   if ( (abcd[3] & sse2_mask)  != sse2_mask)  // edx check
289     return 0;
290   return 1;
291 }
292 #endif // eslENABLE_SSE
293 
294 
295 #ifdef eslENABLE_SSE4
296 /* cpu_has_sse4()
297  *
298  * Test whether processor supports SSE/SSE2/SSE4.1 instructions.
299  * Note that Easel's "SSE4" vector code means SSE+SSE2+SSE4.1.
300  */
301 static int
cpu_has_sse4(void)302 cpu_has_sse4(void)
303 {
304   uint32_t abcd[4];
305   uint32_t sse2_mask =  (1 << 25) |  // edx: SSE
306                         (1 << 26);   //      SSE2
307   uint32_t sse41_mask = (1 << 19);   // ecx: SSE4.1
308 
309   cpu_run_id( 1, 0, abcd );
310   if ( (abcd[3] & sse2_mask)  != sse2_mask || // edx check
311        (abcd[2] & sse41_mask) != sse41_mask)  // ecx check
312     return 0;
313   return 1;
314 }
315 #endif // eslENABLE_SSE4
316 
317 
318 
319 #ifdef eslENABLE_AVX
320 /* cpu_has_avx
321  *
322  * Test whether processor supports AVX/AVX2 instructions.
323  * Easel "AVX" vector code requires AVX+AVX2.
324  */
325 static int
cpu_has_avx(void)326 cpu_has_avx(void)
327 {
328   uint32_t abcd[4];
329   uint32_t fma_movbe_osxsave_mask = ((1 << 12) | (1 << 22) | (1 << 27));
330   uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
331 
332   /* CPUID.(EAX=01H, ECX=0H):ECX.FMA[bit 12]==1   &&
333      CPUID.(EAX=01H, ECX=0H):ECX.MOVBE[bit 22]==1 &&
334      CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 */
335   cpu_run_id( 1, 0, abcd );
336   if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
337     return 0;
338 
339   if ( ! cpu_check_xcr0_ymm() )
340     return 0;
341 
342   /*  CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1  &&
343       CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1  &&
344       CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1  */
345   cpu_run_id( 7, 0, abcd );
346   if ( (abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask )
347     return 0;
348 
349   /* CPUID.(EAX=80000001H):ECX.LZCNT[bit 5]==1 */
350   cpu_run_id( 0x80000001, 0, abcd );
351   if ( (abcd[2] & (1 << 5)) == 0)
352     return 0;
353 
354   return 1;
355 }
356 #endif // eslENABLE_AVX
357 
358 
359 #ifdef eslENABLE_AVX512
360 /* cpu_has_avx512()
361  *
362  * Test whether processors supports AVX-512.  Our AVX-512 code
363  * currently can depend on Foundation, Double/Quadword, and Byte/Word
364  * subsets (F, DQ, BW), and requires Intel Skylake Xeon (Purley)
365  * processors or later.
366  */
367 static int
cpu_has_avx512(void)368 cpu_has_avx512(void)
369 {
370   uint32_t abcd[4];
371   uint32_t osxsave_mask = (1 << 27);
372   uint32_t knl_mask     = (1 << 16) | // AVX-512F
373                           (1 << 17) | // AVX-512DQ
374                           (1 << 30);  // AVX-512BW
375 
376   cpu_run_id( 1, 0, abcd );
377   if ( (abcd[2] & osxsave_mask) != osxsave_mask )
378     return 0;
379 
380   if ( ! cpu_check_xcr0_zmm() )
381     return 0;
382 
383   cpu_run_id( 7, 0, abcd );
384   if ( (abcd[1] & knl_mask) != knl_mask )
385     return 0;
386 
387   return 1;
388 }
389 #endif // eslENABLE_AVX512
390 
391 
392 /*------------ end, x86 processor interrogation -----------------*/
393 
394 
395 
396 
397 /*****************************************************************
398  * 3. Unit tests
399  *****************************************************************/
400 
401 #ifdef eslCPU_TESTDRIVE
402 
403 
/* utest_consistency()
 *
 * Capability flags must be monotonic: if we support AVX-512, we must
 * support AVX; if we support AVX, we must support SSE4; if we support
 * SSE4, we must support SSE. This isn't a strong test of anything,
 * but since we don't know anything about the processor we're running
 * unit testing on, it's hard to guarantee any stronger test.
 *
 * #ifdef's are required, because Easel applications are allowed
 * to define any subset of vector implementations they want;
 * each implication is only testable when both of its implementations
 * are compiled in. (For example, H4 implements SSE4 but not SSE.)
 */
static void
utest_consistency(void)
{
  // it's possible that none of the `#if defined` blocks are used, so
  // don't put a char msg[] here, or compiler could bark about it being unused.
#if defined (eslENABLE_AVX512) && defined (eslENABLE_AVX)
  if (esl_cpu_has_avx512() && ! esl_cpu_has_avx())  esl_fatal("utest_consistency() failed");
#endif
#if defined (eslENABLE_AVX) && defined (eslENABLE_SSE4)
  if (esl_cpu_has_avx()    && ! esl_cpu_has_sse4()) esl_fatal("utest_consistency() failed");
#endif
#if defined (eslENABLE_SSE4) && defined (eslENABLE_SSE)
  if (esl_cpu_has_sse4()   && ! esl_cpu_has_sse())  esl_fatal("utest_consistency() failed");
#endif
}
430 
431 #endif // eslCPU_TESTDRIVE
432 
433 
434 /*****************************************************************
435  * 4. Test driver
436  *****************************************************************/
437 #ifdef eslCPU_TESTDRIVE
438 
/* Test driver: runs the unit tests and reports pass/fail on stderr.
 * Exits with eslOK (0) on success; esl_fatal() aborts on any failure.
 */
int
main(int argc, char **argv)
{
  fprintf(stderr, "## %s\n", argv[0]);

  utest_consistency();

  fprintf(stderr, "#  status = ok\n");
  return eslOK;
}
449 #endif // eslCPU_TESTDRIVE
450 
451 
452 /*****************************************************************
453  * 5. Example
454  *****************************************************************/
455 #ifdef eslCPU_EXAMPLE
456 
457 #include "esl_config.h"
458 
459 #include "easel.h"
460 #include "esl_cpu.h"
461 
/* Example driver: report which vector ISAs this processor supports,
 * and which implementation our dispatchers would pick.
 */
int
main(int argc, char **argv)
{
  const char *yn[2] = { "no", "yes" };   /* indexed by support flag */

  printf("your cpu supports our SSE code    : %s\n",  yn[esl_cpu_has_sse()    != 0]);
  printf("               ...our SSE4 code   : %s\n",  yn[esl_cpu_has_sse4()   != 0]);
  printf("               ...our AVX code    : %s\n",  yn[esl_cpu_has_avx()    != 0]);
  printf("               ...our AVX512 code : %s\n",  yn[esl_cpu_has_avx512() != 0]);
  printf("Our dispatchers will choose       : %s\n",  esl_cpu_Get());
  return 0;
}
471 #endif // eslCPU_EXAMPLE
472