1 /* Runtime detection of optional processor characteristics.
2 *
3 * Contents:
4 * 1. Checking for support of x86 vector code
5 * 2. Internal code used in those checks
6 * 3. Unit tests
7 * 4. Test driver
8 * 5. Example
9 *
10 * References:
11 * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
12 * https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
13 * https://en.wikipedia.org/wiki/CPUID
14 */
15 #include "esl_config.h"
16
17 #include <stdlib.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #if defined(_MSC_VER)
21 #include <intrin.h>
22 #endif
23
24 #include "easel.h"
25 #include "esl_cpu.h"
26
27 /* declarations of static functions that come in section (2) */
28 #if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
29 static void cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd);
30 #endif
31 #ifdef eslENABLE_SSE
32 static int cpu_has_sse(void);
33 #endif
34 #ifdef eslENABLE_SSE4
35 static int cpu_has_sse4(void);
36 #endif
37 #ifdef eslENABLE_AVX
38 static int cpu_check_xcr0_ymm(void);
39 static int cpu_has_avx(void);
40 #endif
41 #ifdef eslENABLE_AVX512
42 static int cpu_check_xcr0_zmm(void);
43 static int cpu_has_avx512(void);
44 #endif
45
46 /*****************************************************************
47 * 1. Checking for support of x86 vector code
48 *****************************************************************/
49
/* Function: esl_cpu_has_sse()
 * Synopsis: Check if processor supports x86 SSE/SSE2
 * Incept:   SRE, Wed Feb  1 09:19:11 2017
 *
 * Purpose:  Returns TRUE if an SSE vector implementation is compiled
 *           in AND the processor we're running on supports it
 *           (i.e. has both SSE and SSE2). Else returns FALSE.
 *
 * Note:     The cached static flag is thread-safe even without
 *           locking: it only ever moves from the unset state (-1) to
 *           one fixed answer (0|1). The worst a race can do is have
 *           two threads store the same answer twice.
 */
int
esl_cpu_has_sse(void)
{
#ifdef eslENABLE_SSE
  static int cached = -1;      // -1 = not probed yet; else 0|1, fixed forever
  if (cached == -1) cached = cpu_has_sse();
  return cached;
#else
  return 0;                    // no SSE implementation compiled in
#endif
}
77
78
/* Function: esl_cpu_has_sse4()
 * Synopsis: Check if processor supports x86 <= SSE4.1
 * Incept:   SRE, Wed Jun  6 11:49:46 2018 [OdjBox, Otto Croy]
 *
 * Purpose:  Returns TRUE if an SSE4 vector implementation is compiled
 *           in AND the processor supports it (i.e. has
 *           SSE+SSE2+SSE4.1). Else returns FALSE.
 *
 * Note:     Cached static flag; same benign-race reasoning as
 *           esl_cpu_has_sse().
 */
int
esl_cpu_has_sse4(void)
{
#ifdef eslENABLE_SSE4
  static int cached = -1;      // -1 = not probed yet; else 0|1, fixed forever
  if (cached == -1) cached = cpu_has_sse4();
  return cached;
#else
  return 0;                    // no SSE4 implementation compiled in
#endif
}
100
101
102
/* Function: esl_cpu_has_avx()
 * Synopsis: Check if processor supports x86 AVX/AVX2.
 * Incept:   SRE, Wed Feb  1 09:46:36 2017
 *
 * Purpose:  Returns TRUE if an AVX vector implementation is compiled
 *           in AND the processor (and OS) supports it (i.e. has
 *           AVX+AVX2). Else returns FALSE.
 *
 * Note:     Cached static flag; same benign-race reasoning as
 *           esl_cpu_has_sse().
 */
int
esl_cpu_has_avx(void)
{
#ifdef eslENABLE_AVX
  static int cached = -1;      // -1 = not probed yet; else 0|1, fixed forever
  if (cached == -1) cached = cpu_has_avx();
  return cached;
#else
  return 0;                    // no AVX implementation compiled in
#endif
}
124
/* Function: esl_cpu_has_avx512()
 * Synopsis: Check if processor supports x86 AVX-512.
 * Incept:   SRE, Wed Feb  1 09:47:24 2017
 *
 * Purpose:  Returns TRUE if an AVX-512 vector implementation is
 *           compiled in AND the processor (and OS) supports the
 *           subsets we require. Else returns FALSE.
 *
 * Note:     Cached static flag; same benign-race reasoning as
 *           esl_cpu_has_sse().
 */
int
esl_cpu_has_avx512(void)
{
#ifdef eslENABLE_AVX512
  static int cached = -1;      // -1 = not probed yet; else 0|1, fixed forever
  if (cached == -1) cached = cpu_has_avx512();
  return cached;
#else
  return 0;                    // no AVX-512 implementation compiled in
#endif
}
146
147
148
/* Function: esl_cpu_Get()
 * Synopsis: Returns a string showing which implementation our dispatchers choose.
 * Incept:   SRE, Tue May 23 12:30:37 2017 [Handsome Family, Winnebago Skeletons]
 *
 * Purpose:  Return a string naming the vector implementation that an
 *           Easel dispatcher following our standard pattern will
 *           select on this machine: the fastest compiled-in
 *           implementation that the processor supports.
 *
 * Returns:  one of "AVX512", "AVX", "SSE4", "SSE", "NEON", or "none".
 *           The string is a constant; caller must not modify or free it.
 */
char *
esl_cpu_Get(void)
{
  /* Ordered fastest-first, mirroring dispatcher preference;
   * the first supported implementation wins.
   */
#ifdef eslENABLE_AVX512
  if (esl_cpu_has_avx512()) return "AVX512";
#endif
#ifdef eslENABLE_AVX
  if (esl_cpu_has_avx())    return "AVX";
#endif
#ifdef eslENABLE_SSE4
  if (esl_cpu_has_sse4())   return "SSE4";
#endif
#ifdef eslENABLE_SSE
  if (esl_cpu_has_sse())    return "SSE";
#endif
#ifdef eslENABLE_NEON
  return "NEON";   // NEON is selected at compile time; no runtime probe here
#endif
  //#ifdef eslENABLE_VMX   // placeholder: Altivec/VMX not supported yet
  //  return "VMX";
  //#endif
  return "none";
}
180 /*---------- end, API for x86 vector instruction checks ---------*/
181
182
183
184 /*****************************************************************
185 * 2. Internal code used in x86 vector code checks
186 *****************************************************************/
187
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
/* cpu_run_id()
 *
 * Execute the x86 `cpuid` instruction for a given leaf/subleaf.
 *
 * Bit flags in EAX (the leaf) and maybe ECX (the subleaf) specify the
 * information you want to query from the x86 processor. The cpuid
 * opcode returns results by setting bits in the EAX, EBX, ECX, EDX
 * registers, which we return in abcd[0..3], respectively.
 *
 * abcd[] must point to at least 4 writable uint32_t's; all four are
 * always written.
 *
 * [What all the bits mean](https://en.wikipedia.org/wiki/CPUID)
 *
 * Adapted from run_cpuid() in:
 * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
 */
static void
cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd)
{
#if defined(_MSC_VER)
  /* MSVC has no GCC-style inline asm; use its intrinsic instead. */
  __cpuidex(abcd, eax, ecx);
#else
  uint32_t ebx = 0;
  uint32_t edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ ) /* in case of PIC under 32-bit EBX cannot be clobbered */
  /* 32-bit PIC reserves EBX as the GOT pointer, so it can't appear in
   * the clobber/output list directly: stash it in EDI around the cpuid.
   */
  __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx) );
#else
  __asm__ ( "cpuid" : "+b" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx) );
#endif
  /* cpuid left its results in the registers bound above; copy them out. */
  abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif // ! _MSC_VER
}
#endif // eslENABLE_SSE | eslENABLE_SSE4 | eslENABLE_AVX | eslENABLE_AVX512
218
219
220
#ifdef eslENABLE_AVX
/* cpu_check_xcr0_ymm()
 *
 * Check for OS support of AVX. AVX uses the YMM registers, and the
 * operating system must support saving YMM state on a context switch.
 * The check reads the XCR0 extended control register, via the
 * `xgetbv` instruction (ECX=0 selects XCR0).
 *
 * xgetbv's result has set:
 *    bits 7<<5 = zmm  (AVX-512 state: opmask, ZMM_Hi256, Hi16_ZMM)
 *    bit  1<<2 = ymm  (AVX)
 *    bit  1<<1 = xmm  (SSE)
 *
 * Returns TRUE iff the OS saves both YMM and XMM state.
 *
 * Some Mac OS/X assemblers do not recognize the xgetbv instruction,
 * but you can still emit the raw byte codes for it. So instead of
 *     __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
 * we have
 *     __asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );
 */
static int
cpu_check_xcr0_ymm(void)
{
  uint32_t xcr0;
  uint32_t ymm_xmm = (1 << 2) | (1 << 1);   // both YMM and XMM state bits must be set
#if defined(_MSC_VER)
  xcr0 = (uint32_t)_xgetbv(0);  /* min VS2010 SP1 compiler is required */
#else
  /* 0x0f 0x01 0xd0 is the raw encoding of `xgetbv`; low 32 bits of
   * XCR0 land in EAX (bound to xcr0), high bits in EDX (clobbered). */
  __asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif
  return ((xcr0 & ymm_xmm) == ymm_xmm);
}
#endif
252
253
#ifdef eslENABLE_AVX512
/* cpu_check_xcr0_zmm()
 *
 * Similarly, check for OS support of AVX-512, which uses ZMM and YMM
 * registers: read XCR0 via `xgetbv` and require the ZMM state bits
 * (7<<5: opmask, ZMM_Hi256, Hi16_ZMM) as well as YMM and XMM.
 * See cpu_check_xcr0_ymm() above for why the raw opcode bytes are
 * emitted instead of the `xgetbv` mnemonic.
 */
static int
cpu_check_xcr0_zmm(void)
{
  uint32_t xcr0;
  uint32_t zmm_ymm_xmm = (7 << 5) | (1 << 2) | (1 << 1);   // all of ZMM+YMM+XMM state
#if defined(_MSC_VER)
  xcr0 = (uint32_t)_xgetbv(0);  /* min VS2010 SP1 compiler is required */
#else
  /* raw encoding of `xgetbv`; EAX (low 32 bits of XCR0) -> xcr0, EDX clobbered */
  __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif
  return ((xcr0 & zmm_ymm_xmm) == zmm_ymm_xmm);
}
#endif
272
273
#ifdef eslENABLE_SSE
/* cpu_has_sse()
 *
 * Probe the processor for SSE/SSE2 instruction support.
 * Note that Easel's "SSE" vector code means SSE+SSE2, so both
 * feature bits are required.
 */
static int
cpu_has_sse(void)
{
  uint32_t       regs[4];                                // EAX,EBX,ECX,EDX after cpuid
  const uint32_t edx_need = (1u << 25) | (1u << 26);     // EDX: SSE (bit 25), SSE2 (bit 26)

  cpu_run_id( 1, 0, regs );                              // leaf 1: feature flags
  return ((regs[3] & edx_need) == edx_need) ? 1 : 0;
}
#endif // eslENABLE_SSE
293
294
#ifdef eslENABLE_SSE4
/* cpu_has_sse4()
 *
 * Probe the processor for SSE/SSE2/SSE4.1 instruction support.
 * Note that Easel's "SSE4" vector code means SSE+SSE2+SSE4.1, so
 * all three feature bits are required.
 */
static int
cpu_has_sse4(void)
{
  uint32_t       regs[4];                                // EAX,EBX,ECX,EDX after cpuid
  const uint32_t edx_need = (1u << 25) | (1u << 26);     // EDX: SSE (bit 25), SSE2 (bit 26)
  const uint32_t ecx_need = (1u << 19);                  // ECX: SSE4.1 (bit 19)

  cpu_run_id( 1, 0, regs );                              // leaf 1: feature flags
  if ((regs[3] & edx_need) != edx_need) return 0;
  if ((regs[2] & ecx_need) != ecx_need) return 0;
  return 1;
}
#endif // eslENABLE_SSE4
316
317
318
#ifdef eslENABLE_AVX
/* cpu_has_avx
 *
 * Probe processor and OS for everything Easel "AVX" vector code
 * requires: AVX+AVX2, plus FMA/MOVBE/BMI1/BMI2/LZCNT, plus OS
 * support for saving YMM state.
 */
static int
cpu_has_avx(void)
{
  uint32_t       regs[4];                // EAX,EBX,ECX,EDX after cpuid
  const uint32_t ecx1_need = (1u << 12)  // leaf 1 ECX: FMA
                           | (1u << 22)  //             MOVBE
                           | (1u << 27); //             OSXSAVE (OS uses XSAVE)
  const uint32_t ebx7_need = (1u << 3)   // leaf 7 EBX: BMI1
                           | (1u << 5)   //             AVX2
                           | (1u << 8);  //             BMI2

  /* CPUID.(EAX=01H, ECX=0H):ECX.FMA[bit 12]==1   &&
     CPUID.(EAX=01H, ECX=0H):ECX.MOVBE[bit 22]==1 &&
     CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 */
  cpu_run_id( 1, 0, regs );
  if ((regs[2] & ecx1_need) != ecx1_need) return 0;

  /* OS must save/restore YMM register state on context switch. */
  if (! cpu_check_xcr0_ymm()) return 0;

  /* CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
     CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
     CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
  cpu_run_id( 7, 0, regs );
  if ((regs[1] & ebx7_need) != ebx7_need) return 0;

  /* CPUID.(EAX=80000001H):ECX.LZCNT[bit 5]==1 */
  cpu_run_id( 0x80000001, 0, regs );
  return (regs[2] & (1u << 5)) ? 1 : 0;
}
#endif // eslENABLE_AVX
357
358
#ifdef eslENABLE_AVX512
/* cpu_has_avx512()
 *
 * Test whether processor supports AVX-512. Our AVX-512 code
 * currently can depend on Foundation, Double/Quadword, and Byte/Word
 * subsets (F, DQ, BW), and requires Intel Skylake Xeon (Purley)
 * processors or later. Note that this set is NOT supported by
 * Knights Landing (KNL), which has F but lacks DQ and BW.
 */
static int
cpu_has_avx512(void)
{
  uint32_t abcd[4];
  uint32_t osxsave_mask = (1 << 27);  // leaf 1 ECX: OSXSAVE, OS uses XSAVE/XRSTOR
  uint32_t skx_mask     = (1 << 16) | // leaf 7 EBX: AVX-512F
                          (1 << 17) | //             AVX-512DQ
                          (1 << 30);  //             AVX-512BW
  // ^ renamed from "knl_mask": F+DQ+BW is the Skylake-Xeon (SKX)
  //   feature set, not Knights Landing's.

  /* First: does the OS use XSAVE/XRSTOR to manage extended state at all? */
  cpu_run_id( 1, 0, abcd );
  if ( (abcd[2] & osxsave_mask) != osxsave_mask )
    return 0;

  /* Second: does the OS specifically save ZMM (and YMM/XMM) state? */
  if ( ! cpu_check_xcr0_zmm() )
    return 0;

  /* Third: does the processor implement the F, DQ, BW subsets? */
  cpu_run_id( 7, 0, abcd );
  if ( (abcd[1] & skx_mask) != skx_mask )
    return 0;

  return 1;
}
#endif // eslENABLE_AVX512
390
391
392 /*------------ end, x86 processor interrogation -----------------*/
393
394
395
396
397 /*****************************************************************
398 * 3. Unit tests
399 *****************************************************************/
400
#ifdef eslCPU_TESTDRIVE


/* utest_consistency()
 *
 * Sanity check on the detection lattice: AVX-512 support implies AVX
 * support, which implies SSE4, which implies SSE. Since we don't know
 * what processor the unit tests run on, this implication chain is
 * about the strongest portable check we can make.
 *
 * Each comparison is #ifdef-guarded because Easel applications are
 * allowed to compile in any subset of the vector implementations;
 * for example, H4 implements SSE4 but not SSE.
 */
static void
utest_consistency(void)
{
  /* No shared char msg[] here on purpose: if none of the `#if defined`
   * blocks below are compiled in, it would be flagged as unused.
   */
#if defined (eslENABLE_AVX512) && defined (eslENABLE_AVX)
  if (esl_cpu_has_avx512())
    { if (! esl_cpu_has_avx())  esl_fatal("utest_consistency() failed"); }
#endif
#if defined (eslENABLE_AVX) && defined (eslENABLE_SSE4)
  if (esl_cpu_has_avx())
    { if (! esl_cpu_has_sse4()) esl_fatal("utest_consistency() failed"); }
#endif
#if defined (eslENABLE_SSE4) && defined (eslENABLE_SSE)
  if (esl_cpu_has_sse4())
    { if (! esl_cpu_has_sse())  esl_fatal("utest_consistency() failed"); }
#endif
}

#endif // eslCPU_TESTDRIVE
432
433
434 /*****************************************************************
435 * 4. Test driver
436 *****************************************************************/
#ifdef eslCPU_TESTDRIVE

/* Test driver.
 * Follows the standard Easel test-driver convention: announce the
 * program name on stderr, run the unit tests (esl_fatal() exits
 * nonzero on failure), then report "status = ok".
 */
int
main(int argc, char **argv)
{
  const char *progname = argv[0];

  fprintf(stderr, "## %s\n", progname);

  utest_consistency();

  fprintf(stderr, "# status = ok\n");
  return eslOK;
}
#endif // eslCPU_TESTDRIVE
450
451
452 /*****************************************************************
453 * 5. Example
454 *****************************************************************/
#ifdef eslCPU_EXAMPLE

#include "esl_config.h"

#include "easel.h"
#include "esl_cpu.h"

/* Example driver: report which of our compiled-in vector
 * implementations this processor supports, and which one a standard
 * Easel dispatcher will choose.
 */
int
main(int argc, char **argv)
{
  (void) argc;   // unused; kept for the conventional main() signature
  (void) argv;

  printf("your cpu supports our SSE code : %s\n", esl_cpu_has_sse()    ? "yes" : "no");
  printf(" ...our SSE4 code : %s\n",              esl_cpu_has_sse4()   ? "yes" : "no");
  printf(" ...our AVX code : %s\n",               esl_cpu_has_avx()    ? "yes" : "no");
  printf(" ...our AVX512 code : %s\n",            esl_cpu_has_avx512() ? "yes" : "no");
  printf("Our dispatchers will choose : %s\n",    esl_cpu_Get());
  return 0;      // explicit success status (was an implicit fall-off-the-end)
}
#endif // eslCPU_EXAMPLE
472