1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include "common.h"
40
41 #ifdef _MSC_VER
42 #define strncasecmp _strnicmp
43 #define strcasecmp _stricmp
44 #endif
45
/*
 * EXTERN prefixes the declarations of gotoblas_KATMAI, gotoblas_COPPERMINE,
 * gotoblas_NORTHWOOD, gotoblas_BANIAS and gotoblas_ATHLON in the
 * non-DYNAMIC_LIST configuration.  With ARCH_X86 it expands to `extern`;
 * otherwise it expands to nothing, so those lines become tentative
 * definitions in this file instead of references to objects defined elsewhere.
 * NOTE(review): presumably those kernel tables are only built for 32-bit
 * targets and the placeholders keep the names resolvable -- confirm.
 */
#ifdef ARCH_X86
#define EXTERN extern
#else
#define EXTERN
#endif
51
52 #ifdef DYNAMIC_LIST
/*
 * Kernel tables for a build restricted to an explicit list of cores.
 * gotoblas_PRESCOTT is always declared.  For every other core, the table is
 * declared only when the matching DYN_<CORE> macro is defined; otherwise the
 * name is aliased with #define to a table that is available (PRESCOTT unless
 * a closer alternative is listed in an #elif).
 */
extern gotoblas_t gotoblas_PRESCOTT;

#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
/* BARCELONA prefers NEHALEM over PRESCOTT when NEHALEM is in the list. */
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
/*
 * ATOM: use the dedicated kernel table when it is part of the build,
 * otherwise alias it to NEHALEM if that is available, else to PRESCOTT.
 * (The middle branch must be a real `#elif` directive; without the leading
 * `#` it is not a directive and the NEHALEM alias is never chosen correctly.)
 */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
/*
 * Same scheme as above for further pre-AVX cores: declare the table when
 * DYN_<CORE> is defined, otherwise alias the name to gotoblas_PRESCOTT
 * (BOBCAT additionally prefers NEHALEM when DYN_NEHALEM is defined).
 */
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
/*
 * Cores from SANDYBRIDGE onwards.  Each name resolves to its own table when
 * DYN_<CORE> is defined; otherwise the #elif chain walks down through the
 * listed alternatives in order (COOPERLAKE -> SKYLAKEX -> HASWELL ->
 * SANDYBRIDGE -> NEHALEM where applicable) and finally to PRESCOTT.
 */
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#ifdef DYN_COOPERLAKE
extern gotoblas_t gotoblas_COOPERLAKE;
#elif defined(DYN_SKYLAKEX)
#define gotoblas_COOPERLAKE gotoblas_SKYLAKEX
#elif defined(DYN_HASWELL)
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#else
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
#endif
223
224
225 #else // not DYNAMIC_LIST
/*
 * Kernel tables when no explicit core list is requested.
 * The five EXTERN entries are `extern` only when ARCH_X86 is defined (see the
 * EXTERN macro); PRESCOTT, CORE2, NEHALEM and BARCELONA are always external.
 */
EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD;
EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON;

extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
/* Older cores get their own tables only with DYNAMIC_OLDER; otherwise they
   are aliases of NEHALEM or CORE2. */
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif

/* AVX-capable cores: available unless NO_AVX; the AVX2 and AVX512 tables are
   further gated by NO_AVX2 and NO_AVX512 and alias downwards when disabled. */
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
extern gotoblas_t gotoblas_COOPERLAKE;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
288
289 #endif // DYNAMIC_LIST
290
/* Vendor codes returned by get_vendor(). */
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_ZHAOXIN 5
#define VENDOR_UNKNOWN 99

/* Extract a bit field: shift `a` right by `b` bits, then keep the bits set in mask `c`. */
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
299
#ifndef NO_AVX
/*
 * Execute the XGETBV instruction.
 *   op  - value loaded into ECX before the instruction (the callers in this
 *         file pass 0).
 *   eax - receives the value left in EAX.
 *   edx - receives the value left in EDX.
 * The instruction is emitted as raw opcode bytes rather than by mnemonic
 * (presumably so assemblers that do not know `xgetbv` can still build this).
 */
static inline void xgetbv(int op, int * eax, int * edx){
  //Use binary code for xgetbv
  __asm__ __volatile__
    (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
307
/*
 * Report whether AVX kernels may be used.
 * Returns 1 only if CPUID leaf 1 has ECX bits 26, 27 and 28 all set
 * (XSAVE, OSXSAVE and AVX) and XGETBV(0) shows bits 1 and 2 enabled,
 * i.e. the OS saves the SSE and AVX register state.  Returns 0 otherwise,
 * and always 0 when built with NO_AVX.
 */
int support_avx(){
#ifdef NO_AVX
  return 0;
#else
  const int needed = (1 << 28) | (1 << 27) | (1 << 26);
  int a, b, c, d;

  cpuid(1, &a, &b, &c, &d);
  if ((c & needed) != needed)
    return 0;                 // CPU lacks one of the required feature bits

  xgetbv(0, &a, &d);
  return ((a & 6) == 6);      // 1 when the OS supports AVX state
#endif
}
325
/*
 * Report whether AVX2 kernels may be used.
 * Requires support_avx() to succeed and bit 5 of EBX from CPUID leaf 7
 * (the AVX2 flag) to be set.  Always 0 when built with NO_AVX2.
 */
int support_avx2(){
#ifdef NO_AVX2
  return 0;
#else
  int a, b, c = 0, d;

  if (!support_avx())
    return 0;

  cpuid(7, &a, &b, &c, &d);
  return ((b & (1<<5)) != 0) ? 1 : 0;   // AVX2 flag
#endif
}
341
/*
 * Report whether AVX512 kernels may be used.
 * Returns 1 only when all of the following hold:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 EBX bit 5 (AVX2) is set,
 *   - CPUID leaf 7 EBX bit 31 (AVX512VL) is set,
 *   - XGETBV(0) has bits 5..7 set, i.e. the OS saves the zmm register state.
 * Returns 0 otherwise, and always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  /* ecx is zeroed before the leaf-7 query, as support_avx2() does. */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  if((ebx & (1<<5)) == 0)
    return 0;  //cpu does not have avx2 flag
  /* unsigned constant: shifting a signed 1 into bit 31 is undefined */
  if((ebx & (1u<<31)) == 0)
    return 0;  //AVX512VL flag is not set
  xgetbv(0, &eax, &edx);
  if((eax & 0xe0) == 0xe0)
    return 1;  //OS supports saving zmm register
  return 0;
#else
  return 0;
#endif
}
363
/*
 * Report whether AVX512 BF16 kernels may be used.
 * Requires support_avx512() to succeed and CPUID.7.1:EAX bit 5 to be set.
 * Always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512_bf16(){
#if defined(NO_AVX) || defined(NO_AVX512)
  return 0;
#else
  int a, b, c, d;

  if (!support_avx512())
    return 0;

  cpuid_count(7, 1, &a, &b, &c, &d);
  // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not
  return ((a & 32) == 32) ? 1 : 0;
#endif
}
380
/* Diagnostic output routine defined elsewhere; takes a verbosity level and a message. */
extern void openblas_warning(int verbose, const char * msg);
/* Verbosity level passed to openblas_warning() for all kernel-fallback notices. */
#define FALLBACK_VERBOSE 1
/* Messages emitted by get_coretype() when it selects an older kernel set than the CPU model would suggest. */
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
387
get_vendor(void)388 static int get_vendor(void){
389 int eax, ebx, ecx, edx;
390
391 union
392 {
393 char vchar[16];
394 int vint[4];
395 } vendor;
396
397 cpuid(0, &eax, &ebx, &ecx, &edx);
398
399 *(&vendor.vint[0]) = ebx;
400 *(&vendor.vint[1]) = edx;
401 *(&vendor.vint[2]) = ecx;
402
403 vendor.vchar[12] = '\0';
404
405 if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
406 if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
407 if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
408 if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN;
409 if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
410
411 if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
412
413 return VENDOR_UNKNOWN;
414 }
415
get_coretype(void)416 static gotoblas_t *get_coretype(void){
417
418 int eax, ebx, ecx, edx;
419 int family, exfamily, model, vendor, exmodel, stepping;
420
421 cpuid(1, &eax, &ebx, &ecx, &edx);
422
423 family = BITMASK(eax, 8, 0x0f);
424 exfamily = BITMASK(eax, 20, 0xff);
425 model = BITMASK(eax, 4, 0x0f);
426 exmodel = BITMASK(eax, 16, 0x0f);
427 stepping = BITMASK(eax, 0, 0x0f);
428
429 vendor = get_vendor();
430
431 if (vendor == VENDOR_INTEL){
432 switch (family) {
433 case 0x6:
434 switch (exmodel) {
435 case 0:
436 if (model <= 0x7) return &gotoblas_KATMAI;
437 if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
438 if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
439 if (model == 14) return &gotoblas_BANIAS;
440 if (model == 15) return &gotoblas_CORE2;
441 return NULL;
442
443 case 1:
444 if (model == 6) return &gotoblas_CORE2;
445 if (model == 7) return &gotoblas_PENRYN;
446 if (model == 13) return &gotoblas_DUNNINGTON;
447 if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
448 if (model == 12) return &gotoblas_ATOM;
449 return NULL;
450
451 case 2:
452 //Intel Core (Clarkdale) / Core (Arrandale)
453 // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
454 // Xeon (Clarkdale), 32nm
455 if (model == 5) return &gotoblas_NEHALEM;
456
457 //Intel Xeon Processor 5600 (Westmere-EP)
458 //Xeon Processor E7 (Westmere-EX)
459 //Xeon E7540
460 if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
461
462 //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
463 //Intel Core i7-3000 / Xeon E5
464 if (model == 10 || model == 13) {
465 if(support_avx())
466 return &gotoblas_SANDYBRIDGE;
467 else{
468 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
469 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
470 }
471 }
472 return NULL;
473 case 3:
474 //Intel Sandy Bridge 22nm (Ivy Bridge?)
475 if (model == 10 || model == 14) {
476 if(support_avx())
477 return &gotoblas_SANDYBRIDGE;
478 else{
479 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
480 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
481 }
482 }
483 //Intel Haswell
484 if (model == 12 || model == 15) {
485 if(support_avx2())
486 return &gotoblas_HASWELL;
487 if(support_avx()) {
488 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
489 return &gotoblas_SANDYBRIDGE;
490 } else {
491 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
492 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
493 }
494 }
495 //Intel Broadwell
496 if (model == 13) {
497 if(support_avx2())
498 return &gotoblas_HASWELL;
499 if(support_avx()) {
500 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
501 return &gotoblas_SANDYBRIDGE;
502 } else {
503 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
504 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
505 }
506 }
507 if (model == 7) return &gotoblas_ATOM; //Bay Trail
508 return NULL;
509 case 4:
510 //Intel Haswell
511 if (model == 5 || model == 6) {
512 if(support_avx2())
513 return &gotoblas_HASWELL;
514 if(support_avx()) {
515 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
516 return &gotoblas_SANDYBRIDGE;
517 } else {
518 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
519 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
520 }
521 }
522 //Intel Broadwell
523 if (model == 7 || model == 15) {
524 if(support_avx2())
525 return &gotoblas_HASWELL;
526 if(support_avx()) {
527 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
528 return &gotoblas_SANDYBRIDGE;
529 } else {
530 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
531 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
532 }
533 }
534 //Intel Skylake
535 if (model == 14) {
536 if(support_avx2())
537 return &gotoblas_HASWELL;
538 if(support_avx()) {
539 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
540 return &gotoblas_SANDYBRIDGE;
541 } else {
542 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
543 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
544 }
545 }
546 //Intel Braswell / Avoton
547 if (model == 12 || model == 13) {
548 return &gotoblas_NEHALEM;
549 }
550 return NULL;
551 case 5:
552 //Intel Broadwell
553 if (model == 6) {
554 if(support_avx2())
555 return &gotoblas_HASWELL;
556 if(support_avx()) {
557 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
558 return &gotoblas_SANDYBRIDGE;
559 } else {
560 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
561 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
562 }
563 }
564 if (model == 5) {
565 // Intel Cooperlake
566 if(support_avx512_bf16())
567 return &gotoblas_COOPERLAKE;
568 // Intel Skylake X
569 if (support_avx512())
570 return &gotoblas_SKYLAKEX;
571 if(support_avx2()){
572 openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
573 return &gotoblas_HASWELL;
574 }
575 if(support_avx()) {
576 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
577 return &gotoblas_SANDYBRIDGE;
578 } else {
579 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
580 return &gotoblas_NEHALEM;
581 }
582 }
583 //Intel Skylake
584 if (model == 14) {
585 if(support_avx2())
586 return &gotoblas_HASWELL;
587 if(support_avx()) {
588 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
589 return &gotoblas_SANDYBRIDGE;
590 } else {
591 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
592 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
593 }
594 }
595 //Intel Phi Knights Landing
596 if (model == 7) {
597 if(support_avx2()){
598 openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
599 return &gotoblas_HASWELL;
600 }
601 if(support_avx()) {
602 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
603 return &gotoblas_SANDYBRIDGE;
604 } else {
605 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
606 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
607 }
608 }
609 //Apollo Lake or Denverton
610 if (model == 12 || model == 15) {
611 return &gotoblas_NEHALEM;
612 }
613 return NULL;
614 case 6:
615 if (model == 6) {
616 // Cannon Lake
617 if(support_avx2())
618 return &gotoblas_HASWELL;
619 if(support_avx()) {
620 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
621 return &gotoblas_SANDYBRIDGE;
622 } else {
623 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
624 return &gotoblas_NEHALEM;
625 }
626 }
627 if (model == 10) {
628 // Ice Lake SP
629 if(support_avx512_bf16())
630 return &gotoblas_COOPERLAKE;
631 if (support_avx512())
632 return &gotoblas_SKYLAKEX;
633 if(support_avx2())
634 return &gotoblas_HASWELL;
635 if(support_avx()) {
636 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
637 return &gotoblas_SANDYBRIDGE;
638 } else {
639 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
640 return &gotoblas_NEHALEM;
641 }
642 }
643 return NULL;
644 case 7:
645 if (model == 10) // Goldmont Plus
646 return &gotoblas_NEHALEM;
647 if (model == 14) {
648 // Ice Lake
649 if (support_avx512())
650 return &gotoblas_SKYLAKEX;
651 if(support_avx2()){
652 openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
653 return &gotoblas_HASWELL;
654 }
655 if(support_avx()) {
656 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
657 return &gotoblas_SANDYBRIDGE;
658 } else {
659 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
660 return &gotoblas_NEHALEM;
661 }
662 }
663 return NULL;
664 case 9:
665 case 8:
666 if (model == 12) { // Tiger Lake
667 if (support_avx512())
668 return &gotoblas_SKYLAKEX;
669 if(support_avx2()){
670 openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
671 return &gotoblas_HASWELL;
672 }
673 if(support_avx()) {
674 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
675 return &gotoblas_SANDYBRIDGE;
676 } else {
677 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
678 return &gotoblas_NEHALEM;
679 }
680 }
681 if (model == 14 ) { // Kaby Lake, Coffee Lake
682 if(support_avx2())
683 return &gotoblas_HASWELL;
684 if(support_avx()) {
685 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
686 return &gotoblas_SANDYBRIDGE;
687 } else {
688 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
689 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
690 }
691 }
692 case 10:
693 if (model == 5 || model == 6) {
694 if(support_avx2())
695 return &gotoblas_HASWELL;
696 if(support_avx()) {
697 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
698 return &gotoblas_SANDYBRIDGE;
699 } else {
700 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
701 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
702 }
703 }
704 if (model == 7) {
705 if (support_avx512())
706 return &gotoblas_SKYLAKEX;
707 if(support_avx2())
708 return &gotoblas_HASWELL;
709 if(support_avx()) {
710 openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
711 return &gotoblas_SANDYBRIDGE;
712 } else {
713 openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
714 return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
715 }
716 }
717 return NULL;
718 }
719 case 0xf:
720 if (model <= 0x2) return &gotoblas_NORTHWOOD;
721 return &gotoblas_PRESCOTT;
722 }
723 }
724
725 if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
726 if (family <= 0xe) {
727 // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
728 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
729 if ( (eax & 0xffff) >= 0x01) {
730 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
731 if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0)
732 return NULL;
733 }
734 else
735 return NULL;
736
737 return &gotoblas_ATHLON;
738 }
739 if (family == 0xf){
740 if ((exfamily == 0) || (exfamily == 2)) {
741 if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
742 else return &gotoblas_OPTERON;
743 } else if (exfamily == 5 || exfamily == 7) {
744 return &gotoblas_BOBCAT;
745 } else if (exfamily == 6) {
746 if(model == 1){
747 //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
748 if(support_avx())
749 return &gotoblas_BULLDOZER;
750 else{
751 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
752 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
753 }
754 }else if(model == 2 || model == 3){
755 //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
756 if(support_avx())
757 return &gotoblas_PILEDRIVER;
758 else{
759 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
760 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
761 }
762 }else if(model == 5){
763 if(support_avx())
764 return &gotoblas_EXCAVATOR;
765 else{
766 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
767 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
768 }
769 }else if(model == 0 || model == 8){
770 if (exmodel == 1) {
771 //AMD Trinity
772 if(support_avx())
773 return &gotoblas_PILEDRIVER;
774 else{
775 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
776 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
777 }
778 }else if (exmodel == 3) {
779 //AMD STEAMROLLER
780 if(support_avx())
781 return &gotoblas_STEAMROLLER;
782 else{
783 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
784 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
785 }
786 }else if (exmodel == 6) {
787 if(support_avx())
788 return &gotoblas_EXCAVATOR;
789 else{
790 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
791 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
792 }
793
794 }
795 }
796 } else if (exfamily == 8) {
797 /* if (model == 1 || model == 8) */ {
798 if(support_avx())
799 return &gotoblas_ZEN;
800 else{
801 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
802 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
803 }
804 }
805 } else if (exfamily == 9) {
806 if(support_avx())
807 return &gotoblas_ZEN;
808 else{
809 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
810 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
811 }
812 } else if (exfamily == 10) {
813 if(support_avx())
814 return &gotoblas_ZEN;
815 else{
816 openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
817 return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
818 }
819 }else {
820 return &gotoblas_BARCELONA;
821 }
822
823 }
824 }
825
826 if (vendor == VENDOR_CENTAUR) {
827 switch (family) {
828 case 0x6:
829 if (model == 0xf && stepping < 0xe)
830 return &gotoblas_NANO;
831 return &gotoblas_NEHALEM;
832 default:
833 if (family >= 0x7)
834 return &gotoblas_NEHALEM;
835 }
836 }
837
838 if (vendor == VENDOR_ZHAOXIN) {
839 return &gotoblas_NEHALEM;
840 }
841
842 return NULL;
843 }
844
/*
 * Printable core names.  The position of each entry matters: both
 * gotoblas_corename() and force_coretype() refer to entries by their literal
 * index, so new names must be appended and the indices used there kept in
 * step.  Index 0 is the "Unknown" placeholder.
 */
static char *corename[] = {
  "Unknown",       /*  0 */
  "Katmai",        /*  1 */
  "Coppermine",    /*  2 */
  "Northwood",     /*  3 */
  "Prescott",      /*  4 */
  "Banias",        /*  5 */
  "Atom",          /*  6 */
  "Core2",         /*  7 */
  "Penryn",        /*  8 */
  "Dunnington",    /*  9 */
  "Nehalem",       /* 10 */
  "Athlon",        /* 11 */
  "Opteron",       /* 12 */
  "Opteron_SSE3",  /* 13 */
  "Barcelona",     /* 14 */
  "Nano",          /* 15 */
  "Sandybridge",   /* 16 */
  "Bobcat",        /* 17 */
  "Bulldozer",     /* 18 */
  "Piledriver",    /* 19 */
  "Haswell",       /* 20 */
  "Steamroller",   /* 21 */
  "Excavator",     /* 22 */
  "Zen",           /* 23 */
  "SkylakeX",      /* 24 */
  "Cooperlake"     /* 25 */
};
873
gotoblas_corename(void)874 char *gotoblas_corename(void) {
875
876 if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
877 if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
878 if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
879 if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
880 if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
881 if (gotoblas == &gotoblas_ATOM)
882 #ifdef DYNAMIC_OLDER
883 return corename[ 6];
884 #else
885 return corename[10];
886 #endif
887 if (gotoblas == &gotoblas_CORE2) return corename[ 7];
888 if (gotoblas == &gotoblas_PENRYN)
889 #ifdef DYNAMIC_OLDER
890 return corename[ 8];
891 #else
892 return corename[7];
893 #endif
894 if (gotoblas == &gotoblas_DUNNINGTON)
895 #ifdef DYNAMIC_OLDER
896 return corename[ 9];
897 #else
898 return corename[7];
899 #endif
900 if (gotoblas == &gotoblas_NEHALEM) return corename[10];
901 if (gotoblas == &gotoblas_ATHLON) return corename[11];
902 if (gotoblas == &gotoblas_OPTERON_SSE3)
903 #ifdef DYNAMIC_OLDER
904 return corename[12];
905 #else
906 return corename[7];
907 #endif
908 if (gotoblas == &gotoblas_OPTERON)
909 #ifdef DYNAMIC_OLDER
910 return corename[13];
911 #else
912 return corename[7];
913 #endif
914 if (gotoblas == &gotoblas_BARCELONA) return corename[14];
915 if (gotoblas == &gotoblas_NANO)
916 #ifdef DYNAMIC_OLDER
917 return corename[15];
918 #else
919 return corename[10];
920 #endif
921 if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
922 if (gotoblas == &gotoblas_BOBCAT)
923 #ifdef DYNAMIC_OLDER
924 return corename[17];
925 #else
926 return corename[7];
927 #endif
928 if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
929 if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
930 if (gotoblas == &gotoblas_HASWELL) return corename[20];
931 if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
932 if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
933 if (gotoblas == &gotoblas_ZEN) return corename[23];
934 if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
935 if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
936 return corename[0];
937 }
938
939
940
force_coretype(char * coretype)941 static gotoblas_t *force_coretype(char *coretype){
942
943 int i ;
944 int found = -1;
945 char message[128];
946 //char mname[20];
947
948 for ( i=1 ; i <= 24; i++)
949 {
950 if (!strncasecmp(coretype,corename[i],20))
951 {
952 found = i;
953 break;
954 }
955 }
956 if (found < 0)
957 {
958 //strncpy(mname,coretype,20);
959 snprintf(message, 128, "Core not found: %s\n",coretype);
960 openblas_warning(1, message);
961 return(NULL);
962 }
963
964 switch (found)
965 {
966 case 25: return (&gotoblas_COOPERLAKE);
967 case 24: return (&gotoblas_SKYLAKEX);
968 case 23: return (&gotoblas_ZEN);
969 case 22: return (&gotoblas_EXCAVATOR);
970 case 21: return (&gotoblas_STEAMROLLER);
971 case 20: return (&gotoblas_HASWELL);
972 case 19: return (&gotoblas_PILEDRIVER);
973 case 18: return (&gotoblas_BULLDOZER);
974 case 17: return (&gotoblas_BOBCAT);
975 case 16: return (&gotoblas_SANDYBRIDGE);
976 case 15: return (&gotoblas_NANO);
977 case 14: return (&gotoblas_BARCELONA);
978 case 13: return (&gotoblas_OPTERON);
979 case 12: return (&gotoblas_OPTERON_SSE3);
980 case 11: return (&gotoblas_ATHLON);
981 case 10: return (&gotoblas_NEHALEM);
982 case 9: return (&gotoblas_DUNNINGTON);
983 case 8: return (&gotoblas_PENRYN);
984 case 7: return (&gotoblas_CORE2);
985 case 6: return (&gotoblas_ATOM);
986 case 5: return (&gotoblas_BANIAS);
987 case 4: return (&gotoblas_PRESCOTT);
988 case 3: return (&gotoblas_NORTHWOOD);
989 case 2: return (&gotoblas_COPPERMINE);
990 case 1: return (&gotoblas_KATMAI);
991 }
992 return(NULL);
993
994 }
995
996
997
998
gotoblas_dynamic_init(void)999 void gotoblas_dynamic_init(void) {
1000
1001 char coremsg[128];
1002 char coren[22];
1003 char *p;
1004
1005
1006 if (gotoblas) return;
1007
1008 p = getenv("OPENBLAS_CORETYPE");
1009 if ( p )
1010 {
1011 gotoblas = force_coretype(p);
1012 }
1013 else
1014 {
1015 gotoblas = get_coretype();
1016 }
1017
1018 #ifdef ARCH_X86
1019 if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
1020 #else
1021 if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
1022 /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
1023 if (sizeof(void*) == 8) {
1024 if (gotoblas == &gotoblas_KATMAI ||
1025 gotoblas == &gotoblas_COPPERMINE ||
1026 gotoblas == &gotoblas_NORTHWOOD ||
1027 gotoblas == &gotoblas_BANIAS ||
1028 gotoblas == &gotoblas_ATHLON)
1029 gotoblas = &gotoblas_PRESCOTT;
1030 }
1031 #endif
1032
1033 if (gotoblas && gotoblas -> init) {
1034 strncpy(coren,gotoblas_corename(),20);
1035 sprintf(coremsg, "Core: %s\n",coren);
1036 openblas_warning(2, coremsg);
1037 gotoblas -> init();
1038 } else {
1039 openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
1040 exit(1);
1041 }
1042
1043 }
1044
gotoblas_dynamic_quit(void)1045 void gotoblas_dynamic_quit(void) {
1046
1047 gotoblas = NULL;
1048
1049 }
1050