1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include "common.h"
40 
/* Under MSVC, map the POSIX case-insensitive compare names (strncasecmp is
 * used by force_coretype() below) onto the CRT's _strnicmp/_stricmp. */
#ifdef _MSC_VER
#define strncasecmp _strnicmp
#define strcasecmp _stricmp
#endif

/*
 * EXTERN prefixes the declarations of the KATMAI, COPPERMINE, NORTHWOOD,
 * BANIAS and ATHLON tables in the non-DYNAMIC_LIST branch below.  With
 * ARCH_X86 it expands to `extern`; otherwise it expands to nothing, which
 * turns those lines into tentative definitions in this translation unit.
 */
#ifdef ARCH_X86
#define EXTERN extern
#else
#define EXTERN
#endif
51 
/*
 * DYNAMIC_LIST branch: gotoblas_PRESCOTT is always declared.  Every other
 * gotoblas_<CORE> is declared extern only when its DYN_<CORE> macro is
 * defined; otherwise the name is #defined as an alias of another table
 * (PRESCOTT unless a closer relative is listed in an #elif).
 */
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_PRESCOTT;

#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
/* BARCELONA: own table, else NEHALEM, else PRESCOTT. */
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
/* ATOM: own table if DYN_ATOM is defined, else NEHALEM, else PRESCOTT.
 * The middle branch must be a real `#elif` directive: without the leading
 * '#' the line is ordinary text, so it either breaks the build (DYN_ATOM
 * defined) or is skipped together with the NEHALEM alias (DYN_ATOM not
 * defined). */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
/* The next six cores have no intermediate fallback: own table or PRESCOTT. */
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
/* BOBCAT and SANDYBRIDGE: own table, else NEHALEM, else PRESCOTT. */
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
/* BULLDOZER .. HASWELL: own table, else SANDYBRIDGE, else NEHALEM, else PRESCOTT. */
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
/* ZEN and SKYLAKEX: own table, else HASWELL, SANDYBRIDGE, NEHALEM, PRESCOTT. */
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
/* COOPERLAKE: own table, else SKYLAKEX, HASWELL, SANDYBRIDGE, NEHALEM, PRESCOTT. */
#ifdef DYN_COOPERLAKE
extern gotoblas_t gotoblas_COOPERLAKE;
#elif defined(DYN_SKYLAKEX)
#define gotoblas_COOPERLAKE gotoblas_SKYLAKEX
#elif defined(DYN_HASWELL)
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#else
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
#endif
223 
224 
#else // not DYNAMIC_LIST
/* Fixed kernel set.  These five use EXTERN, so they are only real external
 * references when ARCH_X86 is defined (see the EXTERN macro above). */
EXTERN gotoblas_t  gotoblas_KATMAI;
EXTERN gotoblas_t  gotoblas_COPPERMINE;
EXTERN gotoblas_t  gotoblas_NORTHWOOD;
EXTERN gotoblas_t  gotoblas_BANIAS;
EXTERN gotoblas_t  gotoblas_ATHLON;

/* Always declared in this branch. */
extern gotoblas_t  gotoblas_PRESCOTT;
extern gotoblas_t  gotoblas_CORE2;
extern gotoblas_t  gotoblas_NEHALEM;
extern gotoblas_t  gotoblas_BARCELONA;
/* Older cores get their own tables only with DYNAMIC_OLDER; otherwise each
 * name is an alias of NEHALEM or CORE2 as listed. */
#ifdef DYNAMIC_OLDER
extern gotoblas_t  gotoblas_ATOM;
extern gotoblas_t  gotoblas_NANO;
extern gotoblas_t  gotoblas_PENRYN;
extern gotoblas_t  gotoblas_DUNNINGTON;
extern gotoblas_t  gotoblas_OPTERON;
extern gotoblas_t  gotoblas_OPTERON_SSE3;
extern gotoblas_t  gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif

/* AVX-class tables.  NO_AVX2 aliases the AVX2/AVX512 names to SANDYBRIDGE,
 * NO_AVX512 aliases the AVX512 names to HASWELL, and NO_AVX (the outer #else)
 * aliases everything to NEHALEM or BARCELONA. */
#ifndef NO_AVX
extern gotoblas_t  gotoblas_SANDYBRIDGE;
extern gotoblas_t  gotoblas_BULLDOZER;
extern gotoblas_t  gotoblas_PILEDRIVER;
extern gotoblas_t  gotoblas_STEAMROLLER;
extern gotoblas_t  gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t  gotoblas_HASWELL;
extern gotoblas_t  gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t  gotoblas_SKYLAKEX;
extern gotoblas_t  gotoblas_COOPERLAKE;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif

#endif // DYNAMIC_LIST
290 
/* Vendor identifiers returned by get_vendor() and tested in get_coretype(). */
#define VENDOR_INTEL      1
#define VENDOR_AMD        2
#define VENDOR_CENTAUR    3
#define VENDOR_HYGON	  4
#define VENDOR_ZHAOXIN    5
#define VENDOR_UNKNOWN   99

/* Bit-field extract: shift `a` right by `b` bits, then AND with mask `c`. */
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
299 
#ifndef NO_AVX
/*
 * Execute XGETBV with `op` in ECX and store the EAX/EDX results through
 * `eax` and `edx`.  Only compiled when NO_AVX is not defined.
 *
 * The instruction is emitted as raw opcode bytes (0f 01 d0) rather than by
 * mnemonic, presumably so assemblers that do not know XGETBV still build
 * this file -- TODO confirm.  Callers in this file always pass op == 0.
 */
static inline void xgetbv(int op, int * eax, int * edx){
  //Use binary code for xgetbv
  __asm__ __volatile__
    (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
307 
/*
 * Report whether AVX kernels may be used.
 *
 * Returns 1 only if CPUID leaf 1 has ECX bits 26, 27 and 28 all set (XSAVE,
 * OSXSAVE and AVX in the Intel/AMD CPUID tables) and XGETBV(0) shows bits 1
 * and 2 set, i.e. the OS has enabled saving of the SSE and AVX register
 * state.  Returns 0 otherwise, and always 0 when built with NO_AVX.
 */
int support_avx(){
#ifdef NO_AVX
  return 0;
#else
  const int needed = (1 << 26) | (1 << 27) | (1 << 28);
  int a, b, c, d;

  cpuid(1, &a, &b, &c, &d);
  if ((c & needed) != needed)
    return 0;

  /* XGETBV is only executed once the CPUID feature bits above are confirmed. */
  xgetbv(0, &a, &d);
  return (a & 6) == 6;  //OS support AVX
#endif
}
325 
/*
 * Report whether AVX2 kernels may be used.
 *
 * Requires support_avx() to succeed first, then checks bit 5 of EBX from
 * CPUID leaf 7 (the AVX2 flag).  Always returns 0 when built with NO_AVX2.
 */
int support_avx2(){
#ifdef NO_AVX2
  return 0;
#else
  /* c starts at 0 as in the original; whether cpuid() reads it as the
   * sub-leaf selector cannot be seen from this file. */
  int a, b, c = 0, d;

  if (!support_avx())
    return 0;

  cpuid(7, &a, &b, &c, &d);
  return (b & (1 << 5)) ? 1 : 0;  //AVX2 flag is set
#endif
}
341 
/*
 * Report whether AVX512 (SkylakeX-class) kernels may be used.
 *
 * Returns 1 only when all of the following hold, and 0 otherwise:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 EBX bit 5 (AVX2) is set,
 *   - CPUID leaf 7 EBX bit 31 (AVX512VL) is set,
 *   - XGETBV(0) has bits 5..7 (mask 0xe0) set, i.e. the OS saves the
 *     opmask/zmm register state.
 * Always returns 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);

  /* The old code only assigned ret=0 here (a no-op) and carried on; a CPU
   * without the AVX2 flag must not be given AVX512 kernels. */
  if ((ebx & (1 << 5)) == 0)
    return 0;  //cpu does not have avx2 flag

  /* 1u: shifting a signed 1 into bit 31 is undefined behaviour in C. */
  if ((ebx & (1u << 31)) == 0)
    return 0;  //AVX512VL flag is not set

  xgetbv(0, &eax, &edx);
  return (eax & 0xe0) == 0xe0;  //OS supports saving zmm register
#else
  return 0;
#endif
}
363 
/*
 * Report whether AVX512-BF16 (Cooperlake-class) kernels may be used.
 *
 * Requires support_avx512() first, then checks CPUID leaf 7, sub-leaf 1:
 * EAX bit 5 indicates whether avx512_bf16 is supported.  Always returns 0
 * when built with NO_AVX or NO_AVX512.
 */
int support_avx512_bf16(){
#if defined(NO_AVX) || defined(NO_AVX512)
  return 0;
#else
  int a, b, c, d;

  if (!support_avx512())
    return 0;

  cpuid_count(7, 1, &a, &b, &c, &d);
  return (a & (1 << 5)) != 0;
#endif
}
380 
/* Warning sink defined elsewhere; `verbose` is the level passed with each message. */
extern void openblas_warning(int verbose, const char * msg);
/* Level used by get_coretype() for every "stepping down to older kernels" message. */
#define FALLBACK_VERBOSE 1
/* One message per kernel family that get_coretype() can step down to. */
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
387 
get_vendor(void)388 static int get_vendor(void){
389   int eax, ebx, ecx, edx;
390 
391   union
392   {
393         char vchar[16];
394         int  vint[4];
395   } vendor;
396 
397   cpuid(0, &eax, &ebx, &ecx, &edx);
398 
399   *(&vendor.vint[0]) = ebx;
400   *(&vendor.vint[1]) = edx;
401   *(&vendor.vint[2]) = ecx;
402 
403   vendor.vchar[12] = '\0';
404 
405   if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
406   if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
407   if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
408   if (!strcmp(vendor.vchar, "  Shanghai  ")) return VENDOR_ZHAOXIN;
409   if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
410 
411   if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
412 
413   return VENDOR_UNKNOWN;
414 }
415 
get_coretype(void)416 static gotoblas_t *get_coretype(void){
417 
418   int eax, ebx, ecx, edx;
419   int family, exfamily, model, vendor, exmodel, stepping;
420 
421   cpuid(1, &eax, &ebx, &ecx, &edx);
422 
423   family   = BITMASK(eax,  8, 0x0f);
424   exfamily = BITMASK(eax, 20, 0xff);
425   model    = BITMASK(eax,  4, 0x0f);
426   exmodel  = BITMASK(eax, 16, 0x0f);
427   stepping = BITMASK(eax,  0, 0x0f);
428 
429   vendor = get_vendor();
430 
431   if (vendor == VENDOR_INTEL){
432     switch (family) {
433     case 0x6:
434       switch (exmodel) {
435       case 0:
436 	if (model <= 0x7) return &gotoblas_KATMAI;
437 	if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
438 	if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
439 	if (model == 14) return &gotoblas_BANIAS;
440 	if (model == 15) return &gotoblas_CORE2;
441 	return NULL;
442 
443       case 1:
444 	if (model == 6) return &gotoblas_CORE2;
445 	if (model == 7) return &gotoblas_PENRYN;
446 	if (model == 13) return &gotoblas_DUNNINGTON;
447 	if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
448 	if (model == 12) return &gotoblas_ATOM;
449 	return NULL;
450 
451       case 2:
452 	//Intel Core (Clarkdale) / Core (Arrandale)
453 	// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
454 	// Xeon (Clarkdale), 32nm
455 	if (model ==  5) return &gotoblas_NEHALEM;
456 
457 	//Intel Xeon Processor 5600 (Westmere-EP)
458 	//Xeon Processor E7 (Westmere-EX)
459 	//Xeon E7540
460 	if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
461 
462 	//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
463 	//Intel Core i7-3000 / Xeon E5
464 	if (model == 10 || model == 13) {
465 	  if(support_avx())
466 	    return &gotoblas_SANDYBRIDGE;
467 	  else{
468 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
469 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
470 	  }
471 	}
472 	return NULL;
473       case 3:
474 	//Intel Sandy Bridge 22nm (Ivy Bridge?)
475 	if (model == 10 || model == 14) {
476 	  if(support_avx())
477 	    return &gotoblas_SANDYBRIDGE;
478 	  else{
479 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
480 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
481 	  }
482 	}
483 	//Intel Haswell
484 	if (model == 12 || model == 15) {
485 	  if(support_avx2())
486 	    return &gotoblas_HASWELL;
487 	  if(support_avx()) {
488 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
489 	    return &gotoblas_SANDYBRIDGE;
490 	  } else {
491 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
492 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
493 	  }
494 	}
495 	//Intel Broadwell
496 	if (model == 13) {
497 	  if(support_avx2())
498 	    return &gotoblas_HASWELL;
499 	  if(support_avx()) {
500 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
501 	    return &gotoblas_SANDYBRIDGE;
502 	  } else {
503 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
504 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
505 	  }
506 	}
507 	if (model == 7) return &gotoblas_ATOM; //Bay Trail
508 	return NULL;
509       case 4:
510 		//Intel Haswell
511 	if (model == 5 || model == 6) {
512 	  if(support_avx2())
513 	    return &gotoblas_HASWELL;
514 	  if(support_avx()) {
515 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
516 	    return &gotoblas_SANDYBRIDGE;
517 	  } else {
518 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
519 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
520 	  }
521 	}
522 	//Intel Broadwell
523 	if (model == 7 || model == 15) {
524 	  if(support_avx2())
525 	    return &gotoblas_HASWELL;
526 	  if(support_avx()) {
527 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
528 	    return &gotoblas_SANDYBRIDGE;
529 	  } else {
530 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
531 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
532 	  }
533 	}
534 	//Intel Skylake
535 	if (model == 14) {
536 	  if(support_avx2())
537 	    return &gotoblas_HASWELL;
538 	  if(support_avx()) {
539 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
540 	    return &gotoblas_SANDYBRIDGE;
541 	  } else {
542 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
543 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
544 	  }
545 	}
546 	//Intel Braswell / Avoton
547 	if (model == 12 || model == 13) {
548 	  return &gotoblas_NEHALEM;
549 	}
550 	return NULL;
551       case 5:
552 	//Intel Broadwell
553 	if (model == 6) {
554 	  if(support_avx2())
555 	    return &gotoblas_HASWELL;
556 	  if(support_avx()) {
557 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
558 	    return &gotoblas_SANDYBRIDGE;
559 	  } else {
560 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
561 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
562 	  }
563 	}
564 	if (model == 5) {
565 	// Intel Cooperlake
566           if(support_avx512_bf16())
567              return &gotoblas_COOPERLAKE;
568 	// Intel Skylake X
569           if (support_avx512())
570 	    return &gotoblas_SKYLAKEX;
571 	  if(support_avx2()){
572 	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
573 	    return &gotoblas_HASWELL;
574           }
575 	  if(support_avx()) {
576 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
577 	    return &gotoblas_SANDYBRIDGE;
578 	  } else {
579           openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
580           return &gotoblas_NEHALEM;
581           }
582 	}
583 	//Intel Skylake
584 	if (model == 14) {
585 	  if(support_avx2())
586 	    return &gotoblas_HASWELL;
587 	  if(support_avx()) {
588 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
589 	    return &gotoblas_SANDYBRIDGE;
590 	  } else {
591 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
592 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
593 	  }
594 	}
595 	//Intel Phi Knights Landing
596 	if (model == 7) {
597 	  if(support_avx2()){
598 	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
599 	    return &gotoblas_HASWELL;
600 	  }
601 	  if(support_avx()) {
602 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
603 	    return &gotoblas_SANDYBRIDGE;
604 	  } else {
605 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
606 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
607 	  }
608 	}
609 	//Apollo Lake or Denverton
610 	if (model == 12 || model == 15) {
611 	  return &gotoblas_NEHALEM;
612 	}
613 	return NULL;
614       case 6:
615         if (model == 6) {
616           // Cannon Lake
617 	  if(support_avx2())
618 	    return &gotoblas_HASWELL;
619 	  if(support_avx()) {
620 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
621 	    return &gotoblas_SANDYBRIDGE;
622 	  } else {
623 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
624 	    return &gotoblas_NEHALEM;
625 	  }
626         }
627 	if (model == 10) {
628           // Ice Lake SP
629 	   if(support_avx512_bf16())
630              return &gotoblas_COOPERLAKE;
631           if (support_avx512())
632 	    return &gotoblas_SKYLAKEX;
633 	  if(support_avx2())
634 	    return &gotoblas_HASWELL;
635 	  if(support_avx()) {
636 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
637 	    return &gotoblas_SANDYBRIDGE;
638 	  } else {
639 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
640 	    return &gotoblas_NEHALEM;
641 	  }
642         }
643         return NULL;
644       case 7:
645 	if (model == 10) // Goldmont Plus
646 	   return &gotoblas_NEHALEM;
647         if (model == 14) {
648 	// Ice Lake
649           if (support_avx512())
650 	    return &gotoblas_SKYLAKEX;
651 	  if(support_avx2()){
652 	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
653 	    return &gotoblas_HASWELL;
654           }
655 	  if(support_avx()) {
656 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
657 	    return &gotoblas_SANDYBRIDGE;
658 	  } else {
659           openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
660           return &gotoblas_NEHALEM;
661           }
662         }
663         return NULL;
664       case 9:
665       case 8:
666         if (model == 12) { // Tiger Lake
667           if (support_avx512())
668             return &gotoblas_SKYLAKEX;
669           if(support_avx2()){
670             openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
671             return &gotoblas_HASWELL;
672           }
673           if(support_avx()) {
674             openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
675             return &gotoblas_SANDYBRIDGE;
676           } else {
677           openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
678           return &gotoblas_NEHALEM;
679           }
680         }
681 	if (model == 14 ) { // Kaby Lake, Coffee Lake
682 	  if(support_avx2())
683 	    return &gotoblas_HASWELL;
684 	  if(support_avx()) {
685 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
686 	    return &gotoblas_SANDYBRIDGE;
687 	  } else {
688 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
689 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
690 	  }
691 	}
692       case 10:
693         if (model == 5 || model == 6) {
694 	  if(support_avx2())
695 	    return &gotoblas_HASWELL;
696 	  if(support_avx()) {
697 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
698 	    return &gotoblas_SANDYBRIDGE;
699 	  } else {
700 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
701 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
702 	  }
703         }
704         if (model == 7) {
705 	  if (support_avx512())
706 	    return &gotoblas_SKYLAKEX;
707 	  if(support_avx2())
708 	    return &gotoblas_HASWELL;
709 	  if(support_avx()) {
710 	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
711 	    return &gotoblas_SANDYBRIDGE;
712 	  } else {
713 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
714 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
715 	  }
716         }
717 	return NULL;
718       }
719       case 0xf:
720       if (model <= 0x2) return &gotoblas_NORTHWOOD;
721       return &gotoblas_PRESCOTT;
722     }
723   }
724 
725   if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
726     if (family <= 0xe) {
727         // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
728         cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
729         if ( (eax & 0xffff)  >= 0x01) {
730             cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
731             if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0)
732               return NULL;
733           }
734         else
735           return NULL;
736 
737         return &gotoblas_ATHLON;
738       }
739     if (family == 0xf){
740       if ((exfamily == 0) || (exfamily == 2)) {
741 	if (ecx & (1 <<  0)) return &gotoblas_OPTERON_SSE3;
742 	else return &gotoblas_OPTERON;
743       }  else if (exfamily == 5 || exfamily == 7) {
744 	return &gotoblas_BOBCAT;
745       } else if (exfamily == 6) {
746 	if(model == 1){
747 	  //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
748 	  if(support_avx())
749 	    return &gotoblas_BULLDOZER;
750 	  else{
751 	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
752 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
753 	  }
754 	}else if(model == 2 || model == 3){
755 	  //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
756 	  if(support_avx())
757 	    return &gotoblas_PILEDRIVER;
758 	  else{
759 	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
760 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
761 	  }
762 	}else if(model == 5){
763 	  if(support_avx())
764 	    return &gotoblas_EXCAVATOR;
765 	  else{
766 	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
767 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
768 	  }
769 	}else if(model == 0 || model == 8){
770 	  if (exmodel == 1) {
771 	    //AMD Trinity
772 	    if(support_avx())
773 	      return &gotoblas_PILEDRIVER;
774 	    else{
775 	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
776 	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
777 	    }
778 	   }else if (exmodel == 3) {
779 	    //AMD STEAMROLLER
780 	    if(support_avx())
781 	      return &gotoblas_STEAMROLLER;
782 	    else{
783 	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
784 	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
785 	    }
786 	  }else if (exmodel == 6) {
787 	    if(support_avx())
788 	      return &gotoblas_EXCAVATOR;
789 	    else{
790 	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
791 	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
792 	    }
793 
794 	  }
795 	}
796       } else if (exfamily == 8) {
797 	/* if (model == 1 || model == 8) */ {
798 	  if(support_avx())
799 	    return &gotoblas_ZEN;
800 	  else{
801 	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
802 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
803 	  }
804 	}
805       } else if (exfamily == 9) {
806 	  if(support_avx())
807 	    return &gotoblas_ZEN;
808 	  else{
809 	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
810 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
811           }
812       } else if (exfamily == 10) {
813 	  if(support_avx())
814 	    return &gotoblas_ZEN;
815 	  else{
816 	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
817 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
818           }
819       }else {
820 	return &gotoblas_BARCELONA;
821       }
822 
823     }
824   }
825 
826   if (vendor == VENDOR_CENTAUR) {
827     switch (family) {
828     case 0x6:
829       if (model == 0xf && stepping < 0xe)
830         return &gotoblas_NANO;
831       return &gotoblas_NEHALEM;
832     default:
833       if (family >= 0x7)
834         return &gotoblas_NEHALEM;
835     }
836   }
837 
838   if (vendor == VENDOR_ZHAOXIN) {
839       return &gotoblas_NEHALEM;
840   }
841 
842   return NULL;
843 }
844 
/*
 * Human-readable core names, indexed by the numeric ids that
 * gotoblas_corename() and force_coretype() use.  The position of each entry
 * is significant: both functions refer to entries by index.
 *
 * Index 12 is paired with gotoblas_OPTERON_SSE3 and index 13 with
 * gotoblas_OPTERON in both of those functions, so the names are listed in
 * that order here (they used to be the other way round, which made
 * "Opteron" select/report the SSE3 kernels and vice versa).
 */
static char *corename[] = {
    "Unknown",       /*  0 */
    "Katmai",        /*  1 */
    "Coppermine",    /*  2 */
    "Northwood",     /*  3 */
    "Prescott",      /*  4 */
    "Banias",        /*  5 */
    "Atom",          /*  6 */
    "Core2",         /*  7 */
    "Penryn",        /*  8 */
    "Dunnington",    /*  9 */
    "Nehalem",       /* 10 */
    "Athlon",        /* 11 */
    "Opteron_SSE3",  /* 12 */
    "Opteron",       /* 13 */
    "Barcelona",     /* 14 */
    "Nano",          /* 15 */
    "Sandybridge",   /* 16 */
    "Bobcat",        /* 17 */
    "Bulldozer",     /* 18 */
    "Piledriver",    /* 19 */
    "Haswell",       /* 20 */
    "Steamroller",   /* 21 */
    "Excavator",     /* 22 */
    "Zen",           /* 23 */
    "SkylakeX",      /* 24 */
    "Cooperlake"     /* 25 */
};
873 
gotoblas_corename(void)874 char *gotoblas_corename(void) {
875 
876   if (gotoblas == &gotoblas_KATMAI)       return corename[ 1];
877   if (gotoblas == &gotoblas_COPPERMINE)   return corename[ 2];
878   if (gotoblas == &gotoblas_NORTHWOOD)    return corename[ 3];
879   if (gotoblas == &gotoblas_PRESCOTT)     return corename[ 4];
880   if (gotoblas == &gotoblas_BANIAS)       return corename[ 5];
881   if (gotoblas == &gotoblas_ATOM)
882 #ifdef DYNAMIC_OLDER
883            return corename[ 6];
884 #else
885            return corename[10];
886 #endif
887   if (gotoblas == &gotoblas_CORE2)        return corename[ 7];
888   if (gotoblas == &gotoblas_PENRYN)
889 #ifdef DYNAMIC_OLDER
890            return corename[ 8];
891 #else
892            return corename[7];
893 #endif
894   if (gotoblas == &gotoblas_DUNNINGTON)
895 #ifdef DYNAMIC_OLDER
896            return corename[ 9];
897 #else
898            return corename[7];
899 #endif
900   if (gotoblas == &gotoblas_NEHALEM)      return corename[10];
901   if (gotoblas == &gotoblas_ATHLON)       return corename[11];
902   if (gotoblas == &gotoblas_OPTERON_SSE3)
903 #ifdef DYNAMIC_OLDER
904            return corename[12];
905 #else
906            return corename[7];
907 #endif
908   if (gotoblas == &gotoblas_OPTERON)
909 #ifdef DYNAMIC_OLDER
910            return corename[13];
911 #else
912            return corename[7];
913 #endif
914   if (gotoblas == &gotoblas_BARCELONA)    return corename[14];
915   if (gotoblas == &gotoblas_NANO)
916 #ifdef DYNAMIC_OLDER
917            return corename[15];
918 #else
919            return corename[10];
920 #endif
921   if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
922   if (gotoblas == &gotoblas_BOBCAT)
923 #ifdef DYNAMIC_OLDER
924            return corename[17];
925 #else
926            return corename[7];
927 #endif
928   if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
929   if (gotoblas == &gotoblas_PILEDRIVER)   return corename[19];
930   if (gotoblas == &gotoblas_HASWELL)      return corename[20];
931   if (gotoblas == &gotoblas_STEAMROLLER)  return corename[21];
932   if (gotoblas == &gotoblas_EXCAVATOR)    return corename[22];
933   if (gotoblas == &gotoblas_ZEN)          return corename[23];
934   if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
935   if (gotoblas == &gotoblas_COOPERLAKE)   return corename[25];
936   return corename[0];
937 }
938 
939 
940 
force_coretype(char * coretype)941 static gotoblas_t *force_coretype(char *coretype){
942 
943 	int i ;
944 	int found = -1;
945 	char message[128];
946 	//char mname[20];
947 
948 	for ( i=1 ; i <= 24; i++)
949 	{
950 		if (!strncasecmp(coretype,corename[i],20))
951 		{
952 			found = i;
953 			break;
954 		}
955 	}
956 	if (found < 0)
957 	{
958 	        //strncpy(mname,coretype,20);
959 	        snprintf(message, 128, "Core not found: %s\n",coretype);
960     		openblas_warning(1, message);
961 		return(NULL);
962 	}
963 
964 	switch (found)
965 	{
966 		case 25: return (&gotoblas_COOPERLAKE);
967 		case 24: return (&gotoblas_SKYLAKEX);
968 		case 23: return (&gotoblas_ZEN);
969 		case 22: return (&gotoblas_EXCAVATOR);
970 		case 21: return (&gotoblas_STEAMROLLER);
971 		case 20: return (&gotoblas_HASWELL);
972 		case 19: return (&gotoblas_PILEDRIVER);
973 		case 18: return (&gotoblas_BULLDOZER);
974 		case 17: return (&gotoblas_BOBCAT);
975 		case 16: return (&gotoblas_SANDYBRIDGE);
976 		case 15: return (&gotoblas_NANO);
977 		case 14: return (&gotoblas_BARCELONA);
978 		case 13: return (&gotoblas_OPTERON);
979 		case 12: return (&gotoblas_OPTERON_SSE3);
980 		case 11: return (&gotoblas_ATHLON);
981 		case 10: return (&gotoblas_NEHALEM);
982 		case  9: return (&gotoblas_DUNNINGTON);
983 		case  8: return (&gotoblas_PENRYN);
984 		case  7: return (&gotoblas_CORE2);
985 		case  6: return (&gotoblas_ATOM);
986 		case  5: return (&gotoblas_BANIAS);
987 		case  4: return (&gotoblas_PRESCOTT);
988 		case  3: return (&gotoblas_NORTHWOOD);
989 		case  2: return (&gotoblas_COPPERMINE);
990 		case  1: return (&gotoblas_KATMAI);
991 	}
992 	return(NULL);
993 
994 }
995 
996 
997 
998 
gotoblas_dynamic_init(void)999 void gotoblas_dynamic_init(void) {
1000 
1001   char coremsg[128];
1002   char coren[22];
1003   char *p;
1004 
1005 
1006   if (gotoblas) return;
1007 
1008   p = getenv("OPENBLAS_CORETYPE");
1009   if ( p )
1010   {
1011 	gotoblas = force_coretype(p);
1012   }
1013   else
1014   {
1015   	gotoblas = get_coretype();
1016   }
1017 
1018 #ifdef ARCH_X86
1019   if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
1020 #else
1021   if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
1022   /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
1023   if (sizeof(void*) == 8) {
1024       if (gotoblas == &gotoblas_KATMAI ||
1025           gotoblas == &gotoblas_COPPERMINE ||
1026           gotoblas == &gotoblas_NORTHWOOD ||
1027           gotoblas == &gotoblas_BANIAS ||
1028           gotoblas == &gotoblas_ATHLON)
1029           gotoblas = &gotoblas_PRESCOTT;
1030   }
1031 #endif
1032 
1033   if (gotoblas && gotoblas -> init) {
1034     strncpy(coren,gotoblas_corename(),20);
1035     sprintf(coremsg, "Core: %s\n",coren);
1036     openblas_warning(2, coremsg);
1037     gotoblas -> init();
1038   } else {
1039     openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
1040     exit(1);
1041   }
1042 
1043 }
1044 
gotoblas_dynamic_quit(void)1045 void gotoblas_dynamic_quit(void) {
1046 
1047   gotoblas = NULL;
1048 
1049 }
1050