1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include <string.h>
41 #include "common.h"
42 
43 extern int openblas_block_factor();
44 int get_L2_size(void);
45 
46 #define DEFAULT_GEMM_P 128
47 #define DEFAULT_GEMM_Q 128
48 #define DEFAULT_GEMM_R 128
49 #define DEFAULT_GEMM_OFFSET_A 0
50 #define DEFAULT_GEMM_OFFSET_B 0
51 
52 /* Global Parameter */
53 #if GEMM_OFFSET_A == gemm_offset_a
54 BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
55 #else
56 BLASLONG gemm_offset_a = GEMM_OFFSET_A;
57 #endif
58 
59 #if GEMM_OFFSET_B == gemm_offset_b
60 BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
61 #else
62 BLASLONG gemm_offset_b = GEMM_OFFSET_B;
63 #endif
64 
65 #if SBGEMM_P == sbgemm_p
66 BLASLONG sbgemm_p = DEFAULT_GEMM_P;
67 #else
68 BLASLONG sbgemm_p = SBGEMM_P;
69 #endif
70 #if SGEMM_P == sgemm_p
71 BLASLONG sgemm_p = DEFAULT_GEMM_P;
72 #else
73 BLASLONG sgemm_p = SGEMM_P;
74 #endif
75 #if DGEMM_P == dgemm_p
76 BLASLONG dgemm_p = DEFAULT_GEMM_P;
77 #else
78 BLASLONG dgemm_p = DGEMM_P;
79 #endif
80 #if CGEMM_P == cgemm_p
81 BLASLONG cgemm_p = DEFAULT_GEMM_P;
82 #else
83 BLASLONG cgemm_p = CGEMM_P;
84 #endif
85 #if ZGEMM_P == zgemm_p
86 BLASLONG zgemm_p = DEFAULT_GEMM_P;
87 #else
88 BLASLONG zgemm_p = ZGEMM_P;
89 #endif
90 
91 #if SBGEMM_Q == sbgemm_q
92 BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
93 #else
94 BLASLONG sbgemm_q = SBGEMM_Q;
95 #endif
96 #if SGEMM_Q == sgemm_q
97 BLASLONG sgemm_q = DEFAULT_GEMM_Q;
98 #else
99 BLASLONG sgemm_q = SGEMM_Q;
100 #endif
101 #if DGEMM_Q == dgemm_q
102 BLASLONG dgemm_q = DEFAULT_GEMM_Q;
103 #else
104 BLASLONG dgemm_q = DGEMM_Q;
105 #endif
106 #if CGEMM_Q == cgemm_q
107 BLASLONG cgemm_q = DEFAULT_GEMM_Q;
108 #else
109 BLASLONG cgemm_q = CGEMM_Q;
110 #endif
111 #if ZGEMM_Q == zgemm_q
112 BLASLONG zgemm_q = DEFAULT_GEMM_Q;
113 #else
114 BLASLONG zgemm_q = ZGEMM_Q;
115 #endif
116 
117 #if SBGEMM_R == sbgemm_r
118 BLASLONG sbgemm_r = DEFAULT_GEMM_R;
119 #else
120 BLASLONG sbgemm_r = SBGEMM_R;
121 #endif
122 #if SGEMM_R == sgemm_r
123 BLASLONG sgemm_r = DEFAULT_GEMM_R;
124 #else
125 BLASLONG sgemm_r = SGEMM_R;
126 #endif
127 #if DGEMM_R == dgemm_r
128 BLASLONG dgemm_r = DEFAULT_GEMM_R;
129 #else
130 BLASLONG dgemm_r = DGEMM_R;
131 #endif
132 #if CGEMM_R == cgemm_r
133 BLASLONG cgemm_r = DEFAULT_GEMM_R;
134 #else
135 BLASLONG cgemm_r = CGEMM_R;
136 #endif
137 #if ZGEMM_R == zgemm_r
138 BLASLONG zgemm_r = DEFAULT_GEMM_R;
139 #else
140 BLASLONG zgemm_r = ZGEMM_R;
141 #endif
142 
143 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
144 #if QGEMM_P == qgemm_p
145 BLASLONG qgemm_p = DEFAULT_GEMM_P;
146 #else
147 BLASLONG qgemm_p = QGEMM_P;
148 #endif
149 #if XGEMM_P == xgemm_p
150 BLASLONG xgemm_p = DEFAULT_GEMM_P;
151 #else
152 BLASLONG xgemm_p = XGEMM_P;
153 #endif
154 #if QGEMM_Q == qgemm_q
155 BLASLONG qgemm_q = DEFAULT_GEMM_Q;
156 #else
157 BLASLONG qgemm_q = QGEMM_Q;
158 #endif
159 #if XGEMM_Q == xgemm_q
160 BLASLONG xgemm_q = DEFAULT_GEMM_Q;
161 #else
162 BLASLONG xgemm_q = XGEMM_Q;
163 #endif
164 #if QGEMM_R == qgemm_r
165 BLASLONG qgemm_r = DEFAULT_GEMM_R;
166 #else
167 BLASLONG qgemm_r = QGEMM_R;
168 #endif
169 #if XGEMM_R == xgemm_r
170 BLASLONG xgemm_r = DEFAULT_GEMM_R;
171 #else
172 BLASLONG xgemm_r = XGEMM_R;
173 #endif
174 #endif
175 
176 #if defined(ARCH_X86) || defined(ARCH_X86_64)
177 
/*
 * Query CPUID for the L2 cache size.
 *
 * For the targets listed in the #if below, the value is taken from bits
 * 31..16 of ECX returned by extended leaf 0x80000006.  For every other
 * target, the cache descriptor bytes returned by leaf 2 are scanned in
 * register order (EAX, EBX, ECX, EDX; low byte first) and the size mapped
 * to the first recognised descriptor is returned.
 *
 * Returns the size (the leaf 2 table yields 128 .. 4096; presumably KB, see
 * the CPUID documentation), or 0 when leaf 2 reports no recognised
 * descriptor.
 */
int get_L2_size(void){

#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
    defined(CORE_PRESCOTT) || defined(CORE_CORE2)       || defined(PENRYN) || defined(DUNNINGTON) || \
    defined(CORE_NEHALEM)  || defined(CORE_SANDYBRIDGE) || defined(ATOM)   || defined(GENERIC)    || \
    defined(PILEDRIVER)    || defined(HASWELL)          || defined(STEAMROLLER) || defined(EXCAVATOR) || \
    defined(ZEN)           || defined(SKYLAKEX)         || defined(COOPERLAKE)

  int eax, ebx, ecx, edx;

  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

  return BITMASK(ecx, 16, 0xffff);

#else

  int regs[4];   /* EAX, EBX, ECX, EDX in that order */
  int r, byte;

  cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);

  for (r = 0; r < 4; r++){

    /* Bit 31 set means the register carries no valid descriptors, so its
       bytes must not be decoded. */
    if (regs[r] < 0) continue;

    /* The low byte of EAX is not a descriptor; start EAX at byte 1. */
    for (byte = (r == 0) ? 1 : 0; byte < 4; byte++){

      int shift = byte * 8;

      switch (BITMASK(regs[r], shift, 0xff)){
        case 0x3b :
        case 0x41 :
        case 0x79 :
          return  128;

        case 0x3c :
        case 0x42 :
        case 0x7a :
        case 0x7e :
        case 0x82 :
          return  256;

        case 0x43 :
        case 0x7b :
        case 0x7f :
        case 0x83 :
        case 0x86 :
          return  512;

        case 0x44 :
        case 0x78 :
        case 0x7c :
        case 0x84 :
        case 0x87 :
          return 1024;

        case 0x45 :
        case 0x7d :
        case 0x85 :
          return 2048;

        case 0x49 :
          return 4096;

        default :
          break;
      }
    }
  }

  /* No recognised L2 descriptor was reported. */
  return 0;
#endif
}
266 
blas_set_parameter(void)267 void blas_set_parameter(void){
268 
269   int factor;
270 #if defined(BULLDOZER) || defined(PILEDRIVER)  || defined(SANDYBRIDGE) || defined(NEHALEM) || \
271     defined(HASWELL)   || defined(STEAMROLLER) || defined(EXCAVATOR)   || defined(ZEN)     || \
272     defined(SKYLAKEX)  || defined(COOPERLAKE)
273   int size = 16;
274 #else
275   int size = get_L2_size();
276 #endif
277 
278 #if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
279   size >>= 7;
280 
281 #if defined(CORE_BANIAS) && (HAVE_HIT > 1)
282   sgemm_p =  64 / HAVE_HIT * size;
283   dgemm_p =  32 / HAVE_HIT * size;
284   cgemm_p =  32 / HAVE_HIT * size;
285   zgemm_p =  16 / HAVE_HIT * size;
286 #ifdef EXPRECISION
287   qgemm_p =  16 / HAVE_HIT * size;
288   xgemm_p =   8 / HAVE_HIT * size;
289 #endif
290 #ifdef QUAD_PRECISION
291   qgemm_p =   8 / HAVE_HIT * size;
292   xgemm_p =   4 / HAVE_HIT * size;
293 #endif
294 #else
295   sgemm_p =  64 * size;
296   dgemm_p =  32 * size;
297   cgemm_p =  32 * size;
298   zgemm_p =  16 * size;
299 #ifdef EXPRECISION
300   qgemm_p =  16 * size;
301   xgemm_p =   8 * size;
302 #endif
303 #ifdef QUAD_PRECISION
304   qgemm_p =   8 * size;
305   xgemm_p =   4 * size;
306 #endif
307 #endif
308 #endif
309 
310 #if defined(CORE_NORTHWOOD)
311   size >>= 7;
312 
313 #ifdef ALLOC_HUGETLB
314   sgemm_p = 128 * size;
315   dgemm_p =  64 * size;
316   cgemm_p =  64 * size;
317   zgemm_p =  32 * size;
318 #ifdef EXPRECISION
319   qgemm_p =  32 * size;
320   xgemm_p =  16 * size;
321 #endif
322 #ifdef QUAD_PRECISION
323   qgemm_p =  16 * size;
324   xgemm_p =   8 * size;
325 #endif
326 #else
327   sgemm_p =  96 * size;
328   dgemm_p =  48 * size;
329   cgemm_p =  48 * size;
330   zgemm_p =  24 * size;
331 #ifdef EXPRECISION
332   qgemm_p =  24 * size;
333   xgemm_p =  12 * size;
334 #endif
335 #ifdef QUAD_PRECISION
336   qgemm_p =  12 * size;
337   xgemm_p =   6 * size;
338 #endif
339 #endif
340 #endif
341 
342 #if defined(CORE_CORE2)
343 
344   size >>= 9;
345 
346   sgemm_p =  92 * size;
347   dgemm_p =  46 * size;
348   cgemm_p =  46 * size;
349   zgemm_p =  23 * size;
350 
351 #ifdef EXPRECISION
352   qgemm_p =  23 * size;
353   xgemm_p =  11 * size;
354 #endif
355 #ifdef QUAD_PRECISION
356   qgemm_p =  11 * size;
357   xgemm_p =   5 * size;
358 #endif
359 #endif
360 
361 #if defined(PENRYN)
362 
363   size >>= 9;
364 
365   sgemm_p = 1024;
366   dgemm_p =  512;
367   cgemm_p =  512;
368   zgemm_p =  256;
369 
370 #ifdef EXPRECISION
371   qgemm_p =  256;
372   xgemm_p =  128;
373 #endif
374 #ifdef QUAD_PRECISION
375   qgemm_p =  21 * size + 4;
376   xgemm_p =  10 * size + 2;
377 #endif
378 #endif
379 
380 #if defined(DUNNINGTON)
381 
382   size >>= 9;
383 
384   sgemm_p = 384;
385   dgemm_p = 384;
386   cgemm_p = 384;
387   zgemm_p = 384;
388 
389 #ifdef EXPRECISION
390   qgemm_p = 384;
391   xgemm_p = 384;
392 #endif
393 #ifdef QUAD_PRECISION
394   qgemm_p =  21 * size + 4;
395   xgemm_p =  10 * size + 2;
396 #endif
397 #endif
398 
399 #if defined(NEHALEM)
400   sgemm_p = 1024;
401   dgemm_p =  512;
402   cgemm_p =  512;
403   zgemm_p =  256;
404 #ifdef EXPRECISION
405   qgemm_p =  256;
406   xgemm_p =  128;
407 #endif
408 #endif
409 
410 #if defined(SANDYBRIDGE)
411   sgemm_p = 1024;
412   dgemm_p =  512;
413   cgemm_p =  512;
414   zgemm_p =  256;
415 #ifdef EXPRECISION
416   qgemm_p =  256;
417   xgemm_p =  128;
418 #endif
419 #endif
420 
421 #if defined(CORE_PRESCOTT)  || defined(GENERIC)
422   size >>= 6;
423 
424   if (size > 16) size = 16;
425 
426   sgemm_p =  56 * size;
427   dgemm_p =  28 * size;
428   cgemm_p =  28 * size;
429   zgemm_p =  14 * size;
430 #ifdef EXPRECISION
431   qgemm_p =  14 * size;
432   xgemm_p =   7 * size;
433 #endif
434 #ifdef QUAD_PRECISION
435   qgemm_p =   7 * size;
436   xgemm_p =   3 * size;
437 #endif
438 #endif
439 
440 #if defined(CORE_OPTERON)
441   sgemm_p =  224 + 14 * (size >> 5);
442   dgemm_p =  112 + 14 * (size >> 6);
443   cgemm_p =  116 + 14 * (size >> 6);
444   zgemm_p =   58 + 14 * (size >> 7);
445 #ifdef EXPRECISION
446   qgemm_p =   58 + 14 * (size >> 7);
447   xgemm_p =   29 + 14 * (size >> 8);
448 #endif
449 #ifdef QUAD_PRECISION
450   qgemm_p =   29 + 14 * (size >> 8);
451   xgemm_p =   15 + 14 * (size >> 9);
452 #endif
453 #endif
454 
455 #if defined(ATOM)
456   size >>= 8;
457 
458   sgemm_p =  256;
459   dgemm_p =  128;
460   cgemm_p =  128;
461   zgemm_p =   64;
462 #ifdef EXPRECISION
463   qgemm_p =   64;
464   xgemm_p =   32;
465 #endif
466 #ifdef QUAD_PRECISION
467   qgemm_p =   32;
468   xgemm_p =   16;
469 #endif
470 #endif
471 
472 #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
473   size >>= 8;
474 
475   sgemm_p = 232 * size;
476   dgemm_p = 116 * size;
477   cgemm_p = 116 * size;
478   zgemm_p =  58 * size;
479 #ifdef EXPRECISION
480   qgemm_p =  58 * size;
481   xgemm_p =  26 * size;
482 #endif
483 #ifdef QUAD_PRECISION
484   qgemm_p =  26 * size;
485   xgemm_p =  13 * size;
486 #endif
487 #endif
488 
489   factor=openblas_block_factor();
490   if (factor>0) {
491     if (factor <  10) factor =  10;
492     if (factor > 200) factor = 200;
493 
494     sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
495     dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
496     cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
497     zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
498 #ifdef EXPRECISION
499     qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
500     xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
501 #endif
502   }
503 
504   if (sgemm_p == 0) sgemm_p = 64;
505   if (dgemm_p == 0) dgemm_p = 64;
506   if (cgemm_p == 0) cgemm_p = 64;
507   if (zgemm_p == 0) zgemm_p = 64;
508 #ifdef EXPRECISION
509   if (qgemm_p == 0) qgemm_p = 64;
510   if (xgemm_p == 0) xgemm_p = 64;
511 #endif
512 
513 #ifdef QUAD_PRECISION
514   if (qgemm_p == 0) qgemm_p = 64;
515   if (xgemm_p == 0) xgemm_p = 64;
516 #endif
517 
518   sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
519   dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
520   cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
521   zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
522 #ifdef QUAD_PRECISION
523   qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
524   xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
525 #endif
526 
527 #ifdef BUILD_BFLOAT16
528   sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q *  4)) - 15) & ~15;
529 #endif
530   sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
531   dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
532   cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q *  8)) - 15) & ~15;
533   zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
534 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
535   qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
536   xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
537 #endif
538 
539 #if 0
540   fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
541   fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
542   fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
543   fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
544 #endif
545 
546   return;
547 }
548 
549 #if 0
550 
/*
 * Diagnostic helper: print the APIC id and the core count of the current
 * CPU to stderr.
 *
 * The values are read through CPUID only when CORE_PRESCOTT or CORE_OPTERON
 * is defined; otherwise the zero-initialised defaults are reported (core
 * count 1 after the final increment).  Always returns 0.
 */
int get_current_cpu_info(void){

  /* All locals start at 0 so that builds which compile none of the CPUID
     sections below never read an indeterminate value. */
  int nlprocs   = 0;
  int ncores    = 0;
  int cmplegacy = 0;
  int htt       = 0;
  int apicid    = 0;

#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);
  nlprocs = BITMASK(ebx, 16, 0xff);
  apicid  = BITMASK(ebx, 24, 0xff);
  htt     = BITMASK(edx, 28, 0x01);
#endif

#if defined(CORE_PRESCOTT)
  cpuid(4, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(eax, 26, 0x3f);

  if (htt == 0)  nlprocs = 0;
#endif

#if defined(CORE_OPTERON)
  cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(ecx,  0, 0xff);

  cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  cmplegacy = BITMASK(ecx,  1, 0x01);

  if (htt == 0) {
    nlprocs = 0;
    ncores  = 0;
    cmplegacy = 0;
  }
#endif

  /* The value read above is incremented before it is reported. */
  ncores  ++;

  /* These are collected but not reported. */
  (void)nlprocs;
  (void)cmplegacy;
  (void)htt;

  fprintf(stderr, "APICID = %d  Number of core = %d\n", apicid, ncores);

  return 0;
}
593 #endif
594 
595 #endif
596 
597 #if defined(ARCH_IA64)
598 
/*
 * Read the IA-64 CPUID register selected by `regnum` and return its value.
 * When __ECC is defined the __getIndReg intrinsic is used, otherwise an
 * inline "mov" from cpuid[].
 */
static inline BLASULONG cpuid(BLASULONG regnum){
#ifdef __ECC
  return __getIndReg(_IA64_REG_INDR_CPUID, regnum);
#else
  BLASULONG result;

  asm ("mov %0=cpuid[%r1]" : "=r"(result) : "rO"(regnum));

  return result;
#endif
}
610 
611 #if 1
612 
blas_set_parameter(void)613 void blas_set_parameter(void){
614 
615   BLASULONG cpuid3, size;
616 
617   cpuid3 = cpuid(3);
618 
619   size = BITMASK(cpuid3, 16, 0xff);
620 
621   sbgemm_p = 192 * (size + 1);
622   sgemm_p = 192 * (size + 1);
623   dgemm_p =  96 * (size + 1);
624   cgemm_p =  96 * (size + 1);
625   zgemm_p =  48 * (size + 1);
626 #ifdef EXPRECISION
627   qgemm_p =  64 * (size + 1);
628   xgemm_p =  32 * (size + 1);
629 #endif
630 #ifdef QUAD_PRECISION
631   qgemm_p =  32 * (size + 1);
632   xgemm_p =  16 * (size + 1);
633 #endif
634 
635 #ifdef BUILD_BFLOAT16
636   sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q *  4)) - 15) & ~15;
637 #endif
638   sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
639   dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
640   cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q *  8)) - 15) & ~15;
641   zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
642 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
643   qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
644   xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
645 #endif
646 
647   return;
648 }
649 
650 #else
651 
652 #define IA64_SYS_NAME  "/sys/devices/system/cpu/cpu0/cache/index3/size"
653 #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
654 
blas_set_parameter(void)655 void blas_set_parameter(void){
656 
657   BLASULONG cpuid3;
658   int size = 0;
659 
660 #if 1
661   char buffer[128];
662   FILE *infile;
663 
664   if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
665 
666     fgets(buffer, sizeof(buffer), infile);
667     fclose(infile);
668 
669     size = atoi(buffer) / 1536;
670   }
671 
672   if (size <= 0) {
673     if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
674 
675       while(fgets(buffer, sizeof(buffer), infile) != NULL) {
676 	if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
677       }
678 
679       fgets(buffer, sizeof(buffer), infile);
680 
681       fclose(infile);
682 
683       *strstr(buffer, "bytes") = (char)NULL;
684 
685       size = atoi(strchr(buffer, ':') + 1) / 1572864;
686     }
687   }
688 #endif
689 
690   /* The last resort */
691 
692   if (size <= 0) {
693     cpuid3 = cpuid(3);
694 
695     size = BITMASK(cpuid3, 16, 0xff) + 1;
696   }
697 
698   sgemm_p = 320 * size;
699   dgemm_p = 160 * size;
700   cgemm_p = 160 * size;
701   zgemm_p =  80 * size;
702 #ifdef EXPRECISION
703   qgemm_p =  80 * size;
704   xgemm_p =  40 * size;
705 #endif
706 
707   sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
708   dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
709   cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q *  8)) - 15) & ~15;
710   zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
711 #ifdef EXPRECISION
712   qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
713   xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
714 #endif
715 
716   return;
717 }
718 
719 #endif
720 
721 #endif
722 
#if defined(ARCH_MIPS64)
/*
 * Set the GEMM blocking parameters for MIPS64 builds.
 *
 * Only LOONGSON3R3 / LOONGSON3R4 targets change anything: dgemm_r becomes
 * 1024 when a single thread is in use (always the case without SMP) and
 * 200 otherwise.  Other MIPS64 targets leave every parameter untouched.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#ifdef SMP
  /* single thread : multi thread */
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  /* single thread */
  dgemm_r = 1024;
#endif
#endif
}
#endif
741 
#if defined(ARCH_ARM64)

/*
 * ARM64 builds perform no run-time adjustment: the global parameters keep
 * the values they were initialised with.
 */
void blas_set_parameter(void){
  /* intentionally empty */
}

#endif
749