1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include <string.h>
41 #include "common.h"
42
/*
 * Defined in another translation unit.  blas_set_parameter() below treats a
 * positive return value as a percentage by which the P blocking values are
 * scaled.  Declared with a real prototype ("(void)") so that calls are
 * type-checked; an empty parameter list would leave the arguments unspecified.
 */
extern int openblas_block_factor(void);

/* Forward declaration; the x86 definition appears later in this file. */
int get_L2_size(void);

/*
 * Fallback values used to initialise the global blocking parameters and
 * buffer offsets when the configuration does not supply a usable
 * compile-time constant.
 */
#define DEFAULT_GEMM_P 128
#define DEFAULT_GEMM_Q 128
#define DEFAULT_GEMM_R 128
#define DEFAULT_GEMM_OFFSET_A 0
#define DEFAULT_GEMM_OFFSET_B 0
51
/* Global Parameter */

/*
 * Definitions of the global buffer offsets.
 *
 * Inside a #if expression every identifier that is not a macro evaluates to
 * 0.  The comparison "GEMM_OFFSET_A == gemm_offset_a" is therefore true when
 * GEMM_OFFSET_A expands to something that evaluates to 0 -- in particular
 * when it expands to the variable name itself (presumably the run-time
 * tunable configuration; confirm against the project headers) -- and the
 * DEFAULT_* value is used.  Any non-zero compile-time constant takes the
 * #else branch and initialises the variable directly.
 */
#if GEMM_OFFSET_A == gemm_offset_a
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
#else
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
#endif

#if GEMM_OFFSET_B == gemm_offset_b
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
#else
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
#endif
64
/*
 * Global P blocking values, one per precision prefix (sb, s, d, c, z).
 *
 * In a #if expression a non-macro identifier evaluates to 0, so each test
 * "xGEMM_P == xgemm_p" is true when xGEMM_P evaluates to 0 (for example when
 * it expands to the variable name itself) and DEFAULT_GEMM_P is used;
 * a non-zero compile-time constant initialises the variable directly.
 */
#if SBGEMM_P == sbgemm_p
BLASLONG sbgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG sbgemm_p = SBGEMM_P;
#endif
#if SGEMM_P == sgemm_p
BLASLONG sgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG sgemm_p = SGEMM_P;
#endif
#if DGEMM_P == dgemm_p
BLASLONG dgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG dgemm_p = DGEMM_P;
#endif
#if CGEMM_P == cgemm_p
BLASLONG cgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG cgemm_p = CGEMM_P;
#endif
#if ZGEMM_P == zgemm_p
BLASLONG zgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG zgemm_p = ZGEMM_P;
#endif
90
/*
 * Global Q blocking values, one per precision prefix (sb, s, d, c, z).
 *
 * In a #if expression a non-macro identifier evaluates to 0, so each test
 * "xGEMM_Q == xgemm_q" is true when xGEMM_Q evaluates to 0 (for example when
 * it expands to the variable name itself) and DEFAULT_GEMM_Q is used;
 * a non-zero compile-time constant initialises the variable directly.
 */
#if SBGEMM_Q == sbgemm_q
BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG sbgemm_q = SBGEMM_Q;
#endif
#if SGEMM_Q == sgemm_q
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG sgemm_q = SGEMM_Q;
#endif
#if DGEMM_Q == dgemm_q
BLASLONG dgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG dgemm_q = DGEMM_Q;
#endif
#if CGEMM_Q == cgemm_q
BLASLONG cgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG cgemm_q = CGEMM_Q;
#endif
#if ZGEMM_Q == zgemm_q
BLASLONG zgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG zgemm_q = ZGEMM_Q;
#endif
116
/*
 * Global R blocking values, one per precision prefix (sb, s, d, c, z).
 *
 * In a #if expression a non-macro identifier evaluates to 0, so each test
 * "xGEMM_R == xgemm_r" is true when xGEMM_R evaluates to 0 (for example when
 * it expands to the variable name itself) and DEFAULT_GEMM_R is used;
 * a non-zero compile-time constant initialises the variable directly.
 */
#if SBGEMM_R == sbgemm_r
BLASLONG sbgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG sbgemm_r = SBGEMM_R;
#endif
#if SGEMM_R == sgemm_r
BLASLONG sgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG sgemm_r = SGEMM_R;
#endif
#if DGEMM_R == dgemm_r
BLASLONG dgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG dgemm_r = DGEMM_R;
#endif
#if CGEMM_R == cgemm_r
BLASLONG cgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG cgemm_r = CGEMM_R;
#endif
#if ZGEMM_R == zgemm_r
BLASLONG zgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG zgemm_r = ZGEMM_R;
#endif
142
/*
 * P, Q and R blocking values for the "q" and "x" precision prefixes.  They
 * exist only when EXPRECISION or QUAD_PRECISION is defined.
 *
 * In a #if expression a non-macro identifier evaluates to 0, so each test
 * "MACRO == variable" is true when MACRO evaluates to 0 (for example when it
 * expands to the variable name itself) and the DEFAULT_GEMM_* value is used;
 * a non-zero compile-time constant initialises the variable directly.
 */
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
#if QGEMM_P == qgemm_p
BLASLONG qgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG qgemm_p = QGEMM_P;
#endif
#if XGEMM_P == xgemm_p
BLASLONG xgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG xgemm_p = XGEMM_P;
#endif
#if QGEMM_Q == qgemm_q
BLASLONG qgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG qgemm_q = QGEMM_Q;
#endif
#if XGEMM_Q == xgemm_q
BLASLONG xgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG xgemm_q = XGEMM_Q;
#endif
#if QGEMM_R == qgemm_r
BLASLONG qgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG qgemm_r = QGEMM_R;
#endif
#if XGEMM_R == xgemm_r
BLASLONG xgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG xgemm_r = XGEMM_R;
#endif
#endif
175
176 #if defined(ARCH_X86) || defined(ARCH_X86_64)
177
/*
 * Queries CPUID for an L2 cache size figure (presumably in kilobytes --
 * confirm against the CPUID documentation).
 *
 * For the cores listed in the #if below, the value is taken from the field
 * BITMASK(ecx, 16, 0xffff) of CPUID function 0x80000006.  For every other
 * core, the descriptor bytes returned by CPUID function 2 are scanned and
 * the first recognised descriptor decides the result.
 *
 * Returns the size figure, or 0 when no descriptor byte is recognised.
 */
int get_L2_size(void){

  int reg[4];   /* eax, ebx, ecx, edx in that order */

#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
    defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
    defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
    defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
    defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)

  cpuid(0x80000006, &reg[0], &reg[1], &reg[2], &reg[3]);

  return BITMASK(reg[2], 16, 0xffff);

#else

  int pos;

  cpuid(2, &reg[0], &reg[1], &reg[2], &reg[3]);

  /*
   * Visit the 15 descriptor bytes in the order eax[1..3], ebx[0..3],
   * ecx[0..3], edx[0..3]; byte 0 of eax is not a candidate.
   */
  for (pos = 1; pos < 16; pos++){
    int word       = reg[pos >> 2];
    int shift      = (pos & 3) * 8;
    int descriptor = BITMASK(word, shift, 0xff);
    int found      = 0;

    switch (descriptor){
    case 0x3b: case 0x41: case 0x79:
      found = 128;
      break;

    case 0x3c: case 0x42: case 0x7a: case 0x7e: case 0x82:
      found = 256;
      break;

    case 0x43: case 0x7b: case 0x7f: case 0x83: case 0x86:
      found = 512;
      break;

    case 0x44: case 0x78: case 0x7c: case 0x84: case 0x87:
      found = 1024;
      break;

    case 0x45: case 0x7d: case 0x85:
      found = 2048;
      break;

    case 0x49:
      found = 4096;
      break;
    }

    /* The first recognised descriptor wins. */
    if (found != 0) return found;
  }

  /* No descriptor byte matched any known value. */
  return 0;
#endif
}
266
/*
 * x86 / x86-64 run-time tuning of the global blocking parameters.
 *
 * Steps, in order:
 *   1. Obtain a cache size figure: a fixed 16 for the cores listed in the
 *      first #if, otherwise the result of get_L2_size().
 *   2. In whichever CPU-specific section is compiled in, assign the *_p
 *      values (some sections scale with the cache size, others use fixed
 *      numbers).
 *   3. If openblas_block_factor() returns a positive value, rescale the *_p
 *      values by that percentage.
 *   4. Replace any *_p that ended up 0 and round the *_p values up to a
 *      multiple of the corresponding *_UNROLL_M.
 *   5. Derive the *_r values from BUFFER_SIZE and the P/Q macros.
 *
 * Takes no arguments and returns nothing; all results are written to the
 * global variables.  Note that sbgemm_p is not modified by this function.
 */
void blas_set_parameter(void){

  int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
    defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
    defined(SKYLAKEX) || defined(COOPERLAKE)
  /* Fixed value; CPUID is not queried for these cores. */
  int size = 16;
#else
  int size = get_L2_size();
#endif

#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
  /* Work in units of 128 of the reported size. */
  size >>= 7;

#if defined(CORE_BANIAS) && (HAVE_HIT > 1)
  /* Integer division by HAVE_HIT happens before the multiplication. */
  sgemm_p = 64 / HAVE_HIT * size;
  dgemm_p = 32 / HAVE_HIT * size;
  cgemm_p = 32 / HAVE_HIT * size;
  zgemm_p = 16 / HAVE_HIT * size;
#ifdef EXPRECISION
  qgemm_p = 16 / HAVE_HIT * size;
  xgemm_p = 8 / HAVE_HIT * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 8 / HAVE_HIT * size;
  xgemm_p = 4 / HAVE_HIT * size;
#endif
#else
  sgemm_p = 64 * size;
  dgemm_p = 32 * size;
  cgemm_p = 32 * size;
  zgemm_p = 16 * size;
#ifdef EXPRECISION
  qgemm_p = 16 * size;
  xgemm_p = 8 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 8 * size;
  xgemm_p = 4 * size;
#endif
#endif
#endif

#if defined(CORE_NORTHWOOD)
  /* Work in units of 128 of the reported size. */
  size >>= 7;

#ifdef ALLOC_HUGETLB
  /* Larger multipliers are used when ALLOC_HUGETLB is defined. */
  sgemm_p = 128 * size;
  dgemm_p = 64 * size;
  cgemm_p = 64 * size;
  zgemm_p = 32 * size;
#ifdef EXPRECISION
  qgemm_p = 32 * size;
  xgemm_p = 16 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 16 * size;
  xgemm_p = 8 * size;
#endif
#else
  sgemm_p = 96 * size;
  dgemm_p = 48 * size;
  cgemm_p = 48 * size;
  zgemm_p = 24 * size;
#ifdef EXPRECISION
  qgemm_p = 24 * size;
  xgemm_p = 12 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 12 * size;
  xgemm_p = 6 * size;
#endif
#endif
#endif

#if defined(CORE_CORE2)

  /* Work in units of 512 of the reported size. */
  size >>= 9;

  sgemm_p = 92 * size;
  dgemm_p = 46 * size;
  cgemm_p = 46 * size;
  zgemm_p = 23 * size;

#ifdef EXPRECISION
  qgemm_p = 23 * size;
  xgemm_p = 11 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 11 * size;
  xgemm_p = 5 * size;
#endif
#endif

#if defined(PENRYN)

  /* Only the QUAD_PRECISION assignments below depend on size. */
  size >>= 9;

  sgemm_p = 1024;
  dgemm_p = 512;
  cgemm_p = 512;
  zgemm_p = 256;

#ifdef EXPRECISION
  qgemm_p = 256;
  xgemm_p = 128;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 21 * size + 4;
  xgemm_p = 10 * size + 2;
#endif
#endif

#if defined(DUNNINGTON)

  /* Only the QUAD_PRECISION assignments below depend on size. */
  size >>= 9;

  sgemm_p = 384;
  dgemm_p = 384;
  cgemm_p = 384;
  zgemm_p = 384;

#ifdef EXPRECISION
  qgemm_p = 384;
  xgemm_p = 384;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 21 * size + 4;
  xgemm_p = 10 * size + 2;
#endif
#endif

#if defined(NEHALEM)
  /* Fixed values; no QUAD_PRECISION variant is provided in this section. */
  sgemm_p = 1024;
  dgemm_p = 512;
  cgemm_p = 512;
  zgemm_p = 256;
#ifdef EXPRECISION
  qgemm_p = 256;
  xgemm_p = 128;
#endif
#endif

#if defined(SANDYBRIDGE)
  /* Fixed values; no QUAD_PRECISION variant is provided in this section. */
  sgemm_p = 1024;
  dgemm_p = 512;
  cgemm_p = 512;
  zgemm_p = 256;
#ifdef EXPRECISION
  qgemm_p = 256;
  xgemm_p = 128;
#endif
#endif

#if defined(CORE_PRESCOTT) || defined(GENERIC)
  /* Work in units of 64 of the reported size, capped at 16 units. */
  size >>= 6;

  if (size > 16) size = 16;

  sgemm_p = 56 * size;
  dgemm_p = 28 * size;
  cgemm_p = 28 * size;
  zgemm_p = 14 * size;
#ifdef EXPRECISION
  qgemm_p = 14 * size;
  xgemm_p = 7 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 7 * size;
  xgemm_p = 3 * size;
#endif
#endif

#if defined(CORE_OPTERON)
  /* A fixed base plus a term that grows with the reported size. */
  sgemm_p = 224 + 14 * (size >> 5);
  dgemm_p = 112 + 14 * (size >> 6);
  cgemm_p = 116 + 14 * (size >> 6);
  zgemm_p = 58 + 14 * (size >> 7);
#ifdef EXPRECISION
  qgemm_p = 58 + 14 * (size >> 7);
  xgemm_p = 29 + 14 * (size >> 8);
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 29 + 14 * (size >> 8);
  xgemm_p = 15 + 14 * (size >> 9);
#endif
#endif

#if defined(ATOM)
  /* size is shifted here but none of the assignments below read it. */
  size >>= 8;

  sgemm_p = 256;
  dgemm_p = 128;
  cgemm_p = 128;
  zgemm_p = 64;
#ifdef EXPRECISION
  qgemm_p = 64;
  xgemm_p = 32;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 32;
  xgemm_p = 16;
#endif
#endif

#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
  /* Work in units of 256 of the reported size. */
  size >>= 8;

  sgemm_p = 232 * size;
  dgemm_p = 116 * size;
  cgemm_p = 116 * size;
  zgemm_p = 58 * size;
#ifdef EXPRECISION
  qgemm_p = 58 * size;
  xgemm_p = 26 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 26 * size;
  xgemm_p = 13 * size;
#endif
#endif

  /*
   * Optional rescaling: a positive factor is clamped to [10, 200], applied
   * as a percentage, and the result has its low three bits cleared (i.e. it
   * is rounded down to a multiple of 8).
   */
  factor=openblas_block_factor();
  if (factor>0) {
    if (factor < 10) factor = 10;
    if (factor > 200) factor = 200;

    sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
    dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
    cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
    zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
    /* NOTE(review): q/x are rescaled only under EXPRECISION, not under
       QUAD_PRECISION -- confirm whether that is intended. */
#ifdef EXPRECISION
    qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
    xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
#endif
  }

  /* A value of 0 (e.g. after the rounding above) is replaced by 64. */
  if (sgemm_p == 0) sgemm_p = 64;
  if (dgemm_p == 0) dgemm_p = 64;
  if (cgemm_p == 0) cgemm_p = 64;
  if (zgemm_p == 0) zgemm_p = 64;
#ifdef EXPRECISION
  if (qgemm_p == 0) qgemm_p = 64;
  if (xgemm_p == 0) xgemm_p = 64;
#endif

#ifdef QUAD_PRECISION
  if (qgemm_p == 0) qgemm_p = 64;
  if (xgemm_p == 0) xgemm_p = 64;
#endif

  /* Round each P value up to the next multiple of its UNROLL_M. */
  sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
  dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
  cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
  zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
  /* NOTE(review): q/x are rounded only under QUAD_PRECISION, not under
     EXPRECISION -- confirm whether that is intended. */
#ifdef QUAD_PRECISION
  qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
  xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif

  /*
   * R values: take BUFFER_SIZE, subtract the P*Q*k panel (plus
   * GEMM_OFFSET_A, rounded with GEMM_ALIGN), divide by Q*k, subtract 15 and
   * clear the low four bits.  The constant k (4, 8, 8, 16, 16, 32) is
   * presumably the per-element size in bytes -- confirm.  These lines read
   * the *_P / *_Q macros, so they must stay after the P assignments above
   * if those macros expand to the global variables.
   */
#ifdef BUILD_BFLOAT16
  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
  qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
#endif

  /* Disabled debug output of the resulting parameters. */
#if 0
  fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
  fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
  fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
  fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
#endif

  return;
}
548
549 #if 0
550
/*
 * Debug helper: reads a few CPUID fields and prints the APIC id and a core
 * count to stderr.  Always returns 0.
 *
 * The CPUID queries are compiled in only for CORE_PRESCOTT and CORE_OPTERON;
 * on any other configuration the zero initial values are reported (core
 * count 1 after the increment below).
 */
int get_current_cpu_info(void){

  /*
   * Everything starts at zero so that configurations which skip the CPUID
   * sections never read an indeterminate value.
   */
  int nlprocs   = 0;
  int ncores    = 0;
  int cmplegacy = 0;
  int htt       = 0;
  int apicid    = 0;

#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);
  nlprocs = BITMASK(ebx, 16, 0xff);
  apicid  = BITMASK(ebx, 24, 0xff);
  htt     = BITMASK(edx, 28, 0x01);
#endif

#if defined(CORE_PRESCOTT)
  cpuid(4, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(eax, 26, 0x3f);

  if (htt == 0) nlprocs = 0;
#endif

#if defined(CORE_OPTERON)
  cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(ecx, 0, 0xff);

  cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  cmplegacy = BITMASK(ecx, 1, 0x01);

  /* When the htt bit is clear, every derived count is reset. */
  if (htt == 0) {
    nlprocs   = 0;
    ncores    = 0;
    cmplegacy = 0;
  }
#endif

  /* The value read from CPUID is one less than the number printed. */
  ncores ++;

  /* Collected above but not reported; referenced to avoid unused warnings. */
  (void)nlprocs;
  (void)cmplegacy;
  (void)htt;

  fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores);

  return 0;
}
593 #endif
594
595 #endif
596
597 #if defined(ARCH_IA64)
598
/*
 * Returns the contents of the IA-64 CPUID register selected by regnum.
 * When __ECC is defined the __getIndReg intrinsic is used (presumably the
 * Intel compiler -- confirm); otherwise the register is read with inline
 * assembly.
 */
static inline BLASULONG cpuid(BLASULONG regnum){
#ifdef __ECC
  BLASULONG contents = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
#else
  BLASULONG contents;

  asm ("mov %0=cpuid[%r1]" : "=r"(contents) : "rO"(regnum));
#endif

  return contents;
}
610
611 #if 1
612
blas_set_parameter(void)613 void blas_set_parameter(void){
614
615 BLASULONG cpuid3, size;
616
617 cpuid3 = cpuid(3);
618
619 size = BITMASK(cpuid3, 16, 0xff);
620
621 sbgemm_p = 192 * (size + 1);
622 sgemm_p = 192 * (size + 1);
623 dgemm_p = 96 * (size + 1);
624 cgemm_p = 96 * (size + 1);
625 zgemm_p = 48 * (size + 1);
626 #ifdef EXPRECISION
627 qgemm_p = 64 * (size + 1);
628 xgemm_p = 32 * (size + 1);
629 #endif
630 #ifdef QUAD_PRECISION
631 qgemm_p = 32 * (size + 1);
632 xgemm_p = 16 * (size + 1);
633 #endif
634
635 #ifdef BUILD_BFLOAT16
636 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
637 #endif
638 sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
639 dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
640 cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
641 zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
642 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
643 qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
644 xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
645 #endif
646
647 return;
648 }
649
650 #else
651
652 #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size"
653 #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
654
blas_set_parameter(void)655 void blas_set_parameter(void){
656
657 BLASULONG cpuid3;
658 int size = 0;
659
660 #if 1
661 char buffer[128];
662 FILE *infile;
663
664 if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
665
666 fgets(buffer, sizeof(buffer), infile);
667 fclose(infile);
668
669 size = atoi(buffer) / 1536;
670 }
671
672 if (size <= 0) {
673 if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
674
675 while(fgets(buffer, sizeof(buffer), infile) != NULL) {
676 if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
677 }
678
679 fgets(buffer, sizeof(buffer), infile);
680
681 fclose(infile);
682
683 *strstr(buffer, "bytes") = (char)NULL;
684
685 size = atoi(strchr(buffer, ':') + 1) / 1572864;
686 }
687 }
688 #endif
689
690 /* The last resort */
691
692 if (size <= 0) {
693 cpuid3 = cpuid(3);
694
695 size = BITMASK(cpuid3, 16, 0xff) + 1;
696 }
697
698 sgemm_p = 320 * size;
699 dgemm_p = 160 * size;
700 cgemm_p = 160 * size;
701 zgemm_p = 80 * size;
702 #ifdef EXPRECISION
703 qgemm_p = 80 * size;
704 xgemm_p = 40 * size;
705 #endif
706
707 sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
708 dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
709 cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
710 zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
711 #ifdef EXPRECISION
712 qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
713 xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
714 #endif
715
716 return;
717 }
718
719 #endif
720
721 #endif
722
723 #if defined(ARCH_MIPS64)
/*
 * MIPS64 tuning.  Only LOONGSON3R3 / LOONGSON3R4 builds change anything:
 * dgemm_r becomes 1024, except in SMP builds running with a thread count
 * other than one, where it becomes 200.  All other builds leave the global
 * parameters untouched.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#ifdef SMP
  /* One thread gets the large value, several threads the small one. */
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  /* Non-SMP builds always use the single-thread value. */
  dgemm_r = 1024;
#endif
#endif
}
740 #endif
741
742 #if defined(ARCH_ARM64)
743
/*
 * ARM64: intentionally a no-op.  The global blocking parameters keep the
 * values they were initialised with.
 */
void blas_set_parameter(void)
{
  return;
}
747
748 #endif
749