1 /*********************************************************************/
2 /*                                                                   */
3 /*             Optimized BLAS libraries                              */
4 /*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5 /*                                                                   */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of     */
16 /* profits, interruption of business, or related expenses which may  */
17 /* arise from use of Software or Documentation, including but not    */
18 /* limited to those resulting from defects in Software and/or        */
19 /* Documentation, or loss or inaccuracy of data of any kind.         */
20 /*********************************************************************/
21 
22 #include <stdio.h>
23 #include "common.h"
24 
25 #ifndef BETA_OPERATION
26 #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
27 	GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
28 		  BETA[0], BETA[1], NULL, 0, NULL, 0, \
29 		  (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
30 #endif
31 
32 #ifndef ICOPYB_OPERATION
33 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
34     defined(RN) || defined(RT) || defined(RC) || defined(RR)
35 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
36 	GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
37 #else
38 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
39 	GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
40 #endif
41 #endif
42 
43 #ifndef ICOPYR_OPERATION
44 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
45     defined(RN) || defined(RT) || defined(RC) || defined(RR)
46 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
47 	GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
48 #else
49 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
50 	GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
51 #endif
52 #endif
53 
54 #ifndef ICOPYI_OPERATION
55 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
56     defined(RN) || defined(RT) || defined(RC) || defined(RR)
57 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
58 	GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
59 #else
60 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
61 	GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
62 #endif
63 #endif
64 
65 
66 #ifndef OCOPYB_OPERATION
67 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
68     defined(NR) || defined(TR) || defined(CR) || defined(RR)
69 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
70 	GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
71 #else
72 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
73 	GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
74 #endif
75 #endif
76 
77 #ifndef OCOPYR_OPERATION
78 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
79     defined(NR) || defined(TR) || defined(CR) || defined(RR)
80 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
81 	GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
82 #else
83 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
84 	GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
85 #endif
86 #endif
87 
88 
89 #ifndef OCOPYI_OPERATION
90 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
91     defined(NR) || defined(TR) || defined(CR) || defined(RR)
92 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
93 	GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
94 #else
95 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
96 	GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
97 #endif
98 #endif
99 
100 #ifndef KERNEL_FUNC
101 #define KERNEL_FUNC	GEMM3M_KERNEL
102 #endif
103 
104 #ifndef KERNEL_OPERATION
105 #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
106 	KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
107 #endif
108 
109 #ifndef A
110 #define A	args -> a
111 #endif
112 #ifndef LDA
113 #define LDA	args -> lda
114 #endif
115 #ifndef B
116 #define B	args -> b
117 #endif
118 #ifndef LDB
119 #define LDB	args -> ldb
120 #endif
121 #ifndef C
122 #define C	args -> c
123 #endif
124 #ifndef LDC
125 #define LDC	args -> ldc
126 #endif
127 #ifndef M
128 #define M	args -> m
129 #endif
130 #ifndef N
131 #define N	args -> n
132 #endif
133 #ifndef K
134 #define K	args -> k
135 #endif
136 
137 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
138 #define ALPHA1	ONE
139 #define ALPHA2	ONE
140 #define ALPHA5	ZERO
141 #define ALPHA6	ONE
142 
143 #define ALPHA7	ONE
144 #define ALPHA8	ZERO
145 #define ALPHA11	ONE
146 #define ALPHA12	-ONE
147 
148 #define ALPHA13	ZERO
149 #define ALPHA14	ONE
150 #define ALPHA17	-ONE
151 #define ALPHA18	-ONE
152 #endif
153 
154 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
155 #define ALPHA1	ONE
156 #define ALPHA2	ONE
157 #define ALPHA5	ONE
158 #define ALPHA6	ZERO
159 
160 #define ALPHA7	ZERO
161 #define ALPHA8	ONE
162 #define ALPHA11	-ONE
163 #define ALPHA12	-ONE
164 
165 #define ALPHA13	ONE
166 #define ALPHA14	ZERO
167 #define ALPHA17	-ONE
168 #define ALPHA18	ONE
169 #endif
170 
171 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
172 #define ALPHA1	ONE
173 #define ALPHA2	ONE
174 #define ALPHA5	ONE
175 #define ALPHA6	ZERO
176 
177 #define ALPHA7	ZERO
178 #define ALPHA8	ONE
179 #define ALPHA11	-ONE
180 #define ALPHA12	ONE
181 
182 #define ALPHA13	ONE
183 #define ALPHA14	ZERO
184 #define ALPHA17	-ONE
185 #define ALPHA18	-ONE
186 #endif
187 
188 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
189 #define ALPHA1	ONE
190 #define ALPHA2	ONE
191 #define ALPHA5	ZERO
192 #define ALPHA6	-ONE
193 
194 #define ALPHA7	ONE
195 #define ALPHA8	ZERO
196 #define ALPHA11	ONE
197 #define ALPHA12	ONE
198 
199 #define ALPHA13	ZERO
200 #define ALPHA14	ONE
201 #define ALPHA17	-ONE
202 #define ALPHA18	ONE
203 #endif
204 
205 #ifdef TIMING
206 #define START_RPCC()		rpcc_counter = rpcc()
207 #define STOP_RPCC(COUNTER)	COUNTER  += rpcc() - rpcc_counter
208 #else
209 #define START_RPCC()
210 #define STOP_RPCC(COUNTER)
211 #endif
212 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)213 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
214 		  FLOAT *sa, FLOAT *sb, BLASLONG dummy){
215   BLASLONG k, lda, ldb, ldc;
216   FLOAT *alpha, *beta;
217   FLOAT *a, *b, *c;
218   BLASLONG m_from, m_to, n_from, n_to;
219 
220   BLASLONG ls, is, js, jjs;
221   BLASLONG min_l, min_i, min_j, min_jj;
222 
223 #ifdef TIMING
224   BLASULONG rpcc_counter;
225   BLASULONG BLASLONG innercost  = 0;
226   BLASULONG BLASLONG outercost  = 0;
227   BLASULONG BLASLONG kernelcost = 0;
228   double total;
229 #endif
230 
231   k = K;
232 
233   a = (FLOAT *)A;
234   b = (FLOAT *)B;
235   c = (FLOAT *)C;
236 
237   lda = LDA;
238   ldb = LDB;
239   ldc = LDC;
240 
241   alpha = (FLOAT *)args -> alpha;
242   beta  = (FLOAT *)args -> beta;
243 
244   m_from = 0;
245   m_to   = M;
246 
247   if (range_m) {
248     m_from = *(((BLASLONG *)range_m) + 0);
249     m_to   = *(((BLASLONG *)range_m) + 1);
250   }
251 
252   n_from = 0;
253   n_to   = N;
254 
255   if (range_n) {
256     n_from = *(((BLASLONG *)range_n) + 0);
257     n_to   = *(((BLASLONG *)range_n) + 1);
258   }
259 
260   if (beta) {
261 #ifndef COMPLEX
262     if (beta[0] != ONE)
263 #else
264     if ((beta[0] != ONE) || (beta[1] != ZERO))
265 #endif
266       BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);
267   }
268 
269   if ((k == 0) || (alpha == NULL)) return 0;
270 
271   if ((alpha[0] == ZERO)
272 #ifdef COMPLEX
273       && (alpha[1] == ZERO)
274 #endif
275       ) return 0;
276 
277 #if 0
278   printf("GEMM: M_from : %ld  M_to : %ld  N_from : %ld  N_to : %ld  k : %ld\n", m_from, m_to, n_from, n_to, k);
279   printf("GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R);
280   printf("GEMM: SA .. %p  SB .. %p\n", sa, sb);
281 #endif
282 
283 #ifdef DEBUG
284   innercost = 0;
285   outercost = 0;
286   kernelcost = 0;
287 #endif
288 
289   for(js = n_from; js < n_to; js += GEMM3M_R){
290     min_j = n_to - js;
291     if (min_j > GEMM3M_R) min_j = GEMM3M_R;
292 
293     for(ls = 0; ls < k; ls += min_l){
294       min_l = k - ls;
295 
296       if (min_l >= GEMM3M_Q * 2) {
297 	min_l = GEMM3M_Q;
298       } else {
299 	if (min_l > GEMM3M_Q) {
300 	  min_l = (min_l + 1) / 2;
301 #ifdef UNROLL_X
302 	  min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
303 #endif
304 	}
305       }
306 
307       min_i = m_to - m_from;
308       if (min_i >= GEMM3M_P * 2) {
309 	min_i = GEMM3M_P;
310       } else {
311 	if (min_i > GEMM3M_P) {
312 	  min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
313 	}
314       }
315 
316       START_RPCC();
317 
318       ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
319 
320       STOP_RPCC(innercost);
321 
322       for(jjs = js; jjs < js + min_j; jjs += min_jj){
323 	min_jj = min_j + js - jjs;
324 	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
325 
326 	START_RPCC();
327 
328 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
329 	OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
330 #else
331 	OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
332 #endif
333 
334 	STOP_RPCC(outercost);
335 
336 	START_RPCC();
337 
338 	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
339 			 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
340 
341 	STOP_RPCC(kernelcost);
342 
343       }
344 
345       for(is = m_from + min_i; is < m_to; is += min_i){
346 	min_i = m_to - is;
347 	if (min_i >= GEMM3M_P * 2) {
348 	  min_i = GEMM3M_P;
349 	} else
350 	  if (min_i > GEMM3M_P) {
351 	    min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
352 	  }
353 
354 	START_RPCC();
355 
356 	ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
357 
358 	STOP_RPCC(innercost);
359 
360 	START_RPCC();
361 
362 	KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js);
363 
364 	STOP_RPCC(kernelcost);
365       }
366 
367       min_i = m_to - m_from;
368       if (min_i >= GEMM3M_P * 2) {
369 	min_i = GEMM3M_P;
370       } else {
371 	if (min_i > GEMM3M_P) {
372 	  min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
373 	}
374       }
375 
376       START_RPCC();
377 
378       ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
379 
380       STOP_RPCC(innercost);
381 
382       for(jjs = js; jjs < js + min_j; jjs += min_jj){
383 	min_jj = min_j + js - jjs;
384 	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
385 
386 	START_RPCC();
387 
388 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
389 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
390 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
391 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
392 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
393 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
394 #else
395 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
396 #endif
397 
398 	STOP_RPCC(outercost);
399 
400 	START_RPCC();
401 
402 	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
403 			 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
404 
405 	STOP_RPCC(kernelcost);
406 
407       }
408 
409       for(is = m_from + min_i; is < m_to; is += min_i){
410 	min_i = m_to - is;
411 	if (min_i >= GEMM3M_P * 2) {
412 	  min_i = GEMM3M_P;
413 	} else
414 	  if (min_i > GEMM3M_P) {
415 	    min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
416 	  }
417 
418 	START_RPCC();
419 
420 	ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
421 
422 	STOP_RPCC(innercost);
423 
424 	START_RPCC();
425 
426 	KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js);
427 
428 	STOP_RPCC(kernelcost);
429 
430       }
431 
432       min_i = m_to - m_from;
433       if (min_i >= GEMM3M_P * 2) {
434 	min_i = GEMM3M_P;
435       } else {
436 	if (min_i > GEMM3M_P) {
437 	  min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
438 	}
439       }
440 
441       START_RPCC();
442 
443       ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
444 
445       STOP_RPCC(innercost);
446 
447       for(jjs = js; jjs < js + min_j; jjs += min_jj){
448 	min_jj = min_j + js - jjs;
449 	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
450 
451 	START_RPCC();
452 
453 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
454 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
455 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
456 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
457 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
458 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
459 #else
460 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
461 #endif
462 
463 	STOP_RPCC(outercost);
464 
465 	START_RPCC();
466 
467 	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
468 			 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
469 
470 	STOP_RPCC(kernelcost);
471 
472       }
473 
474       for(is = m_from + min_i; is < m_to; is += min_i){
475 	min_i = m_to - is;
476 	if (min_i >= GEMM3M_P * 2) {
477 	  min_i = GEMM3M_P;
478 	} else
479 	  if (min_i > GEMM3M_P) {
480 	    min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
481 	  }
482 
483 	START_RPCC();
484 
485 	ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
486 
487 	STOP_RPCC(innercost);
488 
489 	START_RPCC();
490 
491 	KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js);
492 
493 	STOP_RPCC(kernelcost);
494 
495       }
496 
497     } /* end of js */
498   } /* end of ls */
499 
500 
501 #ifdef TIMING
502   total = (double)outercost + (double)innercost + (double)kernelcost;
503 
504   printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f\n",
505 	   innercost / total * 100., outercost / total * 100.,
506 	  kernelcost / total * 100.);
507 
508   printf( " Total %10.3f%%  %10.3f MFlops\n",
509 	  ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100,
510 	  2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost);
511 #endif
512 
513   return 0;
514 }
515