1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #include <stdio.h>
23 #include "common.h"
24
25 #ifndef BETA_OPERATION
26 #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
27 GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
28 BETA[0], BETA[1], NULL, 0, NULL, 0, \
29 (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
30 #endif
31
32 #ifndef ICOPYB_OPERATION
33 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
34 defined(RN) || defined(RT) || defined(RC) || defined(RR)
35 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
36 GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
37 #else
38 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
39 GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
40 #endif
41 #endif
42
43 #ifndef ICOPYR_OPERATION
44 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
45 defined(RN) || defined(RT) || defined(RC) || defined(RR)
46 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
47 GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
48 #else
49 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
50 GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
51 #endif
52 #endif
53
54 #ifndef ICOPYI_OPERATION
55 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
56 defined(RN) || defined(RT) || defined(RC) || defined(RR)
57 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
58 GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
59 #else
60 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
61 GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
62 #endif
63 #endif
64
65
66 #ifndef OCOPYB_OPERATION
67 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
68 defined(NR) || defined(TR) || defined(CR) || defined(RR)
69 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
70 GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
71 #else
72 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
73 GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
74 #endif
75 #endif
76
77 #ifndef OCOPYR_OPERATION
78 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
79 defined(NR) || defined(TR) || defined(CR) || defined(RR)
80 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
81 GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
82 #else
83 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
84 GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
85 #endif
86 #endif
87
88
89 #ifndef OCOPYI_OPERATION
90 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
91 defined(NR) || defined(TR) || defined(CR) || defined(RR)
92 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
93 GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
94 #else
95 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
96 GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
97 #endif
98 #endif
99
100 #ifndef KERNEL_FUNC
101 #define KERNEL_FUNC GEMM3M_KERNEL
102 #endif
103
104 #ifndef KERNEL_OPERATION
105 #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
106 KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
107 #endif
108
109 #ifndef A
110 #define A args -> a
111 #endif
112 #ifndef LDA
113 #define LDA args -> lda
114 #endif
115 #ifndef B
116 #define B args -> b
117 #endif
118 #ifndef LDB
119 #define LDB args -> ldb
120 #endif
121 #ifndef C
122 #define C args -> c
123 #endif
124 #ifndef LDC
125 #define LDC args -> ldc
126 #endif
127 #ifndef M
128 #define M args -> m
129 #endif
130 #ifndef N
131 #define N args -> n
132 #endif
133 #ifndef K
134 #define K args -> k
135 #endif
136
137 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
138 #define ALPHA1 ONE
139 #define ALPHA2 ONE
140 #define ALPHA5 ZERO
141 #define ALPHA6 ONE
142
143 #define ALPHA7 ONE
144 #define ALPHA8 ZERO
145 #define ALPHA11 ONE
146 #define ALPHA12 -ONE
147
148 #define ALPHA13 ZERO
149 #define ALPHA14 ONE
150 #define ALPHA17 -ONE
151 #define ALPHA18 -ONE
152 #endif
153
154 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
155 #define ALPHA1 ONE
156 #define ALPHA2 ONE
157 #define ALPHA5 ONE
158 #define ALPHA6 ZERO
159
160 #define ALPHA7 ZERO
161 #define ALPHA8 ONE
162 #define ALPHA11 -ONE
163 #define ALPHA12 -ONE
164
165 #define ALPHA13 ONE
166 #define ALPHA14 ZERO
167 #define ALPHA17 -ONE
168 #define ALPHA18 ONE
169 #endif
170
171 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
172 #define ALPHA1 ONE
173 #define ALPHA2 ONE
174 #define ALPHA5 ONE
175 #define ALPHA6 ZERO
176
177 #define ALPHA7 ZERO
178 #define ALPHA8 ONE
179 #define ALPHA11 -ONE
180 #define ALPHA12 ONE
181
182 #define ALPHA13 ONE
183 #define ALPHA14 ZERO
184 #define ALPHA17 -ONE
185 #define ALPHA18 -ONE
186 #endif
187
188 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
189 #define ALPHA1 ONE
190 #define ALPHA2 ONE
191 #define ALPHA5 ZERO
192 #define ALPHA6 -ONE
193
194 #define ALPHA7 ONE
195 #define ALPHA8 ZERO
196 #define ALPHA11 ONE
197 #define ALPHA12 ONE
198
199 #define ALPHA13 ZERO
200 #define ALPHA14 ONE
201 #define ALPHA17 -ONE
202 #define ALPHA18 ONE
203 #endif
204
205 #ifdef TIMING
206 #define START_RPCC() rpcc_counter = rpcc()
207 #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
208 #else
209 #define START_RPCC()
210 #define STOP_RPCC(COUNTER)
211 #endif
212
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)213 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
214 FLOAT *sa, FLOAT *sb, BLASLONG dummy){
215 BLASLONG k, lda, ldb, ldc;
216 FLOAT *alpha, *beta;
217 FLOAT *a, *b, *c;
218 BLASLONG m_from, m_to, n_from, n_to;
219
220 BLASLONG ls, is, js, jjs;
221 BLASLONG min_l, min_i, min_j, min_jj;
222
223 #ifdef TIMING
224 BLASULONG rpcc_counter;
225 BLASULONG BLASLONG innercost = 0;
226 BLASULONG BLASLONG outercost = 0;
227 BLASULONG BLASLONG kernelcost = 0;
228 double total;
229 #endif
230
231 k = K;
232
233 a = (FLOAT *)A;
234 b = (FLOAT *)B;
235 c = (FLOAT *)C;
236
237 lda = LDA;
238 ldb = LDB;
239 ldc = LDC;
240
241 alpha = (FLOAT *)args -> alpha;
242 beta = (FLOAT *)args -> beta;
243
244 m_from = 0;
245 m_to = M;
246
247 if (range_m) {
248 m_from = *(((BLASLONG *)range_m) + 0);
249 m_to = *(((BLASLONG *)range_m) + 1);
250 }
251
252 n_from = 0;
253 n_to = N;
254
255 if (range_n) {
256 n_from = *(((BLASLONG *)range_n) + 0);
257 n_to = *(((BLASLONG *)range_n) + 1);
258 }
259
260 if (beta) {
261 #ifndef COMPLEX
262 if (beta[0] != ONE)
263 #else
264 if ((beta[0] != ONE) || (beta[1] != ZERO))
265 #endif
266 BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);
267 }
268
269 if ((k == 0) || (alpha == NULL)) return 0;
270
271 if ((alpha[0] == ZERO)
272 #ifdef COMPLEX
273 && (alpha[1] == ZERO)
274 #endif
275 ) return 0;
276
277 #if 0
278 printf("GEMM: M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k);
279 printf("GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R);
280 printf("GEMM: SA .. %p SB .. %p\n", sa, sb);
281 #endif
282
283 #ifdef DEBUG
284 innercost = 0;
285 outercost = 0;
286 kernelcost = 0;
287 #endif
288
289 for(js = n_from; js < n_to; js += GEMM3M_R){
290 min_j = n_to - js;
291 if (min_j > GEMM3M_R) min_j = GEMM3M_R;
292
293 for(ls = 0; ls < k; ls += min_l){
294 min_l = k - ls;
295
296 if (min_l >= GEMM3M_Q * 2) {
297 min_l = GEMM3M_Q;
298 } else {
299 if (min_l > GEMM3M_Q) {
300 min_l = (min_l + 1) / 2;
301 #ifdef UNROLL_X
302 min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
303 #endif
304 }
305 }
306
307 min_i = m_to - m_from;
308 if (min_i >= GEMM3M_P * 2) {
309 min_i = GEMM3M_P;
310 } else {
311 if (min_i > GEMM3M_P) {
312 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
313 }
314 }
315
316 START_RPCC();
317
318 ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
319
320 STOP_RPCC(innercost);
321
322 for(jjs = js; jjs < js + min_j; jjs += min_jj){
323 min_jj = min_j + js - jjs;
324 if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
325
326 START_RPCC();
327
328 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
329 OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
330 #else
331 OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
332 #endif
333
334 STOP_RPCC(outercost);
335
336 START_RPCC();
337
338 KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
339 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
340
341 STOP_RPCC(kernelcost);
342
343 }
344
345 for(is = m_from + min_i; is < m_to; is += min_i){
346 min_i = m_to - is;
347 if (min_i >= GEMM3M_P * 2) {
348 min_i = GEMM3M_P;
349 } else
350 if (min_i > GEMM3M_P) {
351 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
352 }
353
354 START_RPCC();
355
356 ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
357
358 STOP_RPCC(innercost);
359
360 START_RPCC();
361
362 KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js);
363
364 STOP_RPCC(kernelcost);
365 }
366
367 min_i = m_to - m_from;
368 if (min_i >= GEMM3M_P * 2) {
369 min_i = GEMM3M_P;
370 } else {
371 if (min_i > GEMM3M_P) {
372 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
373 }
374 }
375
376 START_RPCC();
377
378 ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
379
380 STOP_RPCC(innercost);
381
382 for(jjs = js; jjs < js + min_j; jjs += min_jj){
383 min_jj = min_j + js - jjs;
384 if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
385
386 START_RPCC();
387
388 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
389 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
390 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
391 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
392 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
393 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
394 #else
395 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
396 #endif
397
398 STOP_RPCC(outercost);
399
400 START_RPCC();
401
402 KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
403 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
404
405 STOP_RPCC(kernelcost);
406
407 }
408
409 for(is = m_from + min_i; is < m_to; is += min_i){
410 min_i = m_to - is;
411 if (min_i >= GEMM3M_P * 2) {
412 min_i = GEMM3M_P;
413 } else
414 if (min_i > GEMM3M_P) {
415 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
416 }
417
418 START_RPCC();
419
420 ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
421
422 STOP_RPCC(innercost);
423
424 START_RPCC();
425
426 KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js);
427
428 STOP_RPCC(kernelcost);
429
430 }
431
432 min_i = m_to - m_from;
433 if (min_i >= GEMM3M_P * 2) {
434 min_i = GEMM3M_P;
435 } else {
436 if (min_i > GEMM3M_P) {
437 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
438 }
439 }
440
441 START_RPCC();
442
443 ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
444
445 STOP_RPCC(innercost);
446
447 for(jjs = js; jjs < js + min_j; jjs += min_jj){
448 min_jj = min_j + js - jjs;
449 if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
450
451 START_RPCC();
452
453 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
454 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
455 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
456 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
457 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
458 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
459 #else
460 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
461 #endif
462
463 STOP_RPCC(outercost);
464
465 START_RPCC();
466
467 KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
468 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
469
470 STOP_RPCC(kernelcost);
471
472 }
473
474 for(is = m_from + min_i; is < m_to; is += min_i){
475 min_i = m_to - is;
476 if (min_i >= GEMM3M_P * 2) {
477 min_i = GEMM3M_P;
478 } else
479 if (min_i > GEMM3M_P) {
480 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
481 }
482
483 START_RPCC();
484
485 ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
486
487 STOP_RPCC(innercost);
488
489 START_RPCC();
490
491 KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js);
492
493 STOP_RPCC(kernelcost);
494
495 }
496
497 } /* end of js */
498 } /* end of ls */
499
500
501 #ifdef TIMING
502 total = (double)outercost + (double)innercost + (double)kernelcost;
503
504 printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n",
505 innercost / total * 100., outercost / total * 100.,
506 kernelcost / total * 100.);
507
508 printf( " Total %10.3f%% %10.3f MFlops\n",
509 ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100,
510 2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost);
511 #endif
512
513 return 0;
514 }
515