1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #ifndef KERNEL_OPERATION
40 #ifndef COMPLEX
41 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
42 KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
43 #else
44 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
45 KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
46 #endif
47 #endif
48
49 #ifndef ICOPY_OPERATION
50 #ifndef TRANS
51 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
52 #else
53 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
54 #endif
55 #endif
56
57 #ifndef OCOPY_OPERATION
58 #ifdef TRANS
59 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
60 #else
61 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
62 #endif
63 #endif
64
65 #ifndef M
66 #define M args -> n
67 #endif
68
69 #ifndef N
70 #define N args -> n
71 #endif
72
73 #ifndef K
74 #define K args -> k
75 #endif
76
77 #ifndef A
78 #define A args -> a
79 #endif
80
81 #ifndef C
82 #define C args -> c
83 #endif
84
85 #ifndef LDA
86 #define LDA args -> lda
87 #endif
88
89 #ifndef LDC
90 #define LDC args -> ldc
91 #endif
92
93 #ifdef TIMING
94 #define START_RPCC() rpcc_counter = rpcc()
95 #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
96 #else
97 #define START_RPCC()
98 #define STOP_RPCC(COUNTER)
99 #endif
100
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)101 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
102
103 BLASLONG m_from, m_to, n_from, n_to, k, lda, ldc;
104 FLOAT *a, *c, *alpha, *beta;
105
106 BLASLONG ls, is, js;
107 BLASLONG min_l, min_i, min_j;
108 BLASLONG jjs, min_jj;
109 BLASLONG m_start, m_end;
110
111 int shared = ((GEMM_UNROLL_M == GEMM_UNROLL_N) && !HAVE_EX_L2);
112
113 FLOAT *aa;
114
115 #ifdef TIMING
116 unsigned long long rpcc_counter;
117 unsigned long long innercost = 0;
118 unsigned long long outercost = 0;
119 unsigned long long kernelcost = 0;
120 double total;
121 #endif
122
123 k = K;
124
125 a = (FLOAT *)A;
126 c = (FLOAT *)C;
127
128 lda = LDA;
129 ldc = LDC;
130
131 alpha = (FLOAT *)args -> alpha;
132 beta = (FLOAT *)args -> beta;
133
134 m_from = 0;
135 m_to = M;
136
137 if (range_m) {
138 m_from = *(((BLASLONG *)range_m) + 0);
139 m_to = *(((BLASLONG *)range_m) + 1);
140 }
141
142 n_from = 0;
143 n_to = N;
144
145 if (range_n) {
146 n_from = *(((BLASLONG *)range_n) + 0);
147 n_to = *(((BLASLONG *)range_n) + 1);
148 }
149
150 if (beta) {
151 #if !defined(COMPLEX) || defined(HERK)
152 if (beta[0] != ONE)
153 #else
154 if ((beta[0] != ONE) || (beta[1] != ZERO))
155 #endif
156 syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
157 }
158
159 if ((k == 0) || (alpha == NULL)) return 0;
160
161 if ((alpha[0] == ZERO)
162 #if defined(COMPLEX) && !defined(HERK)
163 && (alpha[1] == ZERO)
164 #endif
165 ) return 0;
166
167 #if 0
168 fprintf(stderr, "m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",
169 m_from, m_to, n_from, n_to);
170 #endif
171
172 for(js = n_from; js < n_to; js += GEMM_R){
173 min_j = n_to - js;
174 if (min_j > GEMM_R) min_j = GEMM_R;
175
176 #ifndef LOWER
177 m_start = m_from;
178 m_end = js + min_j;
179 if (m_end > m_to) m_end = m_to;
180 #else
181 m_start = m_from;
182 m_end = m_to;
183 if (m_start < js) m_start = js;
184 #endif
185
186 for(ls = 0; ls < k; ls += min_l){
187 min_l = k - ls;
188 if (min_l >= GEMM_Q * 2) {
189 min_l = GEMM_Q;
190 } else
191 if (min_l > GEMM_Q) {
192 min_l = (min_l + 1) / 2;
193 }
194
195 min_i = m_end - m_start;
196
197 if (min_i >= GEMM_P * 2) {
198 min_i = GEMM_P;
199 } else
200 if (min_i > GEMM_P) {
201 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
202 }
203
204 #ifndef LOWER
205
206 if (m_end >= js) {
207
208 aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE;
209 if (!shared) aa = sa;
210
211 for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){
212 min_jj = js + min_j - jjs;
213 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
214
215 if (!shared && (jjs - MAX(m_start, js) < min_i)) {
216 START_RPCC();
217
218 ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE);
219
220 STOP_RPCC(innercost);
221 }
222
223 START_RPCC();
224
225 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
226
227 STOP_RPCC(outercost);
228
229 START_RPCC();
230
231 KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, MAX(m_start, js), jjs);
232
233 STOP_RPCC(kernelcost);
234 }
235
236 for(is = MAX(m_start, js) + min_i; is < m_end; is += min_i){
237 min_i = m_end - is;
238 if (min_i >= GEMM_P * 2) {
239 min_i = GEMM_P;
240 } else
241 if (min_i > GEMM_P) {
242 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
243 }
244
245 aa = sb + min_l * (is - js) * COMPSIZE;
246
247 if (!shared) {
248
249 START_RPCC();
250
251 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
252
253 STOP_RPCC(innercost);
254
255 aa = sa;
256 }
257
258 START_RPCC();
259
260 KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js);
261
262 STOP_RPCC(kernelcost);
263
264 }
265
266 }
267
268 if (m_start < js) {
269
270 if (m_end < js) {
271
272 START_RPCC();
273
274 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
275
276 STOP_RPCC(innercost);
277
278 for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
279 min_jj = min_j + js - jjs;
280 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
281
282 START_RPCC();
283
284 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
285
286 STOP_RPCC(outercost);
287
288 START_RPCC();
289
290 KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs);
291
292 STOP_RPCC(kernelcost);
293
294 }
295 } else {
296 min_i = 0;
297 }
298
299 for(is = m_start + min_i; is < MIN(m_end, js); is += min_i){
300
301 min_i = MIN(m_end, js)- is;
302 if (min_i >= GEMM_P * 2) {
303 min_i = GEMM_P;
304 } else
305 if (min_i > GEMM_P) {
306 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
307 }
308
309 START_RPCC();
310
311 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
312
313 STOP_RPCC(innercost);
314
315 START_RPCC();
316
317 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
318
319 STOP_RPCC(kernelcost);
320
321 }
322 }
323
324 #else
325
326 if (m_start < js + min_j) {
327
328 aa = sb + min_l * (m_start - js) * COMPSIZE;
329
330 if (!shared) {
331
332 START_RPCC();
333
334 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
335
336 STOP_RPCC(innercost);
337
338 }
339
340 START_RPCC();
341
342 OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa);
343
344 STOP_RPCC(outercost);
345
346 START_RPCC();
347
348 KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, m_start, m_start);
349
350 STOP_RPCC(kernelcost);
351
352 for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){
353 min_jj = m_start - jjs;
354 if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
355
356 START_RPCC();
357
358 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
359
360 STOP_RPCC(outercost);
361
362 START_RPCC();
363
364 KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs);
365
366 STOP_RPCC(kernelcost);
367
368 }
369
370 for(is = m_start + min_i; is < m_end; is += min_i){
371
372 min_i = m_end - is;
373
374 if (min_i >= GEMM_P * 2) {
375 min_i = GEMM_P;
376 } else
377 if (min_i > GEMM_P) {
378 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
379 }
380
381 if (is < js + min_j) {
382
383 if (!shared) {
384 START_RPCC();
385
386 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
387
388 STOP_RPCC(innercost);
389 }
390
391 aa = sb + min_l * (is - js) * COMPSIZE;
392
393 START_RPCC();
394
395 OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa);
396
397 STOP_RPCC(outercost);
398
399 START_RPCC();
400
401 KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, is, is);
402
403 STOP_RPCC(kernelcost);
404
405 START_RPCC();
406
407 KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb, c, ldc, is, js);
408
409 STOP_RPCC(kernelcost);
410
411 } else {
412
413 START_RPCC();
414
415 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
416
417 STOP_RPCC(innercost);
418
419 START_RPCC();
420
421 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
422
423 STOP_RPCC(kernelcost);
424
425 }
426
427 }
428
429 } else {
430
431 START_RPCC();
432
433 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
434
435 STOP_RPCC(innercost);
436
437 for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){
438 min_jj = min_j - jjs;
439 if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
440
441 START_RPCC();
442
443 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
444
445 STOP_RPCC(outercost);
446
447 START_RPCC();
448
449 KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs);
450
451 STOP_RPCC(kernelcost);
452
453 }
454
455 for(is = m_start + min_i; is < m_end; is += min_i){
456
457 min_i = m_end - is;
458
459 if (min_i >= GEMM_P * 2) {
460 min_i = GEMM_P;
461 } else
462 if (min_i > GEMM_P) {
463 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
464 }
465
466 START_RPCC();
467
468 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
469
470 STOP_RPCC(innercost);
471
472 START_RPCC();
473
474 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
475
476 STOP_RPCC(kernelcost);
477
478 }
479 }
480 #endif
481 }
482 }
483
484 #ifdef TIMING
485 total = (double)outercost + (double)innercost + (double)kernelcost;
486
487 printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n",
488 innercost / total * 100., outercost / total * 100., kernelcost / total * 100.,
489 (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / (double)DNUMOPT,
490 (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / (double)DNUMOPT);
491
492 #endif
493
494 return 0;
495 }
496