1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include <ctype.h>
41 #include "common.h"
42
43 const static FLOAT dp1 = 1.;
44
45 #ifdef CONJ
46 #define GEMM_KERNEL GEMM_KERNEL_L
47 #define TRMM_KERNEL_N TRMM_KERNEL_LR
48 #define TRMM_KERNEL_T TRMM_KERNEL_LC
49 #else
50 #define GEMM_KERNEL GEMM_KERNEL_N
51 #define TRMM_KERNEL_N TRMM_KERNEL_LN
52 #define TRMM_KERNEL_T TRMM_KERNEL_LT
53 #endif
54
55 #undef TIMING
56
57 #ifdef TIMING
58 #define START_RPCC() rpcc_counter = rpcc()
59 #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
60 #else
61 #define START_RPCC()
62 #define STOP_RPCC(COUNTER)
63 #endif
64
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)65 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
66
67 BLASLONG m, n, lda, ldb;
68 FLOAT *beta, *a, *b;
69
70 BLASLONG ls, is, js;
71 BLASLONG min_l, min_i, min_j;
72 BLASLONG jjs, min_jj;
73
74 #ifdef TIMING
75 unsigned long long rpcc_counter;
76 unsigned long long innercost = 0;
77 unsigned long long outercost = 0;
78 unsigned long long gemmcost = 0;
79 unsigned long long trmmcost = 0;
80 double total;
81 #endif
82
83 m = args -> m;
84 n = args -> n;
85
86 a = (FLOAT *)args -> a;
87 b = (FLOAT *)args -> b;
88
89 lda = args -> lda;
90 ldb = args -> ldb;
91
92 beta = (FLOAT *)args -> beta;
93
94 if (range_n) {
95 BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
96 BLASLONG n_to = *(((BLASLONG *)range_n) + 1);
97
98 n = n_to - n_from;
99
100 b += n_from * ldb * COMPSIZE;
101 }
102
103 if (beta) {
104 #ifndef COMPLEX
105 if (beta[0] != ONE)
106 GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
107 if (beta[0] == ZERO) return 0;
108 #else
109 if ((beta[0] != ONE) || (beta[1] != ZERO))
110 GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
111 if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0;
112 #endif
113 }
114
115 for(js = 0; js < n; js += GEMM_R){
116 min_j = n - js;
117 if (min_j > GEMM_R) min_j = GEMM_R;
118
119 #if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))
120
121 min_l = m;
122 if (min_l > GEMM_Q) min_l = GEMM_Q;
123 min_i = min_l;
124 if (min_i > GEMM_P) min_i = GEMM_P;
125
126 START_RPCC();
127
128 #ifndef TRANSA
129 TRMM_IUTCOPY(min_l, min_i, a, lda, 0, 0, sa);
130 #else
131 TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa);
132 #endif
133
134 STOP_RPCC(innercost);
135
136 for(jjs = js; jjs < js + min_j; jjs += min_jj){
137 min_jj = min_j + js - jjs;
138 if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
139
140 START_RPCC();
141
142 GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
143
144 STOP_RPCC(outercost);
145
146 START_RPCC();
147
148 TRMM_KERNEL_N(min_i, min_jj, min_l, dp1,
149 #ifdef COMPLEX
150 ZERO,
151 #endif
152 sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0);
153
154 STOP_RPCC(trmmcost);
155 }
156
157
158 for(is = min_i; is < min_l; is += GEMM_P){
159 min_i = min_l - is;
160 if (min_i > GEMM_P) min_i = GEMM_P;
161
162 START_RPCC();
163
164 #ifndef TRANSA
165 TRMM_IUTCOPY(min_l, min_i, a, lda, 0, is, sa);
166 #else
167 TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa);
168 #endif
169
170 STOP_RPCC(innercost);
171
172 START_RPCC();
173
174 TRMM_KERNEL_N(min_i, min_j, min_l, dp1,
175 #ifdef COMPLEX
176 ZERO,
177 #endif
178 sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is);
179
180 STOP_RPCC(trmmcost);
181
182 }
183
184 for(ls = min_l; ls < m; ls += GEMM_Q){
185 min_l = m - ls;
186 if (min_l > GEMM_Q) min_l = GEMM_Q;
187 min_i = ls;
188 if (min_i > GEMM_P) min_i = GEMM_P;
189
190 START_RPCC();
191
192 #ifndef TRANSA
193 GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa);
194 #else
195 GEMM_INCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa);
196 #endif
197
198 STOP_RPCC(innercost);
199
200 for(jjs = js; jjs < js + min_j; jjs += min_jj){
201 min_jj = min_j + js - jjs;
202 if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
203
204 START_RPCC();
205
206 GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
207
208 STOP_RPCC(gemmcost);
209
210 START_RPCC();
211
212 GEMM_KERNEL(min_i, min_jj, min_l, dp1,
213 #ifdef COMPLEX
214 ZERO,
215 #endif
216 sa, sb + min_l * (jjs - js) * COMPSIZE,
217 b + (jjs * ldb) * COMPSIZE, ldb);
218
219 STOP_RPCC(gemmcost);
220 }
221
222 for(is = min_i; is < ls; is += GEMM_P){
223 min_i = ls - is;
224 if (min_i > GEMM_P) min_i = GEMM_P;
225
226 START_RPCC();
227
228 #ifndef TRANSA
229 GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa);
230 #else
231 GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa);
232 #endif
233
234 STOP_RPCC(innercost);
235
236 START_RPCC();
237
238 GEMM_KERNEL(min_i, min_j, min_l, dp1,
239 #ifdef COMPLEX
240 ZERO,
241 #endif
242 sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
243
244 STOP_RPCC(gemmcost);
245 }
246
247 for(is = ls; is < ls + min_l; is += GEMM_P){
248 min_i = ls + min_l - is;
249 if (min_i > GEMM_P) min_i = GEMM_P;
250
251 START_RPCC();
252
253 #ifndef TRANSA
254 TRMM_IUTCOPY(min_l, min_i, a, lda, ls, is, sa);
255 #else
256 TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa);
257 #endif
258
259 STOP_RPCC(innercost);
260
261 START_RPCC();
262
263 TRMM_KERNEL_N(min_i, min_j, min_l, dp1,
264 #ifdef COMPLEX
265 ZERO,
266 #endif
267 sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
268
269 STOP_RPCC(trmmcost);
270 }
271 }
272
273 #else
274 min_l = m;
275 if (min_l > GEMM_Q) min_l = GEMM_Q;
276 min_i = min_l;
277 if (min_i > GEMM_P) min_i = GEMM_P;
278
279 START_RPCC();
280
281 #ifndef TRANSA
282 TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa);
283 #else
284 TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa);
285 #endif
286
287 STOP_RPCC(innercost);
288
289 for(jjs = js; jjs < js + min_j; jjs += min_jj){
290 min_jj = min_j + js - jjs;
291 if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
292
293 START_RPCC();
294
295 GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
296 sb + min_l * (jjs - js) * COMPSIZE);
297
298 STOP_RPCC(outercost);
299
300 START_RPCC();
301
302 TRMM_KERNEL_T(min_i, min_jj, min_l, dp1,
303 #ifdef COMPLEX
304 ZERO,
305 #endif
306 sa, sb + min_l * (jjs - js) * COMPSIZE,
307 b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
308
309 STOP_RPCC(trmmcost);
310 }
311
312 for(is = m - min_l + min_i; is < m; is += GEMM_P){
313 min_i = m - is;
314 if (min_i > GEMM_P) min_i = GEMM_P;
315
316 START_RPCC();
317
318 #ifndef TRANSA
319 TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, is, sa);
320 #else
321 TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa);
322 #endif
323
324 STOP_RPCC(innercost);
325
326 START_RPCC();
327
328 TRMM_KERNEL_T(min_i, min_j, min_l, dp1,
329 #ifdef COMPLEX
330 ZERO,
331 #endif
332 sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l);
333
334 STOP_RPCC(trmmcost);
335 }
336
337 for(ls = m - min_l; ls > 0; ls -= GEMM_Q){
338 min_l = ls;
339 if (min_l > GEMM_Q) min_l = GEMM_Q;
340 min_i = min_l;
341 if (min_i > GEMM_P) min_i = GEMM_P;
342
343 START_RPCC();
344
345 #ifndef TRANSA
346 TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa);
347 #else
348 TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa);
349 #endif
350
351 STOP_RPCC(innercost);
352
353 for(jjs = js; jjs < js + min_j; jjs += min_jj){
354 min_jj = min_j + js - jjs;
355 if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
356
357 START_RPCC();
358
359 GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
360 sb + min_l * (jjs - js) * COMPSIZE);
361
362 STOP_RPCC(outercost);
363
364 START_RPCC();
365
366 TRMM_KERNEL_T(min_i, min_jj, min_l, dp1,
367 #ifdef COMPLEX
368 ZERO,
369 #endif
370 sa, sb + min_l * (jjs - js) * COMPSIZE,
371 b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
372
373 STOP_RPCC(trmmcost);
374 }
375
376 for(is = ls - min_l + min_i; is < ls; is += GEMM_P){
377 min_i = ls - is;
378 if (min_i > GEMM_P) min_i = GEMM_P;
379
380 START_RPCC();
381
382 #ifndef TRANSA
383 TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, is, sa);
384 #else
385 TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa);
386 #endif
387
388 STOP_RPCC(innercost);
389
390 START_RPCC();
391
392 TRMM_KERNEL_T(min_i, min_j, min_l, dp1,
393 #ifdef COMPLEX
394 ZERO,
395 #endif
396 sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l);
397
398 STOP_RPCC(trmmcost);
399 }
400
401
402 for(is = ls; is < m; is += GEMM_P){
403 min_i = m - is;
404 if (min_i > GEMM_P) min_i = GEMM_P;
405
406 START_RPCC();
407
408 #ifndef TRANSA
409 GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa);
410 #else
411 GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa);
412 #endif
413
414 STOP_RPCC(innercost);
415
416 START_RPCC();
417
418 GEMM_KERNEL(min_i, min_j, min_l, dp1,
419 #ifdef COMPLEX
420 ZERO,
421 #endif
422 sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
423
424 STOP_RPCC(gemmcost);
425 }
426 }
427
428 #endif
429
430 }
431
432 #ifdef TIMING
433 total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost;
434
435 printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kerlnel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n",
436 innercost / total * 100., outercost / total * 100.,
437 gemmcost / total * 100., trmmcost / total * 100.,
438 (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2.,
439 (double)n * (double)n * (double)n / total * 100. * (double)COMPSIZE / 2.);
440
441 #endif
442
443 return 0;
444 }
445