1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include <ctype.h>
41 #include "common.h"
42 
43 const static FLOAT dp1 = 1.;
44 
45 #ifdef CONJ
46 #define GEMM_KERNEL   GEMM_KERNEL_L
47 #define TRMM_KERNEL_N TRMM_KERNEL_LR
48 #define TRMM_KERNEL_T TRMM_KERNEL_LC
49 #else
50 #define GEMM_KERNEL   GEMM_KERNEL_N
51 #define TRMM_KERNEL_N TRMM_KERNEL_LN
52 #define TRMM_KERNEL_T TRMM_KERNEL_LT
53 #endif
54 
55 #undef TIMING
56 
57 #ifdef TIMING
58 #define START_RPCC()		rpcc_counter = rpcc()
59 #define STOP_RPCC(COUNTER)	COUNTER  += rpcc() - rpcc_counter
60 #else
61 #define START_RPCC()
62 #define STOP_RPCC(COUNTER)
63 #endif
64 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)65 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
66 
67   BLASLONG m, n, lda, ldb;
68   FLOAT *beta, *a, *b;
69 
70   BLASLONG ls, is, js;
71   BLASLONG min_l, min_i, min_j;
72   BLASLONG jjs, min_jj;
73 
74 #ifdef TIMING
75   unsigned long long rpcc_counter;
76   unsigned long long innercost  = 0;
77   unsigned long long outercost  = 0;
78   unsigned long long gemmcost = 0;
79   unsigned long long trmmcost = 0;
80   double total;
81 #endif
82 
83   m = args -> m;
84   n = args -> n;
85 
86   a = (FLOAT *)args -> a;
87   b = (FLOAT *)args -> b;
88 
89   lda = args -> lda;
90   ldb = args -> ldb;
91 
92   beta  = (FLOAT *)args -> beta;
93 
94   if (range_n) {
95     BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
96     BLASLONG n_to   = *(((BLASLONG *)range_n) + 1);
97 
98     n = n_to - n_from;
99 
100     b += n_from * ldb * COMPSIZE;
101   }
102 
103   if (beta) {
104 #ifndef COMPLEX
105     if (beta[0] != ONE)
106       GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
107     if (beta[0] == ZERO) return 0;
108 #else
109     if ((beta[0] != ONE) || (beta[1] != ZERO))
110       GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
111     if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0;
112 #endif
113   }
114 
115   for(js = 0; js < n; js += GEMM_R){
116     min_j = n - js;
117     if (min_j > GEMM_R) min_j = GEMM_R;
118 
119 #if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))
120 
121     min_l = m;
122     if (min_l > GEMM_Q) min_l = GEMM_Q;
123     min_i = min_l;
124     if (min_i > GEMM_P) min_i = GEMM_P;
125 
126     START_RPCC();
127 
128 #ifndef TRANSA
129     TRMM_IUTCOPY(min_l, min_i, a, lda, 0, 0, sa);
130 #else
131     TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa);
132 #endif
133 
134     STOP_RPCC(innercost);
135 
136     for(jjs = js; jjs < js + min_j; jjs += min_jj){
137       min_jj = min_j + js - jjs;
138       if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
139 
140       START_RPCC();
141 
142       GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
143 
144       STOP_RPCC(outercost);
145 
146       START_RPCC();
147 
148       TRMM_KERNEL_N(min_i, min_jj, min_l, dp1,
149 #ifdef COMPLEX
150 		    ZERO,
151 #endif
152 		    sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0);
153 
154       STOP_RPCC(trmmcost);
155     }
156 
157 
158     for(is = min_i; is < min_l; is += GEMM_P){
159       min_i = min_l - is;
160       if (min_i > GEMM_P) min_i = GEMM_P;
161 
162       START_RPCC();
163 
164 #ifndef TRANSA
165       TRMM_IUTCOPY(min_l, min_i, a, lda, 0, is, sa);
166 #else
167       TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa);
168 #endif
169 
170       STOP_RPCC(innercost);
171 
172       START_RPCC();
173 
174       TRMM_KERNEL_N(min_i, min_j, min_l, dp1,
175 #ifdef COMPLEX
176 		    ZERO,
177 #endif
178 		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is);
179 
180       STOP_RPCC(trmmcost);
181 
182     }
183 
184     for(ls = min_l; ls < m; ls += GEMM_Q){
185       min_l = m - ls;
186       if (min_l > GEMM_Q) min_l = GEMM_Q;
187       min_i = ls;
188       if (min_i > GEMM_P) min_i = GEMM_P;
189 
190       START_RPCC();
191 
192 #ifndef TRANSA
193       GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa);
194 #else
195       GEMM_INCOPY(min_l, min_i, a + (ls      ) * COMPSIZE, lda, sa);
196 #endif
197 
198       STOP_RPCC(innercost);
199 
200       for(jjs = js; jjs < js + min_j; jjs += min_jj){
201 	min_jj = min_j + js - jjs;
202 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
203 
204 	START_RPCC();
205 
206 	GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
207 
208 	STOP_RPCC(gemmcost);
209 
210 	START_RPCC();
211 
212 	GEMM_KERNEL(min_i, min_jj, min_l, dp1,
213 #ifdef COMPLEX
214 		    ZERO,
215 #endif
216 		    sa, sb + min_l * (jjs - js) * COMPSIZE,
217 		    b + (jjs * ldb) * COMPSIZE, ldb);
218 
219 	STOP_RPCC(gemmcost);
220       }
221 
222       for(is = min_i; is < ls; is += GEMM_P){
223 	min_i = ls - is;
224 	if (min_i > GEMM_P) min_i = GEMM_P;
225 
226 	START_RPCC();
227 
228 #ifndef TRANSA
229 	GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa);
230 #else
231 	GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa);
232 #endif
233 
234 	STOP_RPCC(innercost);
235 
236 	START_RPCC();
237 
238 	GEMM_KERNEL(min_i, min_j, min_l, dp1,
239 #ifdef COMPLEX
240 		    ZERO,
241 #endif
242 		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
243 
244 	STOP_RPCC(gemmcost);
245       }
246 
247       for(is = ls; is < ls + min_l; is += GEMM_P){
248 	min_i = ls + min_l - is;
249 	if (min_i > GEMM_P) min_i = GEMM_P;
250 
251 	START_RPCC();
252 
253 #ifndef TRANSA
254 	TRMM_IUTCOPY(min_l, min_i, a, lda, ls, is, sa);
255 #else
256 	TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa);
257 #endif
258 
259 	STOP_RPCC(innercost);
260 
261 	START_RPCC();
262 
263 	TRMM_KERNEL_N(min_i, min_j, min_l, dp1,
264 #ifdef COMPLEX
265 		      ZERO,
266 #endif
267 		      sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
268 
269 	STOP_RPCC(trmmcost);
270       }
271     }
272 
273 #else
274     min_l = m;
275     if (min_l > GEMM_Q) min_l = GEMM_Q;
276     min_i = min_l;
277     if (min_i > GEMM_P) min_i = GEMM_P;
278 
279     START_RPCC();
280 
281 #ifndef TRANSA
282     TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa);
283 #else
284     TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa);
285 #endif
286 
287     STOP_RPCC(innercost);
288 
289     for(jjs = js; jjs < js + min_j; jjs += min_jj){
290       min_jj = min_j + js - jjs;
291       if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
292 
293       START_RPCC();
294 
295       GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
296 		  sb + min_l * (jjs - js) * COMPSIZE);
297 
298       STOP_RPCC(outercost);
299 
300       START_RPCC();
301 
302       TRMM_KERNEL_T(min_i, min_jj, min_l, dp1,
303 #ifdef COMPLEX
304 		    ZERO,
305 #endif
306 		    sa, sb + min_l * (jjs - js) * COMPSIZE,
307 		    b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
308 
309       STOP_RPCC(trmmcost);
310     }
311 
312     for(is = m - min_l + min_i; is < m; is += GEMM_P){
313       min_i = m - is;
314       if (min_i > GEMM_P) min_i = GEMM_P;
315 
316       START_RPCC();
317 
318 #ifndef TRANSA
319       TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, is, sa);
320 #else
321       TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa);
322 #endif
323 
324       STOP_RPCC(innercost);
325 
326       START_RPCC();
327 
328       TRMM_KERNEL_T(min_i, min_j, min_l, dp1,
329 #ifdef COMPLEX
330 		    ZERO,
331 #endif
332 		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l);
333 
334       STOP_RPCC(trmmcost);
335     }
336 
337     for(ls = m - min_l; ls > 0; ls -= GEMM_Q){
338       min_l = ls;
339       if (min_l > GEMM_Q) min_l = GEMM_Q;
340       min_i = min_l;
341       if (min_i > GEMM_P) min_i = GEMM_P;
342 
343       START_RPCC();
344 
345 #ifndef TRANSA
346       TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa);
347 #else
348       TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa);
349 #endif
350 
351       STOP_RPCC(innercost);
352 
353       for(jjs = js; jjs < js + min_j; jjs += min_jj){
354 	min_jj = min_j + js - jjs;
355 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
356 
357 	START_RPCC();
358 
359 	GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
360 		    sb + min_l * (jjs - js) * COMPSIZE);
361 
362 	STOP_RPCC(outercost);
363 
364 	START_RPCC();
365 
366 	TRMM_KERNEL_T(min_i, min_jj, min_l, dp1,
367 #ifdef COMPLEX
368 		      ZERO,
369 #endif
370 		      sa, sb + min_l * (jjs - js) * COMPSIZE,
371 		      b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
372 
373 	STOP_RPCC(trmmcost);
374       }
375 
376       for(is = ls - min_l + min_i; is < ls; is += GEMM_P){
377 	min_i = ls - is;
378 	if (min_i > GEMM_P) min_i = GEMM_P;
379 
380 	START_RPCC();
381 
382 #ifndef TRANSA
383 	TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, is, sa);
384 #else
385 	TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa);
386 #endif
387 
388 	STOP_RPCC(innercost);
389 
390 	START_RPCC();
391 
392 	TRMM_KERNEL_T(min_i, min_j, min_l, dp1,
393 #ifdef COMPLEX
394 		      ZERO,
395 #endif
396 		      sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l);
397 
398 	STOP_RPCC(trmmcost);
399       }
400 
401 
402       for(is = ls; is < m; is += GEMM_P){
403 	min_i = m - is;
404 	if (min_i > GEMM_P) min_i = GEMM_P;
405 
406 	START_RPCC();
407 
408 #ifndef TRANSA
409 	GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa);
410 #else
411 	GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa);
412 #endif
413 
414 	STOP_RPCC(innercost);
415 
416 	START_RPCC();
417 
418 	GEMM_KERNEL(min_i, min_j, min_l, dp1,
419 #ifdef COMPLEX
420 		    ZERO,
421 #endif
422 		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
423 
424 	STOP_RPCC(gemmcost);
425       }
426     }
427 
428 #endif
429 
430   }
431 
432 #ifdef TIMING
433   total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost;
434 
435   printf( "Copy A : %5.2f Copy  B: %5.2f  GEMM Kernel : %5.2f  TRMM Kerlnel : %5.2f   kernel Effi. : %5.2f Total Effi. : %5.2f\n",
436 	  innercost / total * 100., outercost / total * 100.,
437 	  gemmcost / total * 100., trmmcost / total * 100.,
438 	  (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2.,
439 	  (double)n * (double)n * (double)n / total * 100. * (double)COMPSIZE / 2.);
440 
441 #endif
442 
443   return 0;
444 }
445