1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #ifndef KERNEL_OPERATION
40 #ifndef COMPLEX
41 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
42 	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
43 #else
44 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
45 	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
46 #endif
47 #endif
48 
49 #ifndef ICOPY_OPERATION
50 #ifndef TRANS
51 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
52 #else
53 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
54 #endif
55 #endif
56 
57 #ifndef OCOPY_OPERATION
58 #ifdef TRANS
59 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
60 #else
61 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
62 #endif
63 #endif
64 
65 #ifndef M
66 #define M	args -> n
67 #endif
68 
69 #ifndef N
70 #define N	args -> n
71 #endif
72 
73 #ifndef K
74 #define K	args -> k
75 #endif
76 
77 #ifndef A
78 #define A	args -> a
79 #endif
80 
81 #ifndef C
82 #define C	args -> c
83 #endif
84 
85 #ifndef LDA
86 #define LDA	args -> lda
87 #endif
88 
89 #ifndef LDC
90 #define LDC	args -> ldc
91 #endif
92 
93 #ifdef TIMING
94 #define START_RPCC()		rpcc_counter = rpcc()
95 #define STOP_RPCC(COUNTER)	COUNTER  += rpcc() - rpcc_counter
96 #else
97 #define START_RPCC()
98 #define STOP_RPCC(COUNTER)
99 #endif
100 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)101 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
102 
103   BLASLONG m_from, m_to, n_from, n_to, k, lda, ldc;
104   FLOAT *a, *c, *alpha, *beta;
105 
106   BLASLONG ls, is, js;
107   BLASLONG min_l, min_i, min_j;
108   BLASLONG jjs, min_jj;
109   BLASLONG m_start, m_end;
110 
111   int shared = ((GEMM_UNROLL_M == GEMM_UNROLL_N) && !HAVE_EX_L2);
112 
113   FLOAT *aa;
114 
115 #ifdef TIMING
116   unsigned long long rpcc_counter;
117   unsigned long long innercost  = 0;
118   unsigned long long outercost  = 0;
119   unsigned long long kernelcost = 0;
120   double total;
121 #endif
122 
123   k = K;
124 
125   a = (FLOAT *)A;
126   c = (FLOAT *)C;
127 
128   lda = LDA;
129   ldc = LDC;
130 
131   alpha = (FLOAT *)args -> alpha;
132   beta  = (FLOAT *)args -> beta;
133 
134   m_from = 0;
135   m_to   = M;
136 
137   if (range_m) {
138     m_from = *(((BLASLONG *)range_m) + 0);
139     m_to   = *(((BLASLONG *)range_m) + 1);
140   }
141 
142   n_from = 0;
143   n_to   = N;
144 
145   if (range_n) {
146     n_from = *(((BLASLONG *)range_n) + 0);
147     n_to   = *(((BLASLONG *)range_n) + 1);
148   }
149 
150   if (beta) {
151 #if !defined(COMPLEX) || defined(HERK)
152     if (beta[0] != ONE)
153 #else
154     if ((beta[0] != ONE) || (beta[1] != ZERO))
155 #endif
156       syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
157   }
158 
159   if ((k == 0) || (alpha == NULL)) return 0;
160 
161   if ((alpha[0] == ZERO)
162 #if defined(COMPLEX) && !defined(HERK)
163       && (alpha[1] == ZERO)
164 #endif
165       ) return 0;
166 
167 #if 0
168   fprintf(stderr, "m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",
169 	  m_from, m_to, n_from, n_to);
170 #endif
171 
172   for(js = n_from; js < n_to; js += GEMM_R){
173     min_j = n_to - js;
174     if (min_j > GEMM_R) min_j = GEMM_R;
175 
176 #ifndef LOWER
177     m_start = m_from;
178     m_end   = js + min_j;
179     if (m_end > m_to) m_end = m_to;
180 #else
181     m_start = m_from;
182     m_end   = m_to;
183     if (m_start < js) m_start = js;
184 #endif
185 
186     for(ls = 0; ls < k; ls += min_l){
187       min_l = k - ls;
188       if (min_l >= GEMM_Q * 2) {
189 	min_l = GEMM_Q;
190       } else
191 	if (min_l > GEMM_Q) {
192 	  min_l = (min_l + 1) / 2;
193 	}
194 
195       min_i = m_end - m_start;
196 
197       if (min_i >= GEMM_P * 2) {
198 	min_i = GEMM_P;
199       } else
200 	if (min_i > GEMM_P) {
201 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
202 	}
203 
204 #ifndef LOWER
205 
206       if (m_end >= js) {
207 
208 	aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE;
209 	if (!shared) aa = sa;
210 
211 	for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){
212 	  min_jj = js + min_j - jjs;
213 	  if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
214 
215 	  if (!shared && (jjs - MAX(m_start, js) < min_i)) {
216 	    START_RPCC();
217 
218 	    ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE);
219 
220 	    STOP_RPCC(innercost);
221 	  }
222 
223 	  START_RPCC();
224 
225 	  OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
226 
227 	  STOP_RPCC(outercost);
228 
229 	  START_RPCC();
230 
231 	  KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, MAX(m_start, js), jjs);
232 
233 	  STOP_RPCC(kernelcost);
234 	}
235 
236 	for(is = MAX(m_start, js) + min_i; is < m_end; is += min_i){
237 	  min_i = m_end - is;
238 	  if (min_i >= GEMM_P * 2) {
239 	    min_i = GEMM_P;
240 	  } else
241 	    if (min_i > GEMM_P) {
242 	      min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
243 	    }
244 
245 	  aa = sb + min_l * (is - js)  * COMPSIZE;
246 
247 	  if (!shared) {
248 
249 	    START_RPCC();
250 
251 	    ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
252 
253 	    STOP_RPCC(innercost);
254 
255 	    aa = sa;
256 	  }
257 
258 	  START_RPCC();
259 
260 	  KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js);
261 
262 	  STOP_RPCC(kernelcost);
263 
264 	}
265 
266       }
267 
268       if (m_start < js) {
269 
270 	if (m_end < js) {
271 
272 	  START_RPCC();
273 
274 	  ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
275 
276 	  STOP_RPCC(innercost);
277 
278 	  for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
279 	    min_jj = min_j + js - jjs;
280 	    if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
281 
282 	    START_RPCC();
283 
284 	    OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
285 
286 	    STOP_RPCC(outercost);
287 
288 	  START_RPCC();
289 
290 	  KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs);
291 
292 	  STOP_RPCC(kernelcost);
293 
294 	  }
295 	} else {
296 	  min_i = 0;
297 	}
298 
299 	for(is = m_start + min_i; is < MIN(m_end, js); is += min_i){
300 
301 	  min_i = MIN(m_end, js)- is;
302 	  if (min_i >= GEMM_P * 2) {
303 	    min_i = GEMM_P;
304 	  } else
305 	    if (min_i > GEMM_P) {
306 	      min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
307 	    }
308 
309 	  START_RPCC();
310 
311 	  ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
312 
313 	  STOP_RPCC(innercost);
314 
315 	  START_RPCC();
316 
317 	  KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
318 
319 	  STOP_RPCC(kernelcost);
320 
321 	}
322       }
323 
324 #else
325 
326       if (m_start < js + min_j) {
327 
328 	aa = sb + min_l * (m_start - js) * COMPSIZE;
329 
330 	if (!shared) {
331 
332 	  START_RPCC();
333 
334 	  ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
335 
336 	  STOP_RPCC(innercost);
337 
338 	}
339 
340 	START_RPCC();
341 
342 	OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa);
343 
344 	STOP_RPCC(outercost);
345 
346 	START_RPCC();
347 
348 	KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, m_start, m_start);
349 
350 	STOP_RPCC(kernelcost);
351 
352 	for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){
353 	  min_jj = m_start - jjs;
354 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
355 
356 	  START_RPCC();
357 
358 	  OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
359 
360 	  STOP_RPCC(outercost);
361 
362 	  START_RPCC();
363 
364 	  KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs);
365 
366 	  STOP_RPCC(kernelcost);
367 
368 	}
369 
370 	for(is = m_start + min_i; is < m_end; is += min_i){
371 
372 	  min_i = m_end - is;
373 
374 	  if (min_i >= GEMM_P * 2) {
375 	    min_i = GEMM_P;
376 	  } else
377 	    if (min_i > GEMM_P) {
378 	      min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
379 	    }
380 
381 	  if (is  < js + min_j) {
382 
383 	    if (!shared) {
384 	      START_RPCC();
385 
386 	      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
387 
388 	      STOP_RPCC(innercost);
389 	    }
390 
391 	    aa = sb + min_l * (is - js) * COMPSIZE;
392 
393 	    START_RPCC();
394 
395 	    OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa);
396 
397 	    STOP_RPCC(outercost);
398 
399 	    START_RPCC();
400 
401 	    KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha,  (shared? (aa) : (sa)), aa,  c, ldc, is, is);
402 
403 	    STOP_RPCC(kernelcost);
404 
405 	    START_RPCC();
406 
407 	    KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb,  c, ldc, is, js);
408 
409 	    STOP_RPCC(kernelcost);
410 
411 	  } else {
412 
413 	    START_RPCC();
414 
415 	    ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
416 
417 	    STOP_RPCC(innercost);
418 
419 	    START_RPCC();
420 
421 	    KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb,  c, ldc, is, js);
422 
423 	    STOP_RPCC(kernelcost);
424 
425 	  }
426 
427 	}
428 
429       } else {
430 
431 	START_RPCC();
432 
433 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
434 
435 	STOP_RPCC(innercost);
436 
437 	for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){
438 	  min_jj = min_j - jjs;
439 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
440 
441 	  START_RPCC();
442 
443 	  OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
444 
445 	  STOP_RPCC(outercost);
446 
447 	  START_RPCC();
448 
449 	  KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs);
450 
451 	  STOP_RPCC(kernelcost);
452 
453 	}
454 
455 	for(is = m_start + min_i; is < m_end; is += min_i){
456 
457 	  min_i = m_end - is;
458 
459 	  if (min_i >= GEMM_P * 2) {
460 	    min_i = GEMM_P;
461 	  } else
462 	    if (min_i > GEMM_P) {
463 	      min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
464 	    }
465 
466 	  START_RPCC();
467 
468 	  ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
469 
470 	  STOP_RPCC(innercost);
471 
472 	  START_RPCC();
473 
474 	  KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb,  c, ldc, is, js);
475 
476 	  STOP_RPCC(kernelcost);
477 
478 	}
479       }
480 #endif
481     }
482   }
483 
484 #ifdef TIMING
485   total = (double)outercost + (double)innercost + (double)kernelcost;
486 
487   printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f  kernel Effi. : %5.2f Total Effi. : %5.2f\n",
488 	   innercost / total * 100., outercost / total * 100., kernelcost / total * 100.,
489 	  (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / (double)DNUMOPT,
490 	  (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / (double)DNUMOPT);
491 
492 #endif
493 
494   return 0;
495 }
496