1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #ifndef KERNEL_OPERATION
40 #ifndef COMPLEX
41 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
42 	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
43 #else
44 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
45 	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
46 #endif
47 #endif
48 
49 #ifndef KERNEL_OPERATION_C
50 #define KERNEL_OPERATION_C KERNEL_OPERATION
51 #endif
52 
53 #ifndef ICOPY_OPERATION
54 #ifndef TRANS
55 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
56 #else
57 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
58 #endif
59 #endif
60 
61 #ifndef OCOPY_OPERATION
62 #ifdef TRANS
63 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
64 #else
65 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
66 #endif
67 #endif
68 
69 #ifndef M
70 #define M	args -> n
71 #endif
72 
73 #ifndef N
74 #define N	args -> n
75 #endif
76 
77 #ifndef K
78 #define K	args -> k
79 #endif
80 
81 #ifndef A
82 #define A	args -> a
83 #endif
84 
85 #ifndef B
86 #define B	args -> b
87 #endif
88 
89 #ifndef C
90 #define C	args -> c
91 #endif
92 
93 #ifndef LDA
94 #define LDA	args -> lda
95 #endif
96 
97 #ifndef LDB
98 #define LDB	args -> ldb
99 #endif
100 
101 #ifndef LDC
102 #define LDC	args -> ldc
103 #endif
104 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)105 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
106 
107   BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc;
108   FLOAT *a, *b, *c, *alpha, *beta;
109 
110   BLASLONG ls, is, js;
111   BLASLONG min_l, min_i, min_j;
112   BLASLONG jjs, min_jj;
113   BLASLONG m_start, m_end;
114 
115   FLOAT *aa;
116 
117   k = K;
118 
119   a = (FLOAT *)A;
120   b = (FLOAT *)B;
121   c = (FLOAT *)C;
122 
123   lda = LDA;
124   ldb = LDB;
125   ldc = LDC;
126 
127   alpha = (FLOAT *)args -> alpha;
128   beta  = (FLOAT *)args -> beta;
129 
130   m_from = 0;
131   m_to   = M;
132 
133   if (range_m) {
134     m_from = *(((BLASLONG *)range_m) + 0);
135     m_to   = *(((BLASLONG *)range_m) + 1);
136   }
137 
138   n_from = 0;
139   n_to   = N;
140 
141   if (range_n) {
142     n_from = *(((BLASLONG *)range_n) + 0);
143     n_to   = *(((BLASLONG *)range_n) + 1);
144   }
145 
146   if (beta) {
147 #if !defined(COMPLEX) || defined(HER2K)
148     if (beta[0] != ONE)
149 #else
150     if ((beta[0] != ONE) || (beta[1] != ZERO))
151 #endif
152       syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
153   }
154 
155   if ((k == 0) || (alpha == NULL)) return 0;
156 
157   if ((alpha[0] == ZERO)
158 #ifdef COMPLEX
159       && (alpha[1] == ZERO)
160 #endif
161       ) return 0;
162 
163   for(js = n_from; js < n_to; js += GEMM_R){
164     min_j = n_to - js;
165     if (min_j > GEMM_R) min_j = GEMM_R;
166 
167 #ifndef LOWER
168     m_start = m_from;
169     m_end   = js + min_j;
170     if (m_end > m_to) m_end = m_to;
171 #else
172     m_start = m_from;
173     m_end   = m_to;
174     if (m_start < js) m_start = js;
175 #endif
176 
177     for(ls = 0; ls < k; ls += min_l){
178       min_l = k - ls;
179       if (min_l >= GEMM_Q * 2) {
180 	min_l = GEMM_Q;
181       } else
182 	if (min_l > GEMM_Q) {
183 	  min_l = (min_l + 1) / 2;
184 	}
185 
186       min_i = m_end - m_start;
187 
188       if (min_i >= GEMM_P * 2) {
189 	min_i = GEMM_P;
190       } else
191 	if (min_i > GEMM_P) {
192 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
193 	}
194 
195 #ifndef LOWER
196 
197       if (m_start >= js) {
198 
199 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
200 
201 	aa = sb + min_l * (m_start - js)  * COMPSIZE;
202 
203 	OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
204 
205 	KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1);
206 
207 	jjs = m_start + min_i;
208 
209       } else {
210 
211 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
212 
213 	jjs = js;
214       }
215 
216       for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
217 	min_jj = min_j + js - jjs;
218 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
219 
220 	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
221 
222 	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
223 			 sa, sb + min_l * (jjs - js)  * COMPSIZE,
224 			 c, ldc, m_start, jjs, 1);
225       }
226 
227       for(is = m_start + min_i; is < m_end; is += min_i){
228 	min_i = m_end - is;
229 	if (min_i >= GEMM_P * 2) {
230 	  min_i = GEMM_P;
231 	} else
232 	  if (min_i > GEMM_P) {
233 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
234 	  }
235 
236 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
237 
238 	KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
239 
240       }
241 
242       min_i = m_end - m_start;
243 
244       if (min_i >= GEMM_P * 2) {
245 	min_i = GEMM_P;
246       } else
247 	if (min_i > GEMM_P) {
248 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
249 	}
250 
251       if (m_start >= js) {
252 
253 	ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
254 
255 	aa = sb + min_l * (m_start - js)  * COMPSIZE;
256 
257 	OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
258 
259 	KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0);
260 
261 	jjs = m_start + min_i;
262 
263       } else {
264 
265 	ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
266 
267 	jjs = js;
268       }
269 
270       for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
271 	min_jj = min_j + js - jjs;
272 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
273 
274 	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
275 
276 	KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
277 			 sa, sb + min_l * (jjs - js)  * COMPSIZE,
278 			 c, ldc, m_start, jjs, 0);
279       }
280 
281       for(is = m_start + min_i; is < m_end; is += min_i){
282 	min_i = m_end - is;
283 	if (min_i >= GEMM_P * 2) {
284 	  min_i = GEMM_P;
285 	} else
286 	  if (min_i > GEMM_P) {
287 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
288 	  }
289 
290 	ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
291 
292 	KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
293 
294       }
295 
296 #else
297 
298       aa = sb + min_l * (m_start - js) * COMPSIZE;
299 
300       ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
301 
302       OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
303 
304       KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
305 		       sa, aa, c, ldc, m_start, m_start, 1);
306 
307       for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
308 	min_jj = m_start - jjs;
309 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
310 
311 	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
312 
313 	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
314 			 sa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs, 1);
315       }
316 
317       for(is = m_start + min_i; is < m_end; is += min_i){
318 
319 	min_i = m_end - is;
320 
321 	if (min_i >= GEMM_P * 2) {
322 	  min_i = GEMM_P;
323 	} else
324 	  if (min_i > GEMM_P) {
325 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
326 	  }
327 
328 	aa = sb + min_l * (is - js) * COMPSIZE;
329 
330 	if (is  < js + min_j) {
331 
332 	  ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
333 
334 	  OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa);
335 
336 	  KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha,  sa, aa,  c, ldc, is, is, 1);
337 
338 	  KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb,  c, ldc, is, js, 1);
339 
340 	  } else {
341 
342 	    ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
343 
344 	    KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb,  c, ldc, is, js, 1);
345 
346 	  }
347 
348       }
349 
350       min_i = m_end - m_start;
351 
352       if (min_i >= GEMM_P * 2) {
353 	min_i = GEMM_P;
354       } else
355 	if (min_i > GEMM_P) {
356 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
357 	}
358 
359       aa = sb + min_l * (m_start - js) * COMPSIZE;
360 
361       ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
362 
363       OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
364 
365       KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
366 		       sa, aa, c, ldc, m_start, m_start, 0);
367 
368       for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
369 	min_jj = m_start - jjs;
370 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
371 
372 	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
373 
374 	KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
375 			 sa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs, 0);
376       }
377 
378       for(is = m_start + min_i; is < m_end; is += min_i){
379 
380 	min_i = m_end - is;
381 
382 	if (min_i >= GEMM_P * 2) {
383 	  min_i = GEMM_P;
384 	} else
385 	  if (min_i > GEMM_P) {
386 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
387 	  }
388 
389 	aa = sb + min_l * (is - js) * COMPSIZE;
390 
391 	if (is  < js + min_j) {
392 
393 	  ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
394 
395 	  OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa);
396 
397 	  KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha,  sa, aa,  c, ldc, is, is, 0);
398 
399 	  KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb,  c, ldc, is, js, 0);
400 
401 	  } else {
402 
403 	    ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
404 
405 	    KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb,  c, ldc, is, js, 0);
406 
407 	  }
408 
409       }
410 
411 
412 
413 #endif
414     }
415   }
416 
417   return 0;
418 }
419