1 /*********************************************************************/
2 /*                                                                   */
3 /*             Optimized BLAS libraries                              */
4 /*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5 /*                                                                   */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of     */
16 /* profits, interruption of business, or related expenses which may  */
17 /* arise from use of Software or Documentation, including but not    */
18 /* limited to those resulting from defects in Software and/or        */
19 /* Documentation, or loss or inaccuracy of data of any kind.         */
20 /*********************************************************************/
21 
22 #ifndef KERNEL_OPERATION
23 #ifndef COMPLEX
24 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
25 	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
26 #else
27 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
28 	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
29 #endif
30 #endif
31 
32 #ifndef KERNEL_OPERATION_C
33 #define KERNEL_OPERATION_C KERNEL_OPERATION
34 #endif
35 
36 #ifndef ICOPY_OPERATION
37 #ifndef TRANS
38 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
39 #else
40 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
41 #endif
42 #endif
43 
44 #ifndef OCOPY_OPERATION
45 #ifdef TRANS
46 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
47 #else
48 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
49 #endif
50 #endif
51 
52 #ifndef M
53 #define M	args -> n
54 #endif
55 
56 #ifndef N
57 #define N	args -> n
58 #endif
59 
60 #ifndef K
61 #define K	args -> k
62 #endif
63 
64 #ifndef A
65 #define A	args -> a
66 #endif
67 
68 #ifndef B
69 #define B	args -> b
70 #endif
71 
72 #ifndef C
73 #define C	args -> c
74 #endif
75 
76 #ifndef LDA
77 #define LDA	args -> lda
78 #endif
79 
80 #ifndef LDB
81 #define LDB	args -> ldb
82 #endif
83 
84 #ifndef LDC
85 #define LDC	args -> ldc
86 #endif
87 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)88 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
89 
90   BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc;
91   FLOAT *a, *b, *c, *alpha, *beta;
92 
93   BLASLONG ls, is, js;
94   BLASLONG min_l, min_i, min_j;
95   BLASLONG jjs, min_jj;
96   BLASLONG m_start, m_end;
97 
98   FLOAT *aa;
99 
100   k = K;
101 
102   a = (FLOAT *)A;
103   b = (FLOAT *)B;
104   c = (FLOAT *)C;
105 
106   lda = LDA;
107   ldb = LDB;
108   ldc = LDC;
109 
110   alpha = (FLOAT *)args -> alpha;
111   beta  = (FLOAT *)args -> beta;
112 
113   m_from = 0;
114   m_to   = M;
115 
116   if (range_m) {
117     m_from = *(((BLASLONG *)range_m) + 0);
118     m_to   = *(((BLASLONG *)range_m) + 1);
119   }
120 
121   n_from = 0;
122   n_to   = N;
123 
124   if (range_n) {
125     n_from = *(((BLASLONG *)range_n) + 0);
126     n_to   = *(((BLASLONG *)range_n) + 1);
127   }
128 
129   if (beta) {
130 #if !defined(COMPLEX) || defined(HER2K)
131     if (beta[0] != ONE)
132 #else
133     if ((beta[0] != ONE) || (beta[1] != ZERO))
134 #endif
135       syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
136   }
137 
138   if ((k == 0) || (alpha == NULL)) return 0;
139 
140   if ((alpha[0] == ZERO)
141 #ifdef COMPLEX
142       && (alpha[1] == ZERO)
143 #endif
144       ) return 0;
145 
146   for(js = n_from; js < n_to; js += GEMM_R){
147     min_j = n_to - js;
148     if (min_j > GEMM_R) min_j = GEMM_R;
149 
150 #ifndef LOWER
151     m_start = m_from;
152     m_end   = js + min_j;
153     if (m_end > m_to) m_end = m_to;
154 #else
155     m_start = m_from;
156     m_end   = m_to;
157     if (m_start < js) m_start = js;
158 #endif
159 
160     for(ls = 0; ls < k; ls += min_l){
161       min_l = k - ls;
162       if (min_l >= GEMM_Q * 2) {
163 	min_l = GEMM_Q;
164       } else
165 	if (min_l > GEMM_Q) {
166 	  min_l = (min_l + 1) / 2;
167 	}
168 
169       min_i = m_end - m_start;
170 
171       if (min_i >= GEMM_P * 2) {
172 	min_i = GEMM_P;
173       } else
174 	if (min_i > GEMM_P) {
175 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
176 	}
177 
178 #ifndef LOWER
179 
180       if (m_start >= js) {
181 
182 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
183 
184 	aa = sb + min_l * (m_start - js)  * COMPSIZE;
185 
186 	OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
187 
188 	KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1);
189 
190 	jjs = m_start + min_i;
191 
192       } else {
193 
194 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
195 
196 	jjs = js;
197       }
198 
199       for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
200 	min_jj = min_j + js - jjs;
201 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
202 
203 	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
204 
205 	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
206 			 sa, sb + min_l * (jjs - js)  * COMPSIZE,
207 			 c, ldc, m_start, jjs, 1);
208       }
209 
210       for(is = m_start + min_i; is < m_end; is += min_i){
211 	min_i = m_end - is;
212 	if (min_i >= GEMM_P * 2) {
213 	  min_i = GEMM_P;
214 	} else
215 	  if (min_i > GEMM_P) {
216 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
217 	  }
218 
219 	ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
220 
221 	KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
222 
223       }
224 
225       min_i = m_end - m_start;
226 
227       if (min_i >= GEMM_P * 2) {
228 	min_i = GEMM_P;
229       } else
230 	if (min_i > GEMM_P) {
231 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
232 	}
233 
234       if (m_start >= js) {
235 
236 	ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
237 
238 	aa = sb + min_l * (m_start - js)  * COMPSIZE;
239 
240 	OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
241 
242 	KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0);
243 
244 	jjs = m_start + min_i;
245 
246       } else {
247 
248 	ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
249 
250 	jjs = js;
251       }
252 
253       for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
254 	min_jj = min_j + js - jjs;
255 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
256 
257 	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
258 
259 	KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
260 			 sa, sb + min_l * (jjs - js)  * COMPSIZE,
261 			 c, ldc, m_start, jjs, 0);
262       }
263 
264       for(is = m_start + min_i; is < m_end; is += min_i){
265 	min_i = m_end - is;
266 	if (min_i >= GEMM_P * 2) {
267 	  min_i = GEMM_P;
268 	} else
269 	  if (min_i > GEMM_P) {
270 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
271 	  }
272 
273 	ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
274 
275 	KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
276 
277       }
278 
279 #else
280 
281       aa = sb + min_l * (m_start - js) * COMPSIZE;
282 
283       ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
284 
285       OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
286 
287       KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
288 		       sa, aa, c, ldc, m_start, m_start, 1);
289 
290       for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
291 	min_jj = m_start - jjs;
292 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
293 
294 	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
295 
296 	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
297 			 sa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs, 1);
298       }
299 
300       for(is = m_start + min_i; is < m_end; is += min_i){
301 
302 	min_i = m_end - is;
303 
304 	if (min_i >= GEMM_P * 2) {
305 	  min_i = GEMM_P;
306 	} else
307 	  if (min_i > GEMM_P) {
308 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
309 	  }
310 
311 	aa = sb + min_l * (is - js) * COMPSIZE;
312 
313 	if (is  < js + min_j) {
314 
315 	  ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
316 
317 	  OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa);
318 
319 	  KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha,  sa, aa,  c, ldc, is, is, 1);
320 
321 	  KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb,  c, ldc, is, js, 1);
322 
323 	  } else {
324 
325 	    ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
326 
327 	    KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb,  c, ldc, is, js, 1);
328 
329 	  }
330 
331       }
332 
333       min_i = m_end - m_start;
334 
335       if (min_i >= GEMM_P * 2) {
336 	min_i = GEMM_P;
337       } else
338 	if (min_i > GEMM_P) {
339 	  min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
340 	}
341 
342       aa = sb + min_l * (m_start - js) * COMPSIZE;
343 
344       ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
345 
346       OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
347 
348       KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
349 		       sa, aa, c, ldc, m_start, m_start, 0);
350 
351       for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
352 	min_jj = m_start - jjs;
353 	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
354 
355 	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
356 
357 	KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
358 			 sa, sb + min_l * (jjs - js)  * COMPSIZE, c, ldc, m_start, jjs, 0);
359       }
360 
361       for(is = m_start + min_i; is < m_end; is += min_i){
362 
363 	min_i = m_end - is;
364 
365 	if (min_i >= GEMM_P * 2) {
366 	  min_i = GEMM_P;
367 	} else
368 	  if (min_i > GEMM_P) {
369 	    min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
370 	  }
371 
372 	aa = sb + min_l * (is - js) * COMPSIZE;
373 
374 	if (is  < js + min_j) {
375 
376 	  ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
377 
378 	  OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa);
379 
380 	  KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha,  sa, aa,  c, ldc, is, is, 0);
381 
382 	  KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb,  c, ldc, is, js, 0);
383 
384 	  } else {
385 
386 	    ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
387 
388 	    KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb,  c, ldc, is, js, 0);
389 
390 	  }
391 
392       }
393 
394 
395 
396 #endif
397     }
398   }
399 
400   return 0;
401 }
402