1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #ifndef KERNEL_OPERATION
40 #ifndef COMPLEX
41 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
42 KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
43 #else
44 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
45 KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
46 #endif
47 #endif
48
49 #ifndef KERNEL_OPERATION_C
50 #define KERNEL_OPERATION_C KERNEL_OPERATION
51 #endif
52
53 #ifndef ICOPY_OPERATION
54 #ifndef TRANS
55 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
56 #else
57 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
58 #endif
59 #endif
60
61 #ifndef OCOPY_OPERATION
62 #ifdef TRANS
63 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
64 #else
65 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
66 #endif
67 #endif
68
69 #ifndef M
70 #define M args -> n
71 #endif
72
73 #ifndef N
74 #define N args -> n
75 #endif
76
77 #ifndef K
78 #define K args -> k
79 #endif
80
81 #ifndef A
82 #define A args -> a
83 #endif
84
85 #ifndef B
86 #define B args -> b
87 #endif
88
89 #ifndef C
90 #define C args -> c
91 #endif
92
93 #ifndef LDA
94 #define LDA args -> lda
95 #endif
96
97 #ifndef LDB
98 #define LDB args -> ldb
99 #endif
100
101 #ifndef LDC
102 #define LDC args -> ldc
103 #endif
104
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)105 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
106
107 BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc;
108 FLOAT *a, *b, *c, *alpha, *beta;
109
110 BLASLONG ls, is, js;
111 BLASLONG min_l, min_i, min_j;
112 BLASLONG jjs, min_jj;
113 BLASLONG m_start, m_end;
114
115 FLOAT *aa;
116
117 k = K;
118
119 a = (FLOAT *)A;
120 b = (FLOAT *)B;
121 c = (FLOAT *)C;
122
123 lda = LDA;
124 ldb = LDB;
125 ldc = LDC;
126
127 alpha = (FLOAT *)args -> alpha;
128 beta = (FLOAT *)args -> beta;
129
130 m_from = 0;
131 m_to = M;
132
133 if (range_m) {
134 m_from = *(((BLASLONG *)range_m) + 0);
135 m_to = *(((BLASLONG *)range_m) + 1);
136 }
137
138 n_from = 0;
139 n_to = N;
140
141 if (range_n) {
142 n_from = *(((BLASLONG *)range_n) + 0);
143 n_to = *(((BLASLONG *)range_n) + 1);
144 }
145
146 if (beta) {
147 #if !defined(COMPLEX) || defined(HER2K)
148 if (beta[0] != ONE)
149 #else
150 if ((beta[0] != ONE) || (beta[1] != ZERO))
151 #endif
152 syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
153 }
154
155 if ((k == 0) || (alpha == NULL)) return 0;
156
157 if ((alpha[0] == ZERO)
158 #ifdef COMPLEX
159 && (alpha[1] == ZERO)
160 #endif
161 ) return 0;
162
163 for(js = n_from; js < n_to; js += GEMM_R){
164 min_j = n_to - js;
165 if (min_j > GEMM_R) min_j = GEMM_R;
166
167 #ifndef LOWER
168 m_start = m_from;
169 m_end = js + min_j;
170 if (m_end > m_to) m_end = m_to;
171 #else
172 m_start = m_from;
173 m_end = m_to;
174 if (m_start < js) m_start = js;
175 #endif
176
177 for(ls = 0; ls < k; ls += min_l){
178 min_l = k - ls;
179 if (min_l >= GEMM_Q * 2) {
180 min_l = GEMM_Q;
181 } else
182 if (min_l > GEMM_Q) {
183 min_l = (min_l + 1) / 2;
184 }
185
186 min_i = m_end - m_start;
187
188 if (min_i >= GEMM_P * 2) {
189 min_i = GEMM_P;
190 } else
191 if (min_i > GEMM_P) {
192 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
193 }
194
195 #ifndef LOWER
196
197 if (m_start >= js) {
198
199 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
200
201 aa = sb + min_l * (m_start - js) * COMPSIZE;
202
203 OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
204
205 KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1);
206
207 jjs = m_start + min_i;
208
209 } else {
210
211 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
212
213 jjs = js;
214 }
215
216 for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
217 min_jj = min_j + js - jjs;
218 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
219
220 OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
221
222 KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
223 sa, sb + min_l * (jjs - js) * COMPSIZE,
224 c, ldc, m_start, jjs, 1);
225 }
226
227 for(is = m_start + min_i; is < m_end; is += min_i){
228 min_i = m_end - is;
229 if (min_i >= GEMM_P * 2) {
230 min_i = GEMM_P;
231 } else
232 if (min_i > GEMM_P) {
233 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
234 }
235
236 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
237
238 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
239
240 }
241
242 min_i = m_end - m_start;
243
244 if (min_i >= GEMM_P * 2) {
245 min_i = GEMM_P;
246 } else
247 if (min_i > GEMM_P) {
248 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
249 }
250
251 if (m_start >= js) {
252
253 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
254
255 aa = sb + min_l * (m_start - js) * COMPSIZE;
256
257 OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
258
259 KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0);
260
261 jjs = m_start + min_i;
262
263 } else {
264
265 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
266
267 jjs = js;
268 }
269
270 for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
271 min_jj = min_j + js - jjs;
272 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
273
274 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
275
276 KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
277 sa, sb + min_l * (jjs - js) * COMPSIZE,
278 c, ldc, m_start, jjs, 0);
279 }
280
281 for(is = m_start + min_i; is < m_end; is += min_i){
282 min_i = m_end - is;
283 if (min_i >= GEMM_P * 2) {
284 min_i = GEMM_P;
285 } else
286 if (min_i > GEMM_P) {
287 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
288 }
289
290 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
291
292 KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
293
294 }
295
296 #else
297
298 aa = sb + min_l * (m_start - js) * COMPSIZE;
299
300 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
301
302 OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
303
304 KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
305 sa, aa, c, ldc, m_start, m_start, 1);
306
307 for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
308 min_jj = m_start - jjs;
309 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
310
311 OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
312
313 KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
314 sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1);
315 }
316
317 for(is = m_start + min_i; is < m_end; is += min_i){
318
319 min_i = m_end - is;
320
321 if (min_i >= GEMM_P * 2) {
322 min_i = GEMM_P;
323 } else
324 if (min_i > GEMM_P) {
325 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
326 }
327
328 aa = sb + min_l * (is - js) * COMPSIZE;
329
330 if (is < js + min_j) {
331
332 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
333
334 OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa);
335
336 KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1);
337
338 KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1);
339
340 } else {
341
342 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
343
344 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
345
346 }
347
348 }
349
350 min_i = m_end - m_start;
351
352 if (min_i >= GEMM_P * 2) {
353 min_i = GEMM_P;
354 } else
355 if (min_i > GEMM_P) {
356 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
357 }
358
359 aa = sb + min_l * (m_start - js) * COMPSIZE;
360
361 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
362
363 OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
364
365 KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
366 sa, aa, c, ldc, m_start, m_start, 0);
367
368 for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
369 min_jj = m_start - jjs;
370 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
371
372 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
373
374 KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
375 sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0);
376 }
377
378 for(is = m_start + min_i; is < m_end; is += min_i){
379
380 min_i = m_end - is;
381
382 if (min_i >= GEMM_P * 2) {
383 min_i = GEMM_P;
384 } else
385 if (min_i > GEMM_P) {
386 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
387 }
388
389 aa = sb + min_l * (is - js) * COMPSIZE;
390
391 if (is < js + min_j) {
392
393 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
394
395 OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa);
396
397 KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0);
398
399 KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0);
400
401 } else {
402
403 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
404
405 KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
406
407 }
408
409 }
410
411
412
413 #endif
414 }
415 }
416
417 return 0;
418 }
419