1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #ifndef KERNEL_OPERATION
23 #ifndef COMPLEX
24 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
25 KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
26 #else
27 #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \
28 KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG)
29 #endif
30 #endif
31
32 #ifndef KERNEL_OPERATION_C
33 #define KERNEL_OPERATION_C KERNEL_OPERATION
34 #endif
35
36 #ifndef ICOPY_OPERATION
37 #ifndef TRANS
38 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
39 #else
40 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
41 #endif
42 #endif
43
44 #ifndef OCOPY_OPERATION
45 #ifdef TRANS
46 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
47 #else
48 #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
49 #endif
50 #endif
51
52 #ifndef M
53 #define M args -> n
54 #endif
55
56 #ifndef N
57 #define N args -> n
58 #endif
59
60 #ifndef K
61 #define K args -> k
62 #endif
63
64 #ifndef A
65 #define A args -> a
66 #endif
67
68 #ifndef B
69 #define B args -> b
70 #endif
71
72 #ifndef C
73 #define C args -> c
74 #endif
75
76 #ifndef LDA
77 #define LDA args -> lda
78 #endif
79
80 #ifndef LDB
81 #define LDB args -> ldb
82 #endif
83
84 #ifndef LDC
85 #define LDC args -> ldc
86 #endif
87
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG dummy)88 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {
89
90 BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc;
91 FLOAT *a, *b, *c, *alpha, *beta;
92
93 BLASLONG ls, is, js;
94 BLASLONG min_l, min_i, min_j;
95 BLASLONG jjs, min_jj;
96 BLASLONG m_start, m_end;
97
98 FLOAT *aa;
99
100 k = K;
101
102 a = (FLOAT *)A;
103 b = (FLOAT *)B;
104 c = (FLOAT *)C;
105
106 lda = LDA;
107 ldb = LDB;
108 ldc = LDC;
109
110 alpha = (FLOAT *)args -> alpha;
111 beta = (FLOAT *)args -> beta;
112
113 m_from = 0;
114 m_to = M;
115
116 if (range_m) {
117 m_from = *(((BLASLONG *)range_m) + 0);
118 m_to = *(((BLASLONG *)range_m) + 1);
119 }
120
121 n_from = 0;
122 n_to = N;
123
124 if (range_n) {
125 n_from = *(((BLASLONG *)range_n) + 0);
126 n_to = *(((BLASLONG *)range_n) + 1);
127 }
128
129 if (beta) {
130 #if !defined(COMPLEX) || defined(HER2K)
131 if (beta[0] != ONE)
132 #else
133 if ((beta[0] != ONE) || (beta[1] != ZERO))
134 #endif
135 syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
136 }
137
138 if ((k == 0) || (alpha == NULL)) return 0;
139
140 if ((alpha[0] == ZERO)
141 #ifdef COMPLEX
142 && (alpha[1] == ZERO)
143 #endif
144 ) return 0;
145
146 for(js = n_from; js < n_to; js += GEMM_R){
147 min_j = n_to - js;
148 if (min_j > GEMM_R) min_j = GEMM_R;
149
150 #ifndef LOWER
151 m_start = m_from;
152 m_end = js + min_j;
153 if (m_end > m_to) m_end = m_to;
154 #else
155 m_start = m_from;
156 m_end = m_to;
157 if (m_start < js) m_start = js;
158 #endif
159
160 for(ls = 0; ls < k; ls += min_l){
161 min_l = k - ls;
162 if (min_l >= GEMM_Q * 2) {
163 min_l = GEMM_Q;
164 } else
165 if (min_l > GEMM_Q) {
166 min_l = (min_l + 1) / 2;
167 }
168
169 min_i = m_end - m_start;
170
171 if (min_i >= GEMM_P * 2) {
172 min_i = GEMM_P;
173 } else
174 if (min_i > GEMM_P) {
175 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
176 }
177
178 #ifndef LOWER
179
180 if (m_start >= js) {
181
182 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
183
184 aa = sb + min_l * (m_start - js) * COMPSIZE;
185
186 OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
187
188 KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1);
189
190 jjs = m_start + min_i;
191
192 } else {
193
194 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
195
196 jjs = js;
197 }
198
199 for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
200 min_jj = min_j + js - jjs;
201 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
202
203 OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
204
205 KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
206 sa, sb + min_l * (jjs - js) * COMPSIZE,
207 c, ldc, m_start, jjs, 1);
208 }
209
210 for(is = m_start + min_i; is < m_end; is += min_i){
211 min_i = m_end - is;
212 if (min_i >= GEMM_P * 2) {
213 min_i = GEMM_P;
214 } else
215 if (min_i > GEMM_P) {
216 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
217 }
218
219 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
220
221 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
222
223 }
224
225 min_i = m_end - m_start;
226
227 if (min_i >= GEMM_P * 2) {
228 min_i = GEMM_P;
229 } else
230 if (min_i > GEMM_P) {
231 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
232 }
233
234 if (m_start >= js) {
235
236 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
237
238 aa = sb + min_l * (m_start - js) * COMPSIZE;
239
240 OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
241
242 KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0);
243
244 jjs = m_start + min_i;
245
246 } else {
247
248 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
249
250 jjs = js;
251 }
252
253 for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
254 min_jj = min_j + js - jjs;
255 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
256
257 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
258
259 KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
260 sa, sb + min_l * (jjs - js) * COMPSIZE,
261 c, ldc, m_start, jjs, 0);
262 }
263
264 for(is = m_start + min_i; is < m_end; is += min_i){
265 min_i = m_end - is;
266 if (min_i >= GEMM_P * 2) {
267 min_i = GEMM_P;
268 } else
269 if (min_i > GEMM_P) {
270 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
271 }
272
273 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
274
275 KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
276
277 }
278
279 #else
280
281 aa = sb + min_l * (m_start - js) * COMPSIZE;
282
283 ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
284
285 OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
286
287 KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
288 sa, aa, c, ldc, m_start, m_start, 1);
289
290 for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
291 min_jj = m_start - jjs;
292 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
293
294 OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
295
296 KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
297 sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1);
298 }
299
300 for(is = m_start + min_i; is < m_end; is += min_i){
301
302 min_i = m_end - is;
303
304 if (min_i >= GEMM_P * 2) {
305 min_i = GEMM_P;
306 } else
307 if (min_i > GEMM_P) {
308 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
309 }
310
311 aa = sb + min_l * (is - js) * COMPSIZE;
312
313 if (is < js + min_j) {
314
315 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
316
317 OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa);
318
319 KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1);
320
321 KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1);
322
323 } else {
324
325 ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
326
327 KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
328
329 }
330
331 }
332
333 min_i = m_end - m_start;
334
335 if (min_i >= GEMM_P * 2) {
336 min_i = GEMM_P;
337 } else
338 if (min_i > GEMM_P) {
339 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
340 }
341
342 aa = sb + min_l * (m_start - js) * COMPSIZE;
343
344 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
345
346 OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
347
348 KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
349 sa, aa, c, ldc, m_start, m_start, 0);
350
351 for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
352 min_jj = m_start - jjs;
353 if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
354
355 OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
356
357 KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
358 sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0);
359 }
360
361 for(is = m_start + min_i; is < m_end; is += min_i){
362
363 min_i = m_end - is;
364
365 if (min_i >= GEMM_P * 2) {
366 min_i = GEMM_P;
367 } else
368 if (min_i > GEMM_P) {
369 min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
370 }
371
372 aa = sb + min_l * (is - js) * COMPSIZE;
373
374 if (is < js + min_j) {
375
376 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
377
378 OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa);
379
380 KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0);
381
382 KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0);
383
384 } else {
385
386 ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
387
388 KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
389
390 }
391
392 }
393
394
395
396 #endif
397 }
398 }
399
400 return 0;
401 }
402