1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
42 static FLOAT dp1 = 1.;
43
44 #ifndef COMPLEX
45 #define TRMM_KERNEL TRMM_KERNEL_RT
46 #define SYRK_KERNEL SYRK_KERNEL_U
47 #else
48 #define TRMM_KERNEL TRMM_KERNEL_RC
49 #ifdef XDOUBLE
50 #define SYRK_KERNEL xherk_kernel_UN
51 #elif defined(DOUBLE)
52 #define SYRK_KERNEL zherk_kernel_UN
53 #else
54 #define SYRK_KERNEL cherk_kernel_UN
55 #endif
56 #endif
57
58 #if 0
59 #undef GEMM_P
60 #undef GEMM_Q
61 #undef GEMM_R
62
63 #define GEMM_P 8
64 #define GEMM_Q 20
65 #define GEMM_R 24
66 #endif
67
68 #define GEMM_PQ MAX(GEMM_P, GEMM_Q)
69 #define REAL_GEMM_R (GEMM_R - GEMM_PQ)
70
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG myid)71 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
72
73 BLASLONG n, lda;
74 FLOAT *a;
75
76 BLASLONG j, bk, blocking;
77 BLASLONG is, ls, ks;
78 BLASLONG jjs, min_jj;
79
80 BLASLONG min_i, min_l, min_k;
81 BLASLONG range_N[2];
82
83 FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
84 + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
85 + GEMM_OFFSET_B);
86
87 #if 0
88 FLOAT *aa;
89 #endif
90
91 n = args -> n;
92 a = (FLOAT *)args -> a;
93 lda = args -> lda;
94
95 if (range_n) {
96 n = range_n[1] - range_n[0];
97 a += range_n[0] * (lda + 1) * COMPSIZE;
98 }
99
100 if (n <= DTB_ENTRIES) {
101 LAUU2_U(args, NULL, range_n, sa, sb, 0);
102 return 0;
103 }
104
105 blocking = GEMM_Q;
106 if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
107
108 for (j = 0; j < n; j += blocking) {
109 bk = n - j;
110 if (bk > blocking) bk = blocking;
111
112 if (j > 0) {
113
114 TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb);
115
116 for (ls = 0; ls < j; ls += REAL_GEMM_R) {
117 min_l = j - ls;
118
119 #if 0
120
121
122 if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
123 min_i = ls + min_l;
124 if (min_i > GEMM_P) min_i = GEMM_P;
125
126 if (ls > 0) {
127 GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
128 aa = sa;
129 } else {
130 aa = sb2;
131 }
132
133 for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
134 min_jj = ls + min_l - jjs;
135 if (min_jj > GEMM_P) min_jj = GEMM_P;
136
137 GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
138
139 SYRK_KERNEL(min_i, min_jj, bk, dp1,
140 aa,
141 sb2 + (jjs - ls) * bk * COMPSIZE,
142 a + (jjs * lda) * COMPSIZE, lda, - jjs);
143 }
144
145 if (ls + REAL_GEMM_R >= j ) {
146 for (ks = 0; ks < bk; ks += GEMM_P) {
147 min_k = bk - ks;
148 if (min_k > GEMM_P) min_k = GEMM_P;
149
150 TRMM_KERNEL(min_i, min_k, bk, dp1,
151 #ifdef COMPLEX
152 ZERO,
153 #endif
154 aa,
155 sb + ks * bk * COMPSIZE,
156 a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
157 }
158 }
159
160 for(is = min_i; is < ls + min_l ; is += GEMM_P){
161 min_i = ls + min_l - is;
162 if (min_i > GEMM_P) min_i = GEMM_P;
163
164 if (is < ls) {
165 GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
166 aa = sa;
167 } else {
168 aa = sb2 + (is - ls) * bk * COMPSIZE;
169 }
170
171 SYRK_KERNEL(min_i, min_l, bk, dp1,
172 aa,
173 sb2,
174 a + (is + ls * lda) * COMPSIZE, lda, is - ls);
175
176 if (ls + REAL_GEMM_R >= j ) {
177 for (ks = 0; ks < bk; ks += GEMM_P) {
178 min_k = bk - ks;
179 if (min_k > GEMM_P) min_k = GEMM_P;
180
181 TRMM_KERNEL(min_i, min_k, bk, dp1,
182 #ifdef COMPLEX
183 ZERO,
184 #endif
185 aa,
186 sb + ks * bk * COMPSIZE,
187 a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
188 }
189 }
190 }
191 #else
192 if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
193 min_i = ls + min_l;
194 if (min_i > GEMM_P) min_i = GEMM_P;
195
196 GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
197
198 for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
199 min_jj = ls + min_l - jjs;
200 if (min_jj > GEMM_P) min_jj = GEMM_P;
201
202 GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
203
204 SYRK_KERNEL(min_i, min_jj, bk, dp1,
205 sa,
206 sb2 + (jjs - ls) * bk * COMPSIZE,
207 a + (jjs * lda) * COMPSIZE, lda, - jjs);
208 }
209
210 if (ls + REAL_GEMM_R >= j ) {
211 for (ks = 0; ks < bk; ks += GEMM_P) {
212 min_k = bk - ks;
213 if (min_k > GEMM_P) min_k = GEMM_P;
214
215 TRMM_KERNEL(min_i, min_k, bk, dp1,
216 #ifdef COMPLEX
217 ZERO,
218 #endif
219 sa,
220 sb + ks * bk * COMPSIZE,
221 a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
222 }
223 }
224
225 for(is = min_i; is < ls + min_l ; is += GEMM_P){
226 min_i = ls + min_l - is;
227 if (min_i > GEMM_P) min_i = GEMM_P;
228
229 GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
230
231 SYRK_KERNEL(min_i, min_l, bk, dp1,
232 sa,
233 sb2,
234 a + (is + ls * lda) * COMPSIZE, lda, is - ls);
235
236 if (ls + REAL_GEMM_R >= j ) {
237 for (ks = 0; ks < bk; ks += GEMM_P) {
238 min_k = bk - ks;
239 if (min_k > GEMM_P) min_k = GEMM_P;
240
241 TRMM_KERNEL(min_i, min_k, bk, dp1,
242 #ifdef COMPLEX
243 ZERO,
244 #endif
245 sa,
246 sb + ks * bk * COMPSIZE,
247 a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
248 }
249 }
250 }
251 #endif
252 } /* end of ls */
253 }
254
255 if (!range_n) {
256 range_N[0] = j;
257 range_N[1] = j + bk;
258 } else {
259 range_N[0] = range_n[0] + j;
260 range_N[1] = range_n[0] + j + bk;
261 }
262
263 CNAME(args, NULL, range_N, sa, sb, 0);
264
265 }
266
267 return 0;
268 }
269