1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #include <stdio.h>
23 #include "common.h"
24
25 static FLOAT dp1 = 1.;
26
27 #ifndef COMPLEX
28 #define TRMM_KERNEL TRMM_KERNEL_RT
29 #define SYRK_KERNEL SYRK_KERNEL_U
30 #else
31 #define TRMM_KERNEL TRMM_KERNEL_RC
32 #ifdef XDOUBLE
33 #define SYRK_KERNEL xherk_kernel_UN
34 #elif defined(DOUBLE)
35 #define SYRK_KERNEL zherk_kernel_UN
36 #else
37 #define SYRK_KERNEL cherk_kernel_UN
38 #endif
39 #endif
40
41 #if 0
42 #undef GEMM_P
43 #undef GEMM_Q
44 #undef GEMM_R
45
46 #define GEMM_P 8
47 #define GEMM_Q 20
48 #define GEMM_R 24
49 #endif
50
51 #define GEMM_PQ MAX(GEMM_P, GEMM_Q)
52 #define REAL_GEMM_R (GEMM_R - GEMM_PQ)
53
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG myid)54 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
55
56 BLASLONG n, lda;
57 FLOAT *a;
58
59 BLASLONG j, bk, blocking;
60 BLASLONG is, ls, ks;
61 BLASLONG jjs, min_jj;
62
63 BLASLONG min_i, min_l, min_k;
64 BLASLONG range_N[2];
65
66 FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
67 + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
68 + GEMM_OFFSET_B);
69
70 #if 0
71 FLOAT *aa;
72 #endif
73
74 n = args -> n;
75 a = (FLOAT *)args -> a;
76 lda = args -> lda;
77
78 if (range_n) {
79 n = range_n[1] - range_n[0];
80 a += range_n[0] * (lda + 1) * COMPSIZE;
81 }
82
83 if (n <= DTB_ENTRIES) {
84 LAUU2_U(args, NULL, range_n, sa, sb, 0);
85 return 0;
86 }
87
88 blocking = GEMM_Q;
89 if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
90
91 for (j = 0; j < n; j += blocking) {
92 bk = n - j;
93 if (bk > blocking) bk = blocking;
94
95 if (j > 0) {
96
97 TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb);
98
99 for (ls = 0; ls < j; ls += REAL_GEMM_R) {
100 min_l = j - ls;
101
102 #if 0
103
104
105 if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
106 min_i = ls + min_l;
107 if (min_i > GEMM_P) min_i = GEMM_P;
108
109 if (ls > 0) {
110 GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
111 aa = sa;
112 } else {
113 aa = sb2;
114 }
115
116 for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
117 min_jj = ls + min_l - jjs;
118 if (min_jj > GEMM_P) min_jj = GEMM_P;
119
120 GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
121
122 SYRK_KERNEL(min_i, min_jj, bk, dp1,
123 aa,
124 sb2 + (jjs - ls) * bk * COMPSIZE,
125 a + (jjs * lda) * COMPSIZE, lda, - jjs);
126 }
127
128 if (ls + REAL_GEMM_R >= j ) {
129 for (ks = 0; ks < bk; ks += GEMM_P) {
130 min_k = bk - ks;
131 if (min_k > GEMM_P) min_k = GEMM_P;
132
133 TRMM_KERNEL(min_i, min_k, bk, dp1,
134 #ifdef COMPLEX
135 ZERO,
136 #endif
137 aa,
138 sb + ks * bk * COMPSIZE,
139 a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
140 }
141 }
142
143 for(is = min_i; is < ls + min_l ; is += GEMM_P){
144 min_i = ls + min_l - is;
145 if (min_i > GEMM_P) min_i = GEMM_P;
146
147 if (is < ls) {
148 GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
149 aa = sa;
150 } else {
151 aa = sb2 + (is - ls) * bk * COMPSIZE;
152 }
153
154 SYRK_KERNEL(min_i, min_l, bk, dp1,
155 aa,
156 sb2,
157 a + (is + ls * lda) * COMPSIZE, lda, is - ls);
158
159 if (ls + REAL_GEMM_R >= j ) {
160 for (ks = 0; ks < bk; ks += GEMM_P) {
161 min_k = bk - ks;
162 if (min_k > GEMM_P) min_k = GEMM_P;
163
164 TRMM_KERNEL(min_i, min_k, bk, dp1,
165 #ifdef COMPLEX
166 ZERO,
167 #endif
168 aa,
169 sb + ks * bk * COMPSIZE,
170 a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
171 }
172 }
173 }
174 #else
175 if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
176 min_i = ls + min_l;
177 if (min_i > GEMM_P) min_i = GEMM_P;
178
179 GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
180
181 for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
182 min_jj = ls + min_l - jjs;
183 if (min_jj > GEMM_P) min_jj = GEMM_P;
184
185 GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
186
187 SYRK_KERNEL(min_i, min_jj, bk, dp1,
188 sa,
189 sb2 + (jjs - ls) * bk * COMPSIZE,
190 a + (jjs * lda) * COMPSIZE, lda, - jjs);
191 }
192
193 if (ls + REAL_GEMM_R >= j ) {
194 for (ks = 0; ks < bk; ks += GEMM_P) {
195 min_k = bk - ks;
196 if (min_k > GEMM_P) min_k = GEMM_P;
197
198 TRMM_KERNEL(min_i, min_k, bk, dp1,
199 #ifdef COMPLEX
200 ZERO,
201 #endif
202 sa,
203 sb + ks * bk * COMPSIZE,
204 a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
205 }
206 }
207
208 for(is = min_i; is < ls + min_l ; is += GEMM_P){
209 min_i = ls + min_l - is;
210 if (min_i > GEMM_P) min_i = GEMM_P;
211
212 GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
213
214 SYRK_KERNEL(min_i, min_l, bk, dp1,
215 sa,
216 sb2,
217 a + (is + ls * lda) * COMPSIZE, lda, is - ls);
218
219 if (ls + REAL_GEMM_R >= j ) {
220 for (ks = 0; ks < bk; ks += GEMM_P) {
221 min_k = bk - ks;
222 if (min_k > GEMM_P) min_k = GEMM_P;
223
224 TRMM_KERNEL(min_i, min_k, bk, dp1,
225 #ifdef COMPLEX
226 ZERO,
227 #endif
228 sa,
229 sb + ks * bk * COMPSIZE,
230 a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
231 }
232 }
233 }
234 #endif
235 } /* end of ls */
236 }
237
238 if (!range_n) {
239 range_N[0] = j;
240 range_N[1] = j + bk;
241 } else {
242 range_N[0] = range_n[0] + j;
243 range_N[1] = range_n[0] + j + bk;
244 }
245
246 CNAME(args, NULL, range_N, sa, sb, 0);
247
248 }
249
250 return 0;
251 }
252