1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include "common.h"
41 
42 static FLOAT dp1 =  1.;
43 
44 #ifndef COMPLEX
45 #define TRMM_KERNEL	TRMM_KERNEL_RT
46 #define SYRK_KERNEL	SYRK_KERNEL_U
47 #else
48 #define TRMM_KERNEL	TRMM_KERNEL_RC
49 #ifdef XDOUBLE
50 #define SYRK_KERNEL	xherk_kernel_UN
51 #elif defined(DOUBLE)
52 #define SYRK_KERNEL	zherk_kernel_UN
53 #else
54 #define SYRK_KERNEL	cherk_kernel_UN
55 #endif
56 #endif
57 
58 #if 0
59 #undef GEMM_P
60 #undef GEMM_Q
61 #undef GEMM_R
62 
63 #define GEMM_P 8
64 #define GEMM_Q 20
65 #define GEMM_R 24
66 #endif
67 
68 #define GEMM_PQ  MAX(GEMM_P, GEMM_Q)
69 #define REAL_GEMM_R (GEMM_R - GEMM_PQ)
70 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG myid)71 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
72 
73   BLASLONG  n, lda;
74   FLOAT *a;
75 
76   BLASLONG j, bk, blocking;
77   BLASLONG is, ls, ks;
78   BLASLONG jjs, min_jj;
79 
80   BLASLONG min_i, min_l, min_k;
81   BLASLONG range_N[2];
82 
83   FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
84 		    + GEMM_PQ  * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
85 		  + GEMM_OFFSET_B);
86 
87 #if 0
88   FLOAT *aa;
89 #endif
90 
91   n      = args -> n;
92   a      = (FLOAT *)args -> a;
93   lda    = args -> lda;
94 
95   if (range_n) {
96     n      = range_n[1] - range_n[0];
97     a     += range_n[0] * (lda + 1) * COMPSIZE;
98   }
99 
100   if (n <= DTB_ENTRIES) {
101     LAUU2_U(args, NULL, range_n, sa, sb, 0);
102     return 0;
103   }
104 
105   blocking = GEMM_Q;
106   if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
107 
108   for (j = 0; j < n; j += blocking) {
109     bk = n - j;
110     if (bk > blocking) bk = blocking;
111 
112     if (j > 0) {
113 
114       TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb);
115 
116       for (ls = 0; ls < j; ls += REAL_GEMM_R) {
117 	min_l = j - ls;
118 
119 #if 0
120 
121 
122 	if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
123 	min_i = ls + min_l;
124 	if (min_i > GEMM_P) min_i = GEMM_P;
125 
126 	if (ls > 0) {
127 	  GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
128 	  aa = sa;
129 	} else {
130 	  aa = sb2;
131 	}
132 
133 	for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
134 	  min_jj = ls + min_l - jjs;
135 	  if (min_jj > GEMM_P) min_jj = GEMM_P;
136 
137 	  GEMM_OTCOPY(bk,  min_jj,  a + (jjs  + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
138 
139 	  SYRK_KERNEL(min_i, min_jj, bk, dp1,
140 		      aa,
141 		      sb2 + (jjs - ls) * bk * COMPSIZE,
142 		      a + (jjs * lda) * COMPSIZE, lda, - jjs);
143 	}
144 
145 	if (ls + REAL_GEMM_R >= j ) {
146 	  for (ks = 0; ks < bk; ks += GEMM_P) {
147 	    min_k = bk - ks;
148 	    if (min_k > GEMM_P) min_k = GEMM_P;
149 
150 	    TRMM_KERNEL(min_i, min_k, bk, dp1,
151 #ifdef COMPLEX
152 			ZERO,
153 #endif
154 			aa,
155 			sb + ks * bk * COMPSIZE,
156 			a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
157 	  }
158 	}
159 
160 	for(is = min_i; is < ls + min_l ; is += GEMM_P){
161 	  min_i = ls + min_l - is;
162 	  if (min_i > GEMM_P) min_i = GEMM_P;
163 
164 	  if (is < ls) {
165 	    GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
166 	    aa = sa;
167 	  } else {
168 	    aa = sb2 + (is - ls) * bk * COMPSIZE;
169 	  }
170 
171 	  SYRK_KERNEL(min_i, min_l, bk, dp1,
172 		      aa,
173 		      sb2,
174 		      a + (is + ls * lda) * COMPSIZE, lda, is - ls);
175 
176 	  if (ls + REAL_GEMM_R >= j ) {
177 	    for (ks = 0; ks < bk; ks += GEMM_P) {
178 	      min_k = bk - ks;
179 	      if (min_k > GEMM_P) min_k = GEMM_P;
180 
181 	      TRMM_KERNEL(min_i, min_k, bk, dp1,
182 #ifdef COMPLEX
183 			  ZERO,
184 #endif
185 			  aa,
186 			  sb + ks * bk * COMPSIZE,
187 			  a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
188 	    }
189 	  }
190 	}
191 #else
192 	if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
193 	min_i = ls + min_l;
194 	if (min_i > GEMM_P) min_i = GEMM_P;
195 
196 	GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
197 
198 	for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
199 	  min_jj = ls + min_l - jjs;
200 	  if (min_jj > GEMM_P) min_jj = GEMM_P;
201 
202 	  GEMM_OTCOPY(bk,  min_jj,  a + (jjs  + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
203 
204 	  SYRK_KERNEL(min_i, min_jj, bk, dp1,
205 		      sa,
206 		      sb2 + (jjs - ls) * bk * COMPSIZE,
207 		      a + (jjs * lda) * COMPSIZE, lda, - jjs);
208 	}
209 
210 	if (ls + REAL_GEMM_R >= j ) {
211 	  for (ks = 0; ks < bk; ks += GEMM_P) {
212 	    min_k = bk - ks;
213 	    if (min_k > GEMM_P) min_k = GEMM_P;
214 
215 	    TRMM_KERNEL(min_i, min_k, bk, dp1,
216 #ifdef COMPLEX
217 			ZERO,
218 #endif
219 			sa,
220 			sb + ks * bk * COMPSIZE,
221 			a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
222 	  }
223 	}
224 
225 	for(is = min_i; is < ls + min_l ; is += GEMM_P){
226 	  min_i = ls + min_l - is;
227 	  if (min_i > GEMM_P) min_i = GEMM_P;
228 
229 	  GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
230 
231 	  SYRK_KERNEL(min_i, min_l, bk, dp1,
232 		      sa,
233 		      sb2,
234 		      a + (is + ls * lda) * COMPSIZE, lda, is - ls);
235 
236 	  if (ls + REAL_GEMM_R >= j ) {
237 	    for (ks = 0; ks < bk; ks += GEMM_P) {
238 	      min_k = bk - ks;
239 	      if (min_k > GEMM_P) min_k = GEMM_P;
240 
241 	      TRMM_KERNEL(min_i, min_k, bk, dp1,
242 #ifdef COMPLEX
243 			  ZERO,
244 #endif
245 			  sa,
246 			  sb + ks * bk * COMPSIZE,
247 			  a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
248 	    }
249 	  }
250 	}
251 #endif
252       } /* end of ls */
253     }
254 
255     if (!range_n) {
256       range_N[0] = j;
257       range_N[1] = j + bk;
258     } else {
259       range_N[0] = range_n[0] + j;
260       range_N[1] = range_n[0] + j + bk;
261     }
262 
263     CNAME(args, NULL, range_N, sa, sb, 0);
264 
265   }
266 
267   return 0;
268 }
269