1 /*********************************************************************/
2 /*                                                                   */
3 /*             Optimized BLAS libraries                              */
4 /*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5 /*                                                                   */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of     */
16 /* profits, interruption of business, or related expenses which may  */
17 /* arise from use of Software or Documentation, including but not    */
18 /* limited to those resulting from defects in Software and/or        */
19 /* Documentation, or loss or inaccuracy of data of any kind.         */
20 /*********************************************************************/
21 
22 #include <stdio.h>
23 #include "common.h"
24 
25 static FLOAT dp1 =  1.;
26 
27 #ifndef COMPLEX
28 #define TRMM_KERNEL	TRMM_KERNEL_RT
29 #define SYRK_KERNEL	SYRK_KERNEL_U
30 #else
31 #define TRMM_KERNEL	TRMM_KERNEL_RC
32 #ifdef XDOUBLE
33 #define SYRK_KERNEL	xherk_kernel_UN
34 #elif defined(DOUBLE)
35 #define SYRK_KERNEL	zherk_kernel_UN
36 #else
37 #define SYRK_KERNEL	cherk_kernel_UN
38 #endif
39 #endif
40 
41 #if 0
42 #undef GEMM_P
43 #undef GEMM_Q
44 #undef GEMM_R
45 
46 #define GEMM_P 8
47 #define GEMM_Q 20
48 #define GEMM_R 24
49 #endif
50 
51 #define GEMM_PQ  MAX(GEMM_P, GEMM_Q)
52 #define REAL_GEMM_R (GEMM_R - GEMM_PQ)
53 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG myid)54 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
55 
56   BLASLONG  n, lda;
57   FLOAT *a;
58 
59   BLASLONG j, bk, blocking;
60   BLASLONG is, ls, ks;
61   BLASLONG jjs, min_jj;
62 
63   BLASLONG min_i, min_l, min_k;
64   BLASLONG range_N[2];
65 
66   FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
67 		    + GEMM_PQ  * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
68 		  + GEMM_OFFSET_B);
69 
70 #if 0
71   FLOAT *aa;
72 #endif
73 
74   n      = args -> n;
75   a      = (FLOAT *)args -> a;
76   lda    = args -> lda;
77 
78   if (range_n) {
79     n      = range_n[1] - range_n[0];
80     a     += range_n[0] * (lda + 1) * COMPSIZE;
81   }
82 
83   if (n <= DTB_ENTRIES) {
84     LAUU2_U(args, NULL, range_n, sa, sb, 0);
85     return 0;
86   }
87 
88   blocking = GEMM_Q;
89   if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
90 
91   for (j = 0; j < n; j += blocking) {
92     bk = n - j;
93     if (bk > blocking) bk = blocking;
94 
95     if (j > 0) {
96 
97       TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb);
98 
99       for (ls = 0; ls < j; ls += REAL_GEMM_R) {
100 	min_l = j - ls;
101 
102 #if 0
103 
104 
105 	if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
106 	min_i = ls + min_l;
107 	if (min_i > GEMM_P) min_i = GEMM_P;
108 
109 	if (ls > 0) {
110 	  GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
111 	  aa = sa;
112 	} else {
113 	  aa = sb2;
114 	}
115 
116 	for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
117 	  min_jj = ls + min_l - jjs;
118 	  if (min_jj > GEMM_P) min_jj = GEMM_P;
119 
120 	  GEMM_OTCOPY(bk,  min_jj,  a + (jjs  + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
121 
122 	  SYRK_KERNEL(min_i, min_jj, bk, dp1,
123 		      aa,
124 		      sb2 + (jjs - ls) * bk * COMPSIZE,
125 		      a + (jjs * lda) * COMPSIZE, lda, - jjs);
126 	}
127 
128 	if (ls + REAL_GEMM_R >= j ) {
129 	  for (ks = 0; ks < bk; ks += GEMM_P) {
130 	    min_k = bk - ks;
131 	    if (min_k > GEMM_P) min_k = GEMM_P;
132 
133 	    TRMM_KERNEL(min_i, min_k, bk, dp1,
134 #ifdef COMPLEX
135 			ZERO,
136 #endif
137 			aa,
138 			sb + ks * bk * COMPSIZE,
139 			a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
140 	  }
141 	}
142 
143 	for(is = min_i; is < ls + min_l ; is += GEMM_P){
144 	  min_i = ls + min_l - is;
145 	  if (min_i > GEMM_P) min_i = GEMM_P;
146 
147 	  if (is < ls) {
148 	    GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
149 	    aa = sa;
150 	  } else {
151 	    aa = sb2 + (is - ls) * bk * COMPSIZE;
152 	  }
153 
154 	  SYRK_KERNEL(min_i, min_l, bk, dp1,
155 		      aa,
156 		      sb2,
157 		      a + (is + ls * lda) * COMPSIZE, lda, is - ls);
158 
159 	  if (ls + REAL_GEMM_R >= j ) {
160 	    for (ks = 0; ks < bk; ks += GEMM_P) {
161 	      min_k = bk - ks;
162 	      if (min_k > GEMM_P) min_k = GEMM_P;
163 
164 	      TRMM_KERNEL(min_i, min_k, bk, dp1,
165 #ifdef COMPLEX
166 			  ZERO,
167 #endif
168 			  aa,
169 			  sb + ks * bk * COMPSIZE,
170 			  a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
171 	    }
172 	  }
173 	}
174 #else
175 	if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
176 	min_i = ls + min_l;
177 	if (min_i > GEMM_P) min_i = GEMM_P;
178 
179 	GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
180 
181 	for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
182 	  min_jj = ls + min_l - jjs;
183 	  if (min_jj > GEMM_P) min_jj = GEMM_P;
184 
185 	  GEMM_OTCOPY(bk,  min_jj,  a + (jjs  + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
186 
187 	  SYRK_KERNEL(min_i, min_jj, bk, dp1,
188 		      sa,
189 		      sb2 + (jjs - ls) * bk * COMPSIZE,
190 		      a + (jjs * lda) * COMPSIZE, lda, - jjs);
191 	}
192 
193 	if (ls + REAL_GEMM_R >= j ) {
194 	  for (ks = 0; ks < bk; ks += GEMM_P) {
195 	    min_k = bk - ks;
196 	    if (min_k > GEMM_P) min_k = GEMM_P;
197 
198 	    TRMM_KERNEL(min_i, min_k, bk, dp1,
199 #ifdef COMPLEX
200 			ZERO,
201 #endif
202 			sa,
203 			sb + ks * bk * COMPSIZE,
204 			a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
205 	  }
206 	}
207 
208 	for(is = min_i; is < ls + min_l ; is += GEMM_P){
209 	  min_i = ls + min_l - is;
210 	  if (min_i > GEMM_P) min_i = GEMM_P;
211 
212 	  GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
213 
214 	  SYRK_KERNEL(min_i, min_l, bk, dp1,
215 		      sa,
216 		      sb2,
217 		      a + (is + ls * lda) * COMPSIZE, lda, is - ls);
218 
219 	  if (ls + REAL_GEMM_R >= j ) {
220 	    for (ks = 0; ks < bk; ks += GEMM_P) {
221 	      min_k = bk - ks;
222 	      if (min_k > GEMM_P) min_k = GEMM_P;
223 
224 	      TRMM_KERNEL(min_i, min_k, bk, dp1,
225 #ifdef COMPLEX
226 			  ZERO,
227 #endif
228 			  sa,
229 			  sb + ks * bk * COMPSIZE,
230 			  a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
231 	    }
232 	  }
233 	}
234 #endif
235       } /* end of ls */
236     }
237 
238     if (!range_n) {
239       range_N[0] = j;
240       range_N[1] = j + bk;
241     } else {
242       range_N[0] = range_n[0] + j;
243       range_N[1] = range_n[0] + j + bk;
244     }
245 
246     CNAME(args, NULL, range_N, sa, sb, 0);
247 
248   }
249 
250   return 0;
251 }
252