1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS for embedded optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 
38 #if defined(LA_REFERENCE) | defined(TESTING_MODE)
39 
40 
41 // dgemm nn
GEMM_NN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)42 void GEMM_NN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
43 	{
44 	if(m<=0 | n<=0)
45 		return;
46 
47 	// invalidate stored inverse diagonal of result matrix
48 	sD->use_dA = 0;
49 
50 	int ii, jj, kk;
51 	REAL
52 		c_00, c_01,
53 		c_10, c_11;
54 	int lda = sA->m;
55 	int ldb = sB->m;
56 	int ldc = sC->m;
57 	int ldd = sD->m;
58 	REAL *pA = sA->pA + ai + aj*lda;
59 	REAL *pB = sB->pA + bi + bj*ldb;
60 	REAL *pC = sC->pA + ci + cj*ldc;
61 	REAL *pD = sD->pA + di + dj*ldd;
62 	jj = 0;
63 	for(; jj<n-1; jj+=2)
64 		{
65 		ii = 0;
66 		for(; ii<m-1; ii+=2)
67 			{
68 			c_00 = 0.0; ;
69 			c_10 = 0.0; ;
70 			c_01 = 0.0; ;
71 			c_11 = 0.0; ;
72 			for(kk=0; kk<k; kk++)
73 				{
74 				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
75 				c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
76 				c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
77 				c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
78 				}
79 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
80 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
81 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
82 			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
83 			}
84 		for(; ii<m; ii++)
85 			{
86 			c_00 = 0.0; ;
87 			c_01 = 0.0; ;
88 			for(kk=0; kk<k; kk++)
89 				{
90 				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
91 				c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
92 				}
93 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
94 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
95 			}
96 		}
97 	for(; jj<n; jj++)
98 		{
99 		ii = 0;
100 		for(; ii<m-1; ii+=2)
101 			{
102 			c_00 = 0.0; ;
103 			c_10 = 0.0; ;
104 			for(kk=0; kk<k; kk++)
105 				{
106 				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
107 				c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
108 				}
109 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
110 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
111 			}
112 		for(; ii<m; ii++)
113 			{
114 			c_00 = 0.0; ;
115 			for(kk=0; kk<k; kk++)
116 				{
117 				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
118 				}
119 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
120 			}
121 		}
122 	return;
123 	}
124 
125 
126 
127 // dgemm nt
GEMM_NT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)128 void GEMM_NT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
129 	{
130 	if(m<=0 | n<=0)
131 		return;
132 
133 	// invalidate stored inverse diagonal of result matrix
134 	sD->use_dA = 0;
135 
136 	int ii, jj, kk;
137 	REAL
138 		c_00, c_01,
139 		c_10, c_11;
140 	int lda = sA->m;
141 	int ldb = sB->m;
142 	int ldc = sC->m;
143 	int ldd = sD->m;
144 	REAL *pA = sA->pA + ai + aj*lda;
145 	REAL *pB = sB->pA + bi + bj*ldb;
146 	REAL *pC = sC->pA + ci + cj*ldc;
147 	REAL *pD = sD->pA + di + dj*ldd;
148 	jj = 0;
149 	for(; jj<n-1; jj+=2)
150 		{
151 		ii = 0;
152 		for(; ii<m-1; ii+=2)
153 			{
154 			c_00 = 0.0;
155 			c_10 = 0.0;
156 			c_01 = 0.0;
157 			c_11 = 0.0;
158 			for(kk=0; kk<k; kk++)
159 				{
160 				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
161 				c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
162 				c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
163 				c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
164 				}
165 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
166 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
167 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
168 			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
169 			}
170 		for(; ii<m; ii++)
171 			{
172 			c_00 = 0.0;
173 			c_01 = 0.0;
174 			for(kk=0; kk<k; kk++)
175 				{
176 				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
177 				c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
178 				}
179 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
180 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
181 			}
182 		}
183 	for(; jj<n; jj++)
184 		{
185 		ii = 0;
186 		for(; ii<m-1; ii+=2)
187 			{
188 			c_00 = 0.0;
189 			c_10 = 0.0;
190 			for(kk=0; kk<k; kk++)
191 				{
192 				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
193 				c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
194 				}
195 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
196 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
197 			}
198 		for(; ii<m; ii++)
199 			{
200 			c_00 = 0.0;
201 			for(kk=0; kk<k; kk++)
202 				{
203 				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
204 				}
205 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
206 			}
207 		}
208 	return;
209 	}
210 
211 
212 
213 // dgemm tn
GEMM_TN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)214 void GEMM_TN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
215 	{
216 	if(m<=0 | n<=0)
217 		return;
218 
219 	// invalidate stored inverse diagonal of result matrix
220 	sD->use_dA = 0;
221 
222 	int ii, jj, kk;
223 	REAL
224 		c_00, c_01,
225 		c_10, c_11;
226 	int lda = sA->m;
227 	int ldb = sB->m;
228 	int ldc = sC->m;
229 	int ldd = sD->m;
230 	REAL *pA = sA->pA + ai + aj*lda;
231 	REAL *pB = sB->pA + bi + bj*ldb;
232 	REAL *pC = sC->pA + ci + cj*ldc;
233 	REAL *pD = sD->pA + di + dj*ldd;
234 	jj = 0;
235 	for(; jj<n-1; jj+=2)
236 		{
237 		ii = 0;
238 		for(; ii<m-1; ii+=2)
239 			{
240 			c_00 = 0.0; ;
241 			c_10 = 0.0; ;
242 			c_01 = 0.0; ;
243 			c_11 = 0.0; ;
244 			for(kk=0; kk<k; kk++)
245 				{
246 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
247 				c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
248 				c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
249 				c_11 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+1)];
250 				}
251 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
252 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
253 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
254 			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
255 			}
256 		for(; ii<m; ii++)
257 			{
258 			c_00 = 0.0; ;
259 			c_01 = 0.0; ;
260 			for(kk=0; kk<k; kk++)
261 				{
262 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
263 				c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
264 				}
265 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
266 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
267 			}
268 		}
269 	for(; jj<n; jj++)
270 		{
271 		ii = 0;
272 		for(; ii<m-1; ii+=2)
273 			{
274 			c_00 = 0.0; ;
275 			c_10 = 0.0; ;
276 			for(kk=0; kk<k; kk++)
277 				{
278 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
279 				c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
280 				}
281 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
282 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
283 			}
284 		for(; ii<m; ii++)
285 			{
286 			c_00 = 0.0; ;
287 			for(kk=0; kk<k; kk++)
288 				{
289 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
290 				}
291 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
292 			}
293 		}
294 	return;
295 	}
296 
297 
298 
299 // dgemm tt
GEMM_TT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)300 void GEMM_TT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
301 	{
302 	if(m<=0 | n<=0)
303 		return;
304 
305 	// invalidate stored inverse diagonal of result matrix
306 	sD->use_dA = 0;
307 
308 	int ii, jj, kk;
309 	REAL
310 		c_00, c_01,
311 		c_10, c_11;
312 	int lda = sA->m;
313 	int ldb = sB->m;
314 	int ldc = sC->m;
315 	int ldd = sD->m;
316 	REAL *pA = sA->pA + ai + aj*lda;
317 	REAL *pB = sB->pA + bi + bj*ldb;
318 	REAL *pC = sC->pA + ci + cj*ldc;
319 	REAL *pD = sD->pA + di + dj*ldd;
320 	jj = 0;
321 	for(; jj<n-1; jj+=2)
322 		{
323 		ii = 0;
324 		for(; ii<m-1; ii+=2)
325 			{
326 			c_00 = 0.0; ;
327 			c_10 = 0.0; ;
328 			c_01 = 0.0; ;
329 			c_11 = 0.0; ;
330 			for(kk=0; kk<k; kk++)
331 				{
332 				c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
333 				c_10 += pA[kk+lda*(ii+1)] * pB[(jj+0)+ldb*kk];
334 				c_01 += pA[kk+lda*(ii+0)] * pB[(jj+1)+ldb*kk];
335 				c_11 += pA[kk+lda*(ii+1)] * pB[(jj+1)+ldb*kk];
336 				}
337 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
338 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
339 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
340 			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
341 			}
342 		for(; ii<m; ii++)
343 			{
344 			c_00 = 0.0; ;
345 			c_01 = 0.0; ;
346 			for(kk=0; kk<k; kk++)
347 				{
348 				c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
349 				c_01 += pA[kk+lda*(ii+0)] * pB[(jj+1)+ldb*kk];
350 				}
351 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
352 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
353 			}
354 		}
355 	for(; jj<n; jj++)
356 		{
357 		ii = 0;
358 		for(; ii<m-1; ii+=2)
359 			{
360 			c_00 = 0.0; ;
361 			c_10 = 0.0; ;
362 			for(kk=0; kk<k; kk++)
363 				{
364 				c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
365 				c_10 += pA[kk+lda*(ii+1)] * pB[(jj+0)+ldb*kk];
366 				}
367 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
368 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
369 			}
370 		for(; ii<m; ii++)
371 			{
372 			c_00 = 0.0; ;
373 			for(kk=0; kk<k; kk++)
374 				{
375 				c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
376 				}
377 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
378 			}
379 		}
380 	return;
381 	}
382 
383 
384 
385 // dtrsm_left_lower_nottransposed_notunit
TRSM_LLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)386 void TRSM_LLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
387 	{
388 	if(m<=0 | n<=0)
389 		return;
390 
391 	// invalidate stored inverse diagonal of result matrix
392 	sD->use_dA = 0;
393 
394 	int ii, jj, kk;
395 	REAL
396 		d_00, d_01,
397 		d_10, d_11;
398 	int lda = sA->m;
399 	int ldb = sB->m;
400 	int ldd = sD->m;
401 	REAL *pA = sA->pA + ai + aj*lda; // triangular
402 	REAL *pB = sB->pA + bi + bj*ldb;
403 	REAL *pD = sD->pA + di + dj*ldd;
404 	REAL *dA = sA->dA;
405 	if(ai==0 & aj==0)
406 		{
407 		if(sA->use_dA<n)
408 			{
409 			for(ii=0; ii<n; ii++)
410 				dA[ii] = 1.0 / pA[ii+lda*ii];
411 			sA->use_dA = n;
412 			}
413 		}
414 	else
415 		{
416 		for(ii=0; ii<n; ii++)
417 			dA[ii] = 1.0 / pA[ii+lda*ii];
418 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
419 		}
420 	// solve
421 	jj = 0;
422 	for(; jj<n-1; jj+=2)
423 		{
424 		ii = 0;
425 		for(; ii<m-1; ii+=2)
426 			{
427 			d_00 = alpha * pB[ii+0+ldb*(jj+0)];
428 			d_10 = alpha * pB[ii+1+ldb*(jj+0)];
429 			d_01 = alpha * pB[ii+0+ldb*(jj+1)];
430 			d_11 = alpha * pB[ii+1+ldb*(jj+1)];
431 			kk = 0;
432 			for(; kk<ii; kk++)
433 				{
434 				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
435 				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
436 				d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
437 				d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
438 				}
439 			d_00 *= dA[ii+0];
440 			d_01 *= dA[ii+0];
441 			d_10 -= pA[ii+1+lda*ii] * d_00;
442 			d_11 -= pA[ii+1+lda*ii] * d_01;
443 			d_10 *= dA[ii+1];
444 			d_11 *= dA[ii+1];
445 			pD[ii+0+ldd*(jj+0)] = d_00;
446 			pD[ii+1+ldd*(jj+0)] = d_10;
447 			pD[ii+0+ldd*(jj+1)] = d_01;
448 			pD[ii+1+ldd*(jj+1)] = d_11;
449 			}
450 		for(; ii<m; ii++)
451 			{
452 			d_00 = alpha * pB[ii+ldb*(jj+0)];
453 			d_01 = alpha * pB[ii+ldb*(jj+1)];
454 			for(kk=0; kk<ii; kk++)
455 				{
456 				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
457 				d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
458 				}
459 			d_00 *= dA[ii+0];
460 			d_01 *= dA[ii+0];
461 			pD[ii+ldd*(jj+0)] = d_00;
462 			pD[ii+ldd*(jj+1)] = d_01;
463 			}
464 		}
465 	for(; jj<n; jj++)
466 		{
467 		ii = 0;
468 		for(; ii<m-1; ii+=2)
469 			{
470 			d_00 = alpha * pB[ii+0+ldb*jj];
471 			d_10 = alpha * pB[ii+1+ldb*jj];
472 			for(kk=0; kk<ii; kk++)
473 				{
474 				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
475 				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
476 				}
477 			d_00 *= dA[ii+0];
478 			d_10 -= pA[ii+1+lda*kk] * d_00;
479 			d_10 *= dA[ii+1];
480 			pD[ii+0+ldd*jj] = d_00;
481 			pD[ii+1+ldd*jj] = d_10;
482 			}
483 		for(; ii<m; ii++)
484 			{
485 			d_00 = alpha * pB[ii+ldb*jj];
486 			for(kk=0; kk<ii; kk++)
487 				{
488 				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
489 				}
490 			d_00 *= dA[ii+0];
491 			pD[ii+ldd*jj] = d_00;
492 			}
493 		}
494 	return;
495 	}
496 
497 
498 
499 // dtrsm_left_lower_nottransposed_unit
TRSM_LLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)500 void TRSM_LLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
501 	{
502 	if(m<=0 | n<=0)
503 		return;
504 
505 	// invalidate stored inverse diagonal of result matrix
506 	sD->use_dA = 0;
507 
508 	int ii, jj, kk;
509 	REAL
510 		d_00, d_01,
511 		d_10, d_11;
512 	int lda = sA->m;
513 	int ldb = sB->m;
514 	int ldd = sD->m;
515 	REAL *pA = sA->pA + ai + aj*lda; // triangular
516 	REAL *pB = sB->pA + bi + bj*ldb;
517 	REAL *pD = sD->pA + di + dj*ldd;
518 	// solve
519 	jj = 0;
520 	for(; jj<n-1; jj+=2)
521 		{
522 		ii = 0;
523 		for(; ii<m-1; ii+=2)
524 			{
525 			d_00 = alpha * pB[ii+0+ldb*(jj+0)];
526 			d_10 = alpha * pB[ii+1+ldb*(jj+0)];
527 			d_01 = alpha * pB[ii+0+ldb*(jj+1)];
528 			d_11 = alpha * pB[ii+1+ldb*(jj+1)];
529 			kk = 0;
530 			for(; kk<ii; kk++)
531 				{
532 				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
533 				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
534 				d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
535 				d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
536 				}
537 			d_10 -= pA[ii+1+lda*kk] * d_00;
538 			d_11 -= pA[ii+1+lda*kk] * d_01;
539 			pD[ii+0+ldd*(jj+0)] = d_00;
540 			pD[ii+1+ldd*(jj+0)] = d_10;
541 			pD[ii+0+ldd*(jj+1)] = d_01;
542 			pD[ii+1+ldd*(jj+1)] = d_11;
543 			}
544 		for(; ii<m; ii++)
545 			{
546 			d_00 = alpha * pB[ii+ldb*(jj+0)];
547 			d_01 = alpha * pB[ii+ldb*(jj+1)];
548 			for(kk=0; kk<ii; kk++)
549 				{
550 				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
551 				d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
552 				}
553 			pD[ii+ldd*(jj+0)] = d_00;
554 			pD[ii+ldd*(jj+1)] = d_01;
555 			}
556 		}
557 	for(; jj<n; jj++)
558 		{
559 		ii = 0;
560 		for(; ii<m-1; ii+=2)
561 			{
562 			d_00 = alpha * pB[ii+0+ldb*jj];
563 			d_10 = alpha * pB[ii+1+ldb*jj];
564 			for(kk=0; kk<ii; kk++)
565 				{
566 				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
567 				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
568 				}
569 			d_10 -= pA[ii+1+lda*kk] * d_00;
570 			pD[ii+0+ldd*jj] = d_00;
571 			pD[ii+1+ldd*jj] = d_10;
572 			}
573 		for(; ii<m; ii++)
574 			{
575 			d_00 = alpha * pB[ii+ldb*jj];
576 			for(kk=0; kk<ii; kk++)
577 				{
578 				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
579 				}
580 			pD[ii+ldd*jj] = d_00;
581 			}
582 		}
583 	return;
584 	}
585 
586 
587 
588 // dtrsm_lltn
TRSM_LLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)589 void TRSM_LLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
590 	{
591 	if(m<=0 | n<=0)
592 		return;
593 
594 	// invalidate stored inverse diagonal of result matrix
595 	sD->use_dA = 0;
596 
597 	int ii, jj, kk, id;
598 	REAL
599 		d_00, d_01,
600 		d_10, d_11;
601 	int lda = sA->m;
602 	int ldb = sB->m;
603 	int ldd = sD->m;
604 	REAL *pA = sA->pA + ai + aj*lda; // triangular
605 	REAL *pB = sB->pA + bi + bj*ldb;
606 	REAL *pD = sD->pA + di + dj*ldd;
607 	REAL *dA = sA->dA;
608 	if(ai==0 & aj==0)
609 		{
610 		if (sA->use_dA<m)
611 			{
612 			// invert diagonal of pA
613 			for(ii=0; ii<m; ii++)
614 				dA[ii] = 1.0/pA[ii+lda*ii];
615 			// use only now
616 			sA->use_dA = m;
617 			}
618 		}
619 	else
620 		{
621 		for(ii=0; ii<m; ii++)
622 			dA[ii] = 1.0 / pA[ii+lda*ii];
623 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
624 		}
625 	// solve
626 	jj = 0;
627 	for(; jj<n-1; jj+=2)
628 		{
629 		ii = 0;
630 		for(; ii<m-1; ii+=2)
631 			{
632 			id = m-ii-2;
633 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
634 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
635 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
636 			d_11 = alpha * pB[id+1+ldb*(jj+1)];
637 			kk = id+2;
638 			for(; kk<m; kk++)
639 				{
640 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
641 				d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
642 				d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
643 				d_11 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+1)];
644 				}
645 			d_10 *= dA[id+1];
646 			d_11 *= dA[id+1];
647 			d_00 -= pA[id+1+lda*(id+0)] * d_10;
648 			d_01 -= pA[id+1+lda*(id+0)] * d_11;
649 			d_00 *= dA[id+0];
650 			d_01 *= dA[id+0];
651 			pD[id+0+ldd*(jj+0)] = d_00;
652 			pD[id+1+ldd*(jj+0)] = d_10;
653 			pD[id+0+ldd*(jj+1)] = d_01;
654 			pD[id+1+ldd*(jj+1)] = d_11;
655 			}
656 		for(; ii<m; ii++)
657 			{
658 			id = m-ii-1;
659 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
660 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
661 			kk = id+1;
662 			for(; kk<m; kk++)
663 				{
664 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
665 				d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
666 				}
667 			d_00 *= dA[id+0];
668 			d_01 *= dA[id+0];
669 			pD[id+0+ldd*(jj+0)] = d_00;
670 			pD[id+0+ldd*(jj+1)] = d_01;
671 			}
672 		}
673 	for(; jj<n; jj++)
674 		{
675 		ii = 0;
676 		for(; ii<m-1; ii+=2)
677 			{
678 			id = m-ii-2;
679 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
680 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
681 			kk = id+2;
682 			for(; kk<m; kk++)
683 				{
684 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
685 				d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
686 				}
687 			d_10 *= dA[id+1];
688 			d_00 -= pA[id+1+lda*(id+0)] * d_10;
689 			d_00 *= dA[id+0];
690 			pD[id+0+ldd*(jj+0)] = d_00;
691 			pD[id+1+ldd*(jj+0)] = d_10;
692 			}
693 		for(; ii<m; ii++)
694 			{
695 			id = m-ii-1;
696 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
697 			kk = id+1;
698 			for(; kk<m; kk++)
699 				{
700 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
701 				}
702 			d_00 *= dA[id+0];
703 			pD[id+0+ldd*(jj+0)] = d_00;
704 			}
705 		}
706 	return;
707 	}
708 
709 
710 
711 // dtrsm_lltu
TRSM_LLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)712 void TRSM_LLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
713 	{
714 	if(m<=0 | n<=0)
715 		return;
716 
717 	// invalidate stored inverse diagonal of result matrix
718 	sD->use_dA = 0;
719 
720 	int ii, jj, kk, id;
721 	REAL
722 		d_00, d_01,
723 		d_10, d_11;
724 	int lda = sA->m;
725 	int ldb = sB->m;
726 	int ldd = sD->m;
727 	REAL *pA = sA->pA + ai + aj*lda; // triangular
728 	REAL *pB = sB->pA + bi + bj*ldb;
729 	REAL *pD = sD->pA + di + dj*ldd;
730 	// solve
731 	jj = 0;
732 	for(; jj<n-1; jj+=2)
733 		{
734 		ii = 0;
735 		for(; ii<m-1; ii+=2)
736 			{
737 			id = m-ii-2;
738 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
739 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
740 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
741 			d_11 = alpha * pB[id+1+ldb*(jj+1)];
742 			kk = id+2;
743 
744 			for(; kk<m; kk++)
745 				{
746 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
747 				d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
748 				d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
749 				d_11 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+1)];
750 				}
751 			d_00 -= pA[id+1+lda*(id+0)] * d_10;
752 			d_01 -= pA[id+1+lda*(id+0)] * d_11;
753 			pD[id+0+ldd*(jj+0)] = d_00;
754 			pD[id+1+ldd*(jj+0)] = d_10;
755 			pD[id+0+ldd*(jj+1)] = d_01;
756 			pD[id+1+ldd*(jj+1)] = d_11;
757 			}
758 		for(; ii<m; ii++)
759 			{
760 			id = m-ii-1;
761 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
762 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
763 			kk = id+1;
764 			for(; kk<m; kk++)
765 				{
766 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
767 				d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
768 				}
769 			pD[id+0+ldd*(jj+0)] = d_00;
770 			pD[id+0+ldd*(jj+1)] = d_01;
771 			}
772 		}
773 	for(; jj<n; jj++)
774 		{
775 		ii = 0;
776 		for(; ii<m-1; ii+=2)
777 			{
778 			id = m-ii-2;
779 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
780 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
781 			kk = id+2;
782 			for(; kk<m; kk++)
783 				{
784 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
785 				d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
786 				}
787 			d_00 -= pA[id+1+lda*(id+0)] * d_10;
788 			pD[id+0+ldd*(jj+0)] = d_00;
789 			pD[id+1+ldd*(jj+0)] = d_10;
790 			}
791 		for(; ii<m; ii++)
792 			{
793 			id = m-ii-1;
794 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
795 			kk = id+1;
796 			for(; kk<m; kk++)
797 				{
798 				d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
799 				}
800 			pD[id+0+ldd*(jj+0)] = d_00;
801 			}
802 		}
803 	return;
804 	}
805 
806 
807 
808 // dtrsm_left_upper_nottransposed_notunit
TRSM_LUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)809 void TRSM_LUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
810 	{
811 	if(m<=0 | n<=0)
812 		return;
813 
814 	// invalidate stored inverse diagonal of result matrix
815 	sD->use_dA = 0;
816 
817 	int ii, jj, kk, id;
818 	REAL
819 		d_00, d_01,
820 		d_10, d_11;
821 	int lda = sA->m;
822 	int ldb = sB->m;
823 	int ldd = sD->m;
824 	REAL *pA = sA->pA + ai + aj*lda; // triangular
825 	REAL *pB = sB->pA + bi + bj*ldb;
826 	REAL *pD = sD->pA + di + dj*ldd;
827 	REAL *dA = sA->dA;
828 	if(ai==0 & aj==0)
829 		{
830 		if (sA->use_dA<m)
831 			{
832 			// invert diagonal of pA
833 			for(ii=0; ii<m; ii++)
834 				dA[ii] = 1.0/pA[ii+lda*ii];
835 			// use only now
836 			sA->use_dA = m;
837 			}
838 		}
839 	else
840 		{
841 		for(ii=0; ii<m; ii++)
842 			dA[ii] = 1.0 / pA[ii+lda*ii];
843 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
844 		}
845 	// solve
846 	jj = 0;
847 	for(; jj<n-1; jj+=2)
848 		{
849 		ii = 0;
850 		for(; ii<m-1; ii+=2)
851 			{
852 			id = m-ii-2;
853 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
854 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
855 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
856 			d_11 = alpha * pB[id+1+ldb*(jj+1)];
857 			kk = id+2;
858 			for(; kk<m; kk++)
859 				{
860 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
861 				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
862 				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
863 				d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
864 				}
865 			d_10 *= dA[id+1];
866 			d_11 *= dA[id+1];
867 			d_00 -= pA[id+0+lda*(id+1)] * d_10;
868 			d_01 -= pA[id+0+lda*(id+1)] * d_11;
869 			d_00 *= dA[id+0];
870 			d_01 *= dA[id+0];
871 			pD[id+0+ldd*(jj+0)] = d_00;
872 			pD[id+1+ldd*(jj+0)] = d_10;
873 			pD[id+0+ldd*(jj+1)] = d_01;
874 			pD[id+1+ldd*(jj+1)] = d_11;
875 			}
876 		for(; ii<m; ii++)
877 			{
878 			id = m-ii-1;
879 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
880 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
881 			kk = id+1;
882 			for(; kk<m; kk++)
883 				{
884 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
885 				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
886 				}
887 			d_00 *= dA[id+0];
888 			d_01 *= dA[id+0];
889 			pD[id+0+ldd*(jj+0)] = d_00;
890 			pD[id+0+ldd*(jj+1)] = d_01;
891 			}
892 		}
893 	for(; jj<n; jj++)
894 		{
895 		ii = 0;
896 		for(; ii<m-1; ii+=2)
897 			{
898 			id = m-ii-2;
899 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
900 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
901 			kk = id+2;
902 			for(; kk<m; kk++)
903 				{
904 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
905 				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
906 				}
907 			d_10 *= dA[id+1];
908 			d_00 -= pA[id+0+lda*(id+1)] * d_10;
909 			d_00 *= dA[id+0];
910 			pD[id+0+ldd*(jj+0)] = d_00;
911 			pD[id+1+ldd*(jj+0)] = d_10;
912 			}
913 		for(; ii<m; ii++)
914 			{
915 			id = m-ii-1;
916 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
917 			kk = id+1;
918 			for(; kk<m; kk++)
919 				{
920 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
921 				}
922 			d_00 *= dA[id+0];
923 			pD[id+0+ldd*(jj+0)] = d_00;
924 			}
925 		}
926 	return;
927 	}
928 
929 
930 
931 // dtrsm_lunu
TRSM_LUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)932 void TRSM_LUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
933 	{
934 	if(m<=0 | n<=0)
935 		return;
936 
937 	// invalidate stored inverse diagonal of result matrix
938 	sD->use_dA = 0;
939 
940 	int ii, jj, kk, id;
941 	REAL
942 		d_00, d_01,
943 		d_10, d_11;
944 	int lda = sA->m;
945 	int ldb = sB->m;
946 	int ldd = sD->m;
947 	REAL *pA = sA->pA + ai + aj*lda; // triangular
948 	REAL *pB = sB->pA + bi + bj*ldb;
949 	REAL *pD = sD->pA + di + dj*ldd;
950 	// solve
951 	jj = 0;
952 	for(; jj<n-1; jj+=2)
953 		{
954 		ii = 0;
955 		for(; ii<m-1; ii+=2)
956 			{
957 			id = m-ii-2;
958 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
959 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
960 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
961 			d_11 = alpha * pB[id+1+ldb*(jj+1)];
962 			kk = id+2;
963 			for(; kk<m; kk++)
964 				{
965 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
966 				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
967 				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
968 				d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
969 				}
970 			d_00 -= pA[id+0+lda*(id+1)] * d_10;
971 			d_01 -= pA[id+0+lda*(id+1)] * d_11;
972 			pD[id+0+ldd*(jj+0)] = d_00;
973 			pD[id+1+ldd*(jj+0)] = d_10;
974 			pD[id+0+ldd*(jj+1)] = d_01;
975 			pD[id+1+ldd*(jj+1)] = d_11;
976 			}
977 		for(; ii<m; ii++)
978 			{
979 			id = m-ii-1;
980 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
981 			d_01 = alpha * pB[id+0+ldb*(jj+1)];
982 			kk = id+1;
983 			for(; kk<m; kk++)
984 				{
985 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
986 				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
987 				}
988 			pD[id+0+ldd*(jj+0)] = d_00;
989 			pD[id+0+ldd*(jj+1)] = d_01;
990 			}
991 		}
992 	for(; jj<n; jj++)
993 		{
994 		ii = 0;
995 		for(; ii<m-1; ii+=2)
996 			{
997 			id = m-ii-2;
998 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
999 			d_10 = alpha * pB[id+1+ldb*(jj+0)];
1000 			kk = id+2;
1001 			for(; kk<m; kk++)
1002 				{
1003 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
1004 				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
1005 				}
1006 			d_00 -= pA[id+0+lda*(id+1)] * d_10;
1007 			pD[id+0+ldd*(jj+0)] = d_00;
1008 			pD[id+1+ldd*(jj+0)] = d_10;
1009 			}
1010 		for(; ii<m; ii++)
1011 			{
1012 			id = m-ii-1;
1013 			d_00 = alpha * pB[id+0+ldb*(jj+0)];
1014 			kk = id+1;
1015 			for(; kk<m; kk++)
1016 				{
1017 				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
1018 				}
1019 			pD[id+0+ldd*(jj+0)] = d_00;
1020 			}
1021 		}
1022 	return;
1023 	return;
1024 	}
1025 
1026 
1027 
1028 // dtrsm_lutn
TRSM_LUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1029 void TRSM_LUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1030 	{
1031 	if(m<=0 | n<=0)
1032 		return;
1033 
1034 	// invalidate stored inverse diagonal of result matrix
1035 	sD->use_dA = 0;
1036 
1037 	int ii, jj, kk;
1038 	REAL
1039 		d_00, d_01,
1040 		d_10, d_11;
1041 	int lda = sA->m;
1042 	int ldb = sB->m;
1043 	int ldd = sD->m;
1044 	REAL *pA = sA->pA + ai + aj*lda; // triangular
1045 	REAL *pB = sB->pA + bi + bj*ldb;
1046 	REAL *pD = sD->pA + di + dj*ldd;
1047 	REAL *dA = sA->dA;
1048 	if(ai==0 & aj==0)
1049 		{
1050 		if(sA->use_dA<n)
1051 			{
1052 			for(ii=0; ii<n; ii++)
1053 				dA[ii] = 1.0 / pA[ii+lda*ii];
1054 			sA->use_dA = n;
1055 			}
1056 		}
1057 	else
1058 		{
1059 		for(ii=0; ii<n; ii++)
1060 			dA[ii] = 1.0 / pA[ii+lda*ii];
1061 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
1062 		}
1063 	// solve
1064 	jj = 0;
1065 	for(; jj<n-1; jj+=2)
1066 		{
1067 		ii = 0;
1068 		for(; ii<m-1; ii+=2)
1069 			{
1070 			d_00 = alpha * pB[ii+0+ldb*(jj+0)];
1071 			d_10 = alpha * pB[ii+1+ldb*(jj+0)];
1072 			d_01 = alpha * pB[ii+0+ldb*(jj+1)];
1073 			d_11 = alpha * pB[ii+1+ldb*(jj+1)];
1074 			kk = 0;
1075 			for(; kk<ii; kk++)
1076 				{
1077 				d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+0)];
1078 				d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+0)];
1079 				d_01 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+1)];
1080 				d_11 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+1)];
1081 				}
1082 			d_00 *= dA[ii+0];
1083 			d_01 *= dA[ii+0];
1084 			d_10 -= pA[ii+lda*(ii+1)] * d_00;
1085 			d_11 -= pA[ii+lda*(ii+1)] * d_01;
1086 			d_10 *= dA[ii+1];
1087 			d_11 *= dA[ii+1];
1088 			pD[ii+0+ldd*(jj+0)] = d_00;
1089 			pD[ii+1+ldd*(jj+0)] = d_10;
1090 			pD[ii+0+ldd*(jj+1)] = d_01;
1091 			pD[ii+1+ldd*(jj+1)] = d_11;
1092 			}
1093 		for(; ii<m; ii++)
1094 			{
1095 			d_00 = alpha * pB[ii+ldb*(jj+0)];
1096 			d_01 = alpha * pB[ii+ldb*(jj+1)];
1097 			for(kk=0; kk<ii; kk++)
1098 				{
1099 				d_00 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+0)];
1100 				d_01 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+1)];
1101 				}
1102 			d_00 *= dA[ii+0];
1103 			d_01 *= dA[ii+0];
1104 			pD[ii+ldd*(jj+0)] = d_00;
1105 			pD[ii+ldd*(jj+1)] = d_01;
1106 			}
1107 		}
1108 	for(; jj<n; jj++)
1109 		{
1110 		ii = 0;
1111 		for(; ii<m-1; ii+=2)
1112 			{
1113 			d_00 = alpha * pB[ii+0+ldb*jj];
1114 			d_10 = alpha * pB[ii+1+ldb*jj];
1115 			for(kk=0; kk<ii; kk++)
1116 				{
1117 				d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*jj];
1118 				d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*jj];
1119 				}
1120 			d_00 *= dA[ii+0];
1121 			d_10 -= pA[kk+lda*(ii+1)] * d_00;
1122 			d_10 *= dA[ii+1];
1123 			pD[ii+0+ldd*jj] = d_00;
1124 			pD[ii+1+ldd*jj] = d_10;
1125 			}
1126 		for(; ii<m; ii++)
1127 			{
1128 			d_00 = alpha * pB[ii+ldb*jj];
1129 			for(kk=0; kk<ii; kk++)
1130 				{
1131 				d_00 -= pA[kk+lda*ii] * pD[kk+ldd*jj];
1132 				}
1133 			d_00 *= dA[ii+0];
1134 			pD[ii+ldd*jj] = d_00;
1135 			}
1136 		}
1137 	return;
1138 	}
1139 
1140 
1141 
1142 // dtrsm_lutu
TRSM_LUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1143 void TRSM_LUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1144 	{
1145 	if(m<=0 | n<=0)
1146 		return;
1147 
1148 	// invalidate stored inverse diagonal of result matrix
1149 	sD->use_dA = 0;
1150 
1151 	int ii, jj, kk;
1152 	REAL
1153 		d_00, d_01,
1154 		d_10, d_11;
1155 	int lda = sA->m;
1156 	int ldb = sB->m;
1157 	int ldd = sD->m;
1158 	REAL *pA = sA->pA + ai + aj*lda; // triangular
1159 	REAL *pB = sB->pA + bi + bj*ldb;
1160 	REAL *pD = sD->pA + di + dj*ldd;
1161 	// solve
1162 	jj = 0;
1163 	for(; jj<n-1; jj+=2)
1164 		{
1165 		ii = 0;
1166 		for(; ii<m-1; ii+=2)
1167 			{
1168 			d_00 = alpha * pB[ii+0+ldb*(jj+0)];
1169 			d_10 = alpha * pB[ii+1+ldb*(jj+0)];
1170 			d_01 = alpha * pB[ii+0+ldb*(jj+1)];
1171 			d_11 = alpha * pB[ii+1+ldb*(jj+1)];
1172 			kk = 0;
1173 			for(; kk<ii; kk++)
1174 				{
1175 				d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+0)];
1176 				d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+0)];
1177 				d_01 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+1)];
1178 				d_11 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+1)];
1179 				}
1180 			d_10 -= pA[ii+lda*(ii+1)] * d_00;
1181 			d_11 -= pA[ii+lda*(ii+1)] * d_01;
1182 			pD[ii+0+ldd*(jj+0)] = d_00;
1183 			pD[ii+1+ldd*(jj+0)] = d_10;
1184 			pD[ii+0+ldd*(jj+1)] = d_01;
1185 			pD[ii+1+ldd*(jj+1)] = d_11;
1186 			}
1187 		for(; ii<m; ii++)
1188 			{
1189 			d_00 = alpha * pB[ii+ldb*(jj+0)];
1190 			d_01 = alpha * pB[ii+ldb*(jj+1)];
1191 			for(kk=0; kk<ii; kk++)
1192 				{
1193 				d_00 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+0)];
1194 				d_01 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+1)];
1195 				}
1196 			pD[ii+ldd*(jj+0)] = d_00;
1197 			pD[ii+ldd*(jj+1)] = d_01;
1198 			}
1199 		}
1200 	for(; jj<n; jj++)
1201 		{
1202 		ii = 0;
1203 		for(; ii<m-1; ii+=2)
1204 			{
1205 			d_00 = alpha * pB[ii+0+ldb*jj];
1206 			d_10 = alpha * pB[ii+1+ldb*jj];
1207 			for(kk=0; kk<ii; kk++)
1208 				{
1209 				d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*jj];
1210 				d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*jj];
1211 				}
1212 			d_10 -= pA[kk+lda*(ii+1)] * d_00;
1213 			pD[ii+0+ldd*jj] = d_00;
1214 			pD[ii+1+ldd*jj] = d_10;
1215 			}
1216 		for(; ii<m; ii++)
1217 			{
1218 			d_00 = alpha * pB[ii+ldb*jj];
1219 			for(kk=0; kk<ii; kk++)
1220 				{
1221 				d_00 -= pA[kk+lda*ii] * pD[kk+ldd*jj];
1222 				}
1223 			pD[ii+ldd*jj] = d_00;
1224 			}
1225 		}
1226 	return;
1227 	}
1228 
1229 
1230 
1231 // dtrsm_rlnn
TRSM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1232 void TRSM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1233 	{
1234 	if(m<=0 | n<=0)
1235 		return;
1236 
1237 	// invalidate stored inverse diagonal of result matrix
1238 	sD->use_dA = 0;
1239 
1240 	char cl = 'l';
1241 	char cn = 'n';
1242 	char cr = 'r';
1243 	char ct = 't';
1244 	char cu = 'u';
1245 	int i1 = 1;
1246 	int ii, jj, kk, id;
1247 	int lda = sA->m;
1248 	int ldb = sB->m;
1249 	int ldd = sD->m;
1250 	REAL
1251 		d_00, d_01,
1252 		d_10, d_11;
1253 	REAL *pA = sA->pA+ai+aj*lda;
1254 	REAL *pB = sB->pA+bi+bj*ldb;
1255 	REAL *pD = sD->pA+di+dj*ldd;
1256 	REAL *dA = sA->dA;
1257 	if(ai==0 & aj==0)
1258 		{
1259 		if (sA->use_dA<n)
1260 			{
1261 			// invert diagonal of pA
1262 			for(ii=0; ii<n; ii++)
1263 				dA[ii] = 1.0/pA[ii+lda*ii];
1264 			// use only now
1265 			sA->use_dA = n;
1266 			}
1267 		}
1268 	else
1269 		{
1270 		for(ii=0; ii<n; ii++)
1271 			dA[ii] = 1.0 / pA[ii+lda*ii];
1272 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
1273 		}
1274 	// solve
1275 	jj = 0;
1276 	for(; jj<n-1; jj+=2)
1277 		{
1278 		ii = 0;
1279 		id = n-jj-2;
1280 		for(; ii<m-1; ii+=2)
1281 			{
1282 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1283 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1284 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1285 			d_11 = alpha * pB[ii+1+ldb*(id+1)];
1286 			kk = id+2;
1287 			for(; kk<n; kk++)
1288 				{
1289 				d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1290 				d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1291 				d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1292 				d_11 -= pA[kk+0+lda*(id+1)] * pD[ii+1+ldd*(kk+0)];
1293 				}
1294 			d_01 *= dA[id+1];
1295 			d_11 *= dA[id+1];
1296 			d_00 -= pA[id+1+lda*(id+0)] * d_01;
1297 			d_10 -= pA[id+1+lda*(id+0)] * d_11;
1298 			d_00 *= dA[id+0];
1299 			d_10 *= dA[id+0];
1300 			pD[ii+0+ldd*(id+0)] = d_00;
1301 			pD[ii+1+ldd*(id+0)] = d_10;
1302 			pD[ii+0+ldd*(id+1)] = d_01;
1303 			pD[ii+1+ldd*(id+1)] = d_11;
1304 			}
1305 		for(; ii<m; ii++)
1306 			{
1307 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1308 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1309 			kk = id+2;
1310 			for(; kk<n; kk++)
1311 				{
1312 				d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1313 				d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1314 				}
1315 			d_01 *= dA[id+1];
1316 			d_00 -= pA[id+1+lda*(id+0)] * d_01;
1317 			d_00 *= dA[id+0];
1318 			pD[ii+0+ldd*(id+0)] = d_00;
1319 			pD[ii+0+ldd*(id+1)] = d_01;
1320 			}
1321 		}
1322 	for(; jj<n; jj++)
1323 		{
1324 		ii = 0;
1325 		id = n-jj-1;
1326 		for(; ii<m-1; ii+=2)
1327 			{
1328 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1329 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1330 			kk = id+1;
1331 			for(; kk<n; kk++)
1332 				{
1333 				d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1334 				d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1335 				}
1336 			d_00 *= dA[id+0];
1337 			d_10 *= dA[id+0];
1338 			pD[ii+0+ldd*(id+0)] = d_00;
1339 			pD[ii+1+ldd*(id+0)] = d_10;
1340 			}
1341 		for(; ii<m; ii++)
1342 			{
1343 			d_00 = alpha * pB[ii+ldb*(id)];
1344 			kk = id+1;
1345 			for(; kk<n; kk++)
1346 				d_00 -= pA[kk+lda*(id)] * pD[ii+ldd*(kk)];
1347 			pD[ii+ldd*(id)] = d_00 * dA[id];
1348 			}
1349 		}
1350 	return;
1351 	}
1352 
1353 
1354 
1355 // dtrsm_rlnu
TRSM_RLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1356 void TRSM_RLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1357 	{
1358 	if(m<=0 | n<=0)
1359 		return;
1360 
1361 	// invalidate stored inverse diagonal of result matrix
1362 	sD->use_dA = 0;
1363 
1364 	char cl = 'l';
1365 	char cn = 'n';
1366 	char cr = 'r';
1367 	char ct = 't';
1368 	char cu = 'u';
1369 	int i1 = 1;
1370 	int ii, jj, kk, id;
1371 	int lda = sA->m;
1372 	int ldb = sB->m;
1373 	int ldd = sD->m;
1374 	REAL
1375 		d_00, d_01,
1376 		d_10, d_11;
1377 	REAL *pA = sA->pA+ai+aj*lda;
1378 	REAL *pB = sB->pA+bi+bj*ldb;
1379 	REAL *pD = sD->pA+di+dj*ldd;
1380 	// solve
1381 	jj = 0;
1382 	for(; jj<n-1; jj+=2)
1383 		{
1384 		ii = 0;
1385 		id = n-jj-2;
1386 		for(; ii<m-1; ii+=2)
1387 			{
1388 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1389 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1390 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1391 			d_11 = alpha * pB[ii+1+ldb*(id+1)];
1392 			kk = id+2;
1393 			for(; kk<n; kk++)
1394 				{
1395 				d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1396 				d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1397 				d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1398 				d_11 -= pA[kk+0+lda*(id+1)] * pD[ii+1+ldd*(kk+0)];
1399 				}
1400 			d_00 -= pA[id+1+lda*(id+0)] * d_01;
1401 			d_10 -= pA[id+1+lda*(id+0)] * d_11;
1402 			pD[ii+0+ldd*(id+0)] = d_00;
1403 			pD[ii+1+ldd*(id+0)] = d_10;
1404 			pD[ii+0+ldd*(id+1)] = d_01;
1405 			pD[ii+1+ldd*(id+1)] = d_11;
1406 			}
1407 		for(; ii<m; ii++)
1408 			{
1409 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1410 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1411 			kk = id+2;
1412 			for(; kk<n; kk++)
1413 				{
1414 				d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1415 				d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1416 				}
1417 			d_00 -= pA[id+1+lda*(id+0)] * d_01;
1418 			pD[ii+0+ldd*(id+0)] = d_00;
1419 			pD[ii+0+ldd*(id+1)] = d_01;
1420 			}
1421 		}
1422 	for(; jj<n; jj++)
1423 		{
1424 		ii = 0;
1425 		id = n-jj-1;
1426 		for(; ii<m-1; ii+=2)
1427 			{
1428 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1429 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1430 			kk = id+1;
1431 			for(; kk<n; kk++)
1432 				{
1433 				d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1434 				d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1435 				}
1436 			pD[ii+0+ldd*(id+0)] = d_00;
1437 			pD[ii+1+ldd*(id+0)] = d_10;
1438 			}
1439 		for(; ii<m; ii++)
1440 			{
1441 			d_00 = alpha * pB[ii+ldb*(id)];
1442 			kk = id+1;
1443 			for(; kk<n; kk++)
1444 				d_00 -= pA[kk+lda*(id)] * pD[ii+ldd*(kk)];
1445 			}
1446 		}
1447 	return;
1448 	}
1449 
1450 
1451 
1452 // dtrsm_right_lower_transposed_not-unit
TRSM_RLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1453 void TRSM_RLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1454 	{
1455 	if(m<=0 | n<=0)
1456 		return;
1457 
1458 	// invalidate stored inverse diagonal of result matrix
1459 	sD->use_dA = 0;
1460 
1461 	int ii, jj, kk;
1462 	int lda = sA->m;
1463 	int ldb = sB->m;
1464 	int ldd = sD->m;
1465 	REAL *pA = sA->pA + ai + aj*lda;
1466 	REAL *pB = sB->pA + bi + bj*ldb;
1467 	REAL *pD = sD->pA + di + dj*ldd;
1468 	REAL *dA = sA->dA;
1469 	if(ai==0 & aj==0)
1470 		{
1471 		if(sA->use_dA<n)
1472 			{
1473 			for(ii=0; ii<n; ii++)
1474 				dA[ii] = 1.0 / pA[ii+lda*ii];
1475 			sA->use_dA = n;
1476 			}
1477 		}
1478 	else
1479 		{
1480 		for(ii=0; ii<n; ii++)
1481 			dA[ii] = 1.0 / pA[ii+lda*ii];
1482 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
1483 		}
1484 	REAL
1485 		f_00_inv,
1486 		f_10, f_11_inv,
1487 		c_00, c_01,
1488 		c_10, c_11;
1489 	// solve
1490 	jj = 0;
1491 	for(; jj<n-1; jj+=2)
1492 		{
1493 		f_00_inv = dA[jj+0];
1494 		f_10 = pA[jj+1+lda*(jj+0)];
1495 		f_11_inv = dA[jj+1];
1496 		ii = 0;
1497 		for(; ii<m-1; ii+=2)
1498 			{
1499 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1500 			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1501 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1502 			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1503 			for(kk=0; kk<jj; kk++)
1504 				{
1505 				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1506 				c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
1507 				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1508 				c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
1509 				}
1510 			c_00 *= f_00_inv;
1511 			c_10 *= f_00_inv;
1512 			pD[ii+0+ldd*(jj+0)] = c_00;
1513 			pD[ii+1+ldd*(jj+0)] = c_10;
1514 			c_01 -= c_00 * f_10;
1515 			c_11 -= c_10 * f_10;
1516 			c_01 *= f_11_inv;
1517 			c_11 *= f_11_inv;
1518 			pD[ii+0+ldd*(jj+1)] = c_01;
1519 			pD[ii+1+ldd*(jj+1)] = c_11;
1520 			}
1521 		for(; ii<m; ii++)
1522 			{
1523 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1524 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1525 			for(kk=0; kk<jj; kk++)
1526 				{
1527 				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1528 				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1529 				}
1530 			c_00 *= f_00_inv;
1531 			pD[ii+0+ldd*(jj+0)] = c_00;
1532 			c_01 -= c_00 * f_10;
1533 			c_01 *= f_11_inv;
1534 			pD[ii+0+ldd*(jj+1)] = c_01;
1535 			}
1536 		}
1537 	for(; jj<n; jj++)
1538 		{
1539 		// factorize diagonal
1540 		f_00_inv = dA[jj];
1541 		for(ii=0; ii<m; ii++)
1542 			{
1543 			c_00 = alpha * pB[ii+ldb*jj];
1544 			for(kk=0; kk<jj; kk++)
1545 				{
1546 				c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
1547 				}
1548 			c_00 *= f_00_inv;
1549 			pD[ii+ldd*jj] = c_00;
1550 			}
1551 		}
1552 	return;
1553 	}
1554 
1555 
1556 
1557 // dtrsm_right_lower_transposed_unit
TRSM_RLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1558 void TRSM_RLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1559 	{
1560 	if(m<=0 | n<=0)
1561 		return;
1562 
1563 	// invalidate stored inverse diagonal of result matrix
1564 	sD->use_dA = 0;
1565 
1566 	int ii, jj, kk;
1567 	int lda = sA->m;
1568 	int ldb = sB->m;
1569 	int ldd = sD->m;
1570 	REAL *pA = sA->pA + ai + aj*lda;
1571 	REAL *pB = sB->pA + bi + bj*ldb;
1572 	REAL *pD = sD->pA + di + dj*ldd;
1573 	REAL
1574 		f_10,
1575 		c_00, c_01,
1576 		c_10, c_11;
1577 	// solve
1578 	jj = 0;
1579 	for(; jj<n-1; jj+=2)
1580 		{
1581 		f_10 = pA[jj+1+lda*(jj+0)];
1582 		ii = 0;
1583 		for(; ii<m-1; ii+=2)
1584 			{
1585 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1586 			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1587 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1588 			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1589 			for(kk=0; kk<jj; kk++)
1590 				{
1591 				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1592 				c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
1593 				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1594 				c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
1595 				}
1596 			pD[ii+0+ldd*(jj+0)] = c_00;
1597 			pD[ii+1+ldd*(jj+0)] = c_10;
1598 			c_01 -= c_00 * f_10;
1599 			c_11 -= c_10 * f_10;
1600 			pD[ii+0+ldd*(jj+1)] = c_01;
1601 			pD[ii+1+ldd*(jj+1)] = c_11;
1602 			}
1603 		for(; ii<m; ii++)
1604 			{
1605 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1606 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1607 			for(kk=0; kk<jj; kk++)
1608 				{
1609 				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1610 				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1611 				}
1612 			pD[ii+0+ldd*(jj+0)] = c_00;
1613 			c_01 -= c_00 * f_10;
1614 			pD[ii+0+ldd*(jj+1)] = c_01;
1615 			}
1616 		}
1617 	for(; jj<n; jj++)
1618 		{
1619 		for(ii=0; ii<m; ii++)
1620 			{
1621 			c_00 = alpha * pB[ii+ldb*jj];
1622 			for(kk=0; kk<jj; kk++)
1623 				{
1624 				c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
1625 				}
1626 			pD[ii+ldd*jj] = c_00;
1627 			}
1628 		}
1629 	return;
1630 	}
1631 
1632 
1633 
1634 // dtrsm_runn
TRSM_RUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1635 void TRSM_RUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1636 	{
1637 	if(m<=0 | n<=0)
1638 		return;
1639 
1640 	// invalidate stored inverse diagonal of result matrix
1641 	sD->use_dA = 0;
1642 
1643 	int ii, jj, kk;
1644 	int lda = sA->m;
1645 	int ldb = sB->m;
1646 	int ldd = sD->m;
1647 	REAL *pA = sA->pA + ai + aj*lda;
1648 	REAL *pB = sB->pA + bi + bj*ldb;
1649 	REAL *pD = sD->pA + di + dj*ldd;
1650 	REAL *dA = sA->dA;
1651 	if(ai==0 & aj==0)
1652 		{
1653 		if(sA->use_dA<n)
1654 			{
1655 			for(ii=0; ii<n; ii++)
1656 				dA[ii] = 1.0 / pA[ii+lda*ii];
1657 			sA->use_dA = n;
1658 			}
1659 		}
1660 	else
1661 		{
1662 		for(ii=0; ii<n; ii++)
1663 			dA[ii] = 1.0 / pA[ii+lda*ii];
1664 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
1665 		}
1666 	REAL
1667 		f_00_inv,
1668 		f_10, f_11_inv,
1669 		c_00, c_01,
1670 		c_10, c_11;
1671 	// solve
1672 	jj = 0;
1673 	for(; jj<n-1; jj+=2)
1674 		{
1675 		f_00_inv = dA[jj+0];
1676 		f_10 = pA[jj+0+lda*(jj+1)];
1677 		f_11_inv = dA[jj+1];
1678 		ii = 0;
1679 		for(; ii<m-1; ii+=2)
1680 			{
1681 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1682 			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1683 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1684 			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1685 			for(kk=0; kk<jj; kk++)
1686 				{
1687 				c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1688 				c_10 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+0)];
1689 				c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1690 				c_11 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+1)];
1691 				}
1692 			c_00 *= f_00_inv;
1693 			c_10 *= f_00_inv;
1694 			pD[ii+0+ldd*(jj+0)] = c_00;
1695 			pD[ii+1+ldd*(jj+0)] = c_10;
1696 			c_01 -= c_00 * f_10;
1697 			c_11 -= c_10 * f_10;
1698 			c_01 *= f_11_inv;
1699 			c_11 *= f_11_inv;
1700 			pD[ii+0+ldd*(jj+1)] = c_01;
1701 			pD[ii+1+ldd*(jj+1)] = c_11;
1702 			}
1703 		for(; ii<m; ii++)
1704 			{
1705 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1706 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1707 			for(kk=0; kk<jj; kk++)
1708 				{
1709 				c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1710 				c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1711 				}
1712 			c_00 *= f_00_inv;
1713 			pD[ii+0+ldd*(jj+0)] = c_00;
1714 			c_01 -= c_00 * f_10;
1715 			c_01 *= f_11_inv;
1716 			pD[ii+0+ldd*(jj+1)] = c_01;
1717 			}
1718 		}
1719 	for(; jj<n; jj++)
1720 		{
1721 		// factorize diagonal
1722 		f_00_inv = dA[jj];
1723 		for(ii=0; ii<m; ii++)
1724 			{
1725 			c_00 = alpha * pB[ii+ldb*jj];
1726 			for(kk=0; kk<jj; kk++)
1727 				{
1728 				c_00 -= pD[ii+ldd*kk] * pA[kk+lda*jj];
1729 				}
1730 			c_00 *= f_00_inv;
1731 			pD[ii+ldd*jj] = c_00;
1732 			}
1733 		}
1734 	return;
1735 	}
1736 
1737 
1738 
1739 // dtrsm_runu
TRSM_RUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1740 void TRSM_RUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1741 	{
1742 	if(m<=0 | n<=0)
1743 		return;
1744 
1745 	// invalidate stored inverse diagonal of result matrix
1746 	sD->use_dA = 0;
1747 
1748 	int ii, jj, kk;
1749 	int lda = sA->m;
1750 	int ldb = sB->m;
1751 	int ldd = sD->m;
1752 	REAL *pA = sA->pA + ai + aj*lda;
1753 	REAL *pB = sB->pA + bi + bj*ldb;
1754 	REAL *pD = sD->pA + di + dj*ldd;
1755 	REAL
1756 		f_10,
1757 		c_00, c_01,
1758 		c_10, c_11;
1759 	// solve
1760 	jj = 0;
1761 	for(; jj<n-1; jj+=2)
1762 		{
1763 		f_10 = pA[jj+0+lda*(jj+1)];
1764 		ii = 0;
1765 		for(; ii<m-1; ii+=2)
1766 			{
1767 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1768 			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1769 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1770 			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1771 			for(kk=0; kk<jj; kk++)
1772 				{
1773 				c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1774 				c_10 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+0)];
1775 				c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1776 				c_11 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+1)];
1777 				}
1778 			pD[ii+0+ldd*(jj+0)] = c_00;
1779 			pD[ii+1+ldd*(jj+0)] = c_10;
1780 			c_01 -= c_00 * f_10;
1781 			c_11 -= c_10 * f_10;
1782 			pD[ii+0+ldd*(jj+1)] = c_01;
1783 			pD[ii+1+ldd*(jj+1)] = c_11;
1784 			}
1785 		for(; ii<m; ii++)
1786 			{
1787 			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1788 			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1789 			for(kk=0; kk<jj; kk++)
1790 				{
1791 				c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1792 				c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1793 				}
1794 			pD[ii+0+ldd*(jj+0)] = c_00;
1795 			c_01 -= c_00 * f_10;
1796 			pD[ii+0+ldd*(jj+1)] = c_01;
1797 			}
1798 		}
1799 	for(; jj<n; jj++)
1800 		{
1801 		// factorize diagonal
1802 		for(ii=0; ii<m; ii++)
1803 			{
1804 			c_00 = alpha * pB[ii+ldb*jj];
1805 			for(kk=0; kk<jj; kk++)
1806 				{
1807 				c_00 -= pD[ii+ldd*kk] * pA[kk+lda*jj];
1808 				}
1809 			pD[ii+ldd*jj] = c_00;
1810 			}
1811 		}
1812 	return;
1813 	}
1814 
1815 
1816 
1817 // dtrsm_right_upper_transposed_notunit
TRSM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1818 void TRSM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1819 	{
1820 	if(m<=0 | n<=0)
1821 		return;
1822 
1823 	// invalidate stored inverse diagonal of result matrix
1824 	sD->use_dA = 0;
1825 
1826 	char cl = 'l';
1827 	char cn = 'n';
1828 	char cr = 'r';
1829 	char ct = 't';
1830 	char cu = 'u';
1831 	int i1 = 1;
1832 	int ii, jj, kk, id;
1833 	int lda = sA->m;
1834 	int ldb = sB->m;
1835 	int ldd = sD->m;
1836 	REAL
1837 		d_00, d_01,
1838 		d_10, d_11;
1839 	REAL *pA = sA->pA+ai+aj*lda;
1840 	REAL *pB = sB->pA+bi+bj*ldb;
1841 	REAL *pD = sD->pA+di+dj*ldd;
1842 	REAL *dA = sA->dA;
1843 	if(ai==0 & aj==0)
1844 		{
1845 		if (sA->use_dA<n)
1846 			{
1847 			// invert diagonal of pA
1848 			for(ii=0; ii<n; ii++)
1849 				dA[ii] = 1.0/pA[ii+lda*ii];
1850 			// use only now
1851 			sA->use_dA = n;
1852 			}
1853 		}
1854 	else
1855 		{
1856 		for(ii=0; ii<n; ii++)
1857 			dA[ii] = 1.0 / pA[ii+lda*ii];
1858 		sA->use_dA = 0; // nonzero offset makes diagonal dirty
1859 		}
1860 	// solve
1861 	jj = 0;
1862 	for(; jj<n-1; jj+=2)
1863 		{
1864 		ii = 0;
1865 		id = n-jj-2;
1866 		for(; ii<m-1; ii+=2)
1867 			{
1868 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1869 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1870 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1871 			d_11 = alpha * pB[ii+1+ldb*(id+1)];
1872 			kk = id+2;
1873 			for(; kk<n; kk++)
1874 				{
1875 				d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1876 				d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1877 				d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1878 				d_11 -= pA[id+1+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1879 				}
1880 			d_01 *= dA[id+1];
1881 			d_11 *= dA[id+1];
1882 			d_00 -= pA[id+0+lda*(id+1)] * d_01;
1883 			d_10 -= pA[id+0+lda*(id+1)] * d_11;
1884 			d_00 *= dA[id+0];
1885 			d_10 *= dA[id+0];
1886 			pD[ii+0+ldd*(id+0)] = d_00;
1887 			pD[ii+1+ldd*(id+0)] = d_10;
1888 			pD[ii+0+ldd*(id+1)] = d_01;
1889 			pD[ii+1+ldd*(id+1)] = d_11;
1890 			}
1891 		for(; ii<m; ii++)
1892 			{
1893 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1894 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1895 			kk = id+2;
1896 			for(; kk<n; kk++)
1897 				{
1898 				d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1899 				d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1900 				}
1901 			d_01 *= dA[id+1];
1902 			d_00 -= pA[id+0+lda*(id+1)] * d_01;
1903 			d_00 *= dA[id+0];
1904 			pD[ii+0+ldd*(id+0)] = d_00;
1905 			pD[ii+0+ldd*(id+1)] = d_01;
1906 			}
1907 		}
1908 	for(; jj<n; jj++)
1909 		{
1910 		ii = 0;
1911 		id = n-jj-1;
1912 		for(; ii<m-1; ii+=2)
1913 			{
1914 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1915 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1916 			kk = id+1;
1917 			for(; kk<n; kk++)
1918 				{
1919 				d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1920 				d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1921 				}
1922 			d_00 *= dA[id+0];
1923 			d_10 *= dA[id+0];
1924 			pD[ii+0+ldd*(id+0)] = d_00;
1925 			pD[ii+1+ldd*(id+0)] = d_10;
1926 			}
1927 		for(; ii<m; ii++)
1928 			{
1929 			d_00 = alpha * pB[ii+ldb*(id)];
1930 			kk = id+1;
1931 			for(; kk<n; kk++)
1932 				d_00 -= pA[id+lda*(kk)] * pD[ii+ldd*(kk)];
1933 			pD[ii+ldd*(id)] = d_00 * dA[id];
1934 			}
1935 		}
1936 	return;
1937 	}
1938 
1939 
1940 
1941 // dtrsm_rutu
TRSM_RUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1942 void TRSM_RUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1943 	{
1944 	if(m<=0 | n<=0)
1945 		return;
1946 
1947 	// invalidate stored inverse diagonal of result matrix
1948 	sD->use_dA = 0;
1949 
1950 	char cl = 'l';
1951 	char cn = 'n';
1952 	char cr = 'r';
1953 	char ct = 't';
1954 	char cu = 'u';
1955 	int i1 = 1;
1956 	int ii, jj, kk, id;
1957 	int lda = sA->m;
1958 	int ldb = sB->m;
1959 	int ldd = sD->m;
1960 	REAL
1961 		d_00, d_01,
1962 		d_10, d_11;
1963 	REAL *pA = sA->pA+ai+aj*lda;
1964 	REAL *pB = sB->pA+bi+bj*ldb;
1965 	REAL *pD = sD->pA+di+dj*ldd;
1966 	// solve
1967 	jj = 0;
1968 	for(; jj<n-1; jj+=2)
1969 		{
1970 		ii = 0;
1971 		id = n-jj-2;
1972 		for(; ii<m-1; ii+=2)
1973 			{
1974 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1975 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
1976 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1977 			d_11 = alpha * pB[ii+1+ldb*(id+1)];
1978 			kk = id+2;
1979 			for(; kk<n; kk++)
1980 				{
1981 				d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1982 				d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1983 				d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1984 				d_11 -= pA[id+1+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1985 				}
1986 			d_00 -= pA[id+0+lda*(id+1)] * d_01;
1987 			d_10 -= pA[id+0+lda*(id+1)] * d_11;
1988 			pD[ii+0+ldd*(id+0)] = d_00;
1989 			pD[ii+1+ldd*(id+0)] = d_10;
1990 			pD[ii+0+ldd*(id+1)] = d_01;
1991 			pD[ii+1+ldd*(id+1)] = d_11;
1992 			}
1993 		for(; ii<m; ii++)
1994 			{
1995 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
1996 			d_01 = alpha * pB[ii+0+ldb*(id+1)];
1997 			kk = id+2;
1998 			for(; kk<n; kk++)
1999 				{
2000 				d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
2001 				d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
2002 				}
2003 			d_00 -= pA[id+0+lda*(id+1)] * d_01;
2004 			pD[ii+0+ldd*(id+0)] = d_00;
2005 			pD[ii+0+ldd*(id+1)] = d_01;
2006 			}
2007 		}
2008 	for(; jj<n; jj++)
2009 		{
2010 		ii = 0;
2011 		id = n-jj-1;
2012 		for(; ii<m-1; ii+=2)
2013 			{
2014 			d_00 = alpha * pB[ii+0+ldb*(id+0)];
2015 			d_10 = alpha * pB[ii+1+ldb*(id+0)];
2016 			kk = id+1;
2017 			for(; kk<n; kk++)
2018 				{
2019 				d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
2020 				d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
2021 				}
2022 			pD[ii+0+ldd*(id+0)] = d_00;
2023 			pD[ii+1+ldd*(id+0)] = d_10;
2024 			}
2025 		for(; ii<m; ii++)
2026 			{
2027 			d_00 = alpha * pB[ii+ldb*(id)];
2028 			kk = id+1;
2029 			for(; kk<n; kk++)
2030 				d_00 -= pA[id+lda*(kk)] * pD[ii+ldd*(kk)];
2031 			}
2032 		}
2033 	return;
2034 	}
2035 
2036 
2037 
2038 // dtrmm_right_upper_transposed_notunit (A triangular !!!)
TRMM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2039 void TRMM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2040 	{
2041 	if(m<=0 | n<=0)
2042 		return;
2043 
2044 	// invalidate stored inverse diagonal of result matrix
2045 	sD->use_dA = 0;
2046 
2047 	int ii, jj, kk;
2048 	REAL
2049 		c_00, c_01,
2050 		c_10, c_11;
2051 	int lda = sA->m;
2052 	int ldb = sB->m;
2053 	int ldd = sD->m;
2054 	REAL *pA = sA->pA + ai + aj*lda;
2055 	REAL *pB = sB->pA + bi + bj*ldb;
2056 	REAL *pD = sD->pA + di + dj*ldd;
2057 	jj = 0;
2058 	for(; jj<n-1; jj+=2)
2059 		{
2060 		ii = 0;
2061 		for(; ii<m-1; ii+=2)
2062 			{
2063 			c_00 = 0.0;
2064 			c_10 = 0.0;
2065 			c_01 = 0.0;
2066 			c_11 = 0.0;
2067 			kk = jj;
2068 			c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2069 			c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
2070 			kk++;
2071 			for(; kk<n; kk++)
2072 				{
2073 				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2074 				c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
2075 				c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
2076 				c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
2077 				}
2078 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2079 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2080 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2081 			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
2082 			}
2083 		for(; ii<m; ii++)
2084 			{
2085 			c_00 = 0.0;
2086 			c_01 = 0.0;
2087 			kk = jj;
2088 			c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2089 			kk++;
2090 			for(; kk<n; kk++)
2091 				{
2092 				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2093 				c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
2094 				}
2095 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2096 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2097 			}
2098 		}
2099 	for(; jj<n; jj++)
2100 		{
2101 		ii = 0;
2102 		for(; ii<m-1; ii+=2)
2103 			{
2104 			c_00 = 0.0;
2105 			c_10 = 0.0;
2106 			for(kk=jj; kk<n; kk++)
2107 				{
2108 				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2109 				c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
2110 				}
2111 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2112 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2113 			}
2114 		for(; ii<m; ii++)
2115 			{
2116 			c_00 = 0.0;
2117 			for(kk=jj; kk<n; kk++)
2118 				{
2119 				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2120 				}
2121 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2122 			}
2123 		}
2124 	return;
2125 	}
2126 
2127 
2128 
2129 // dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
TRMM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2130 void TRMM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2131 	{
2132 	if(m<=0 | n<=0)
2133 		return;
2134 
2135 	// invalidate stored inverse diagonal of result matrix
2136 	sD->use_dA = 0;
2137 
2138 	int ii, jj, kk;
2139 	REAL
2140 		c_00, c_01,
2141 		c_10, c_11;
2142 	int lda = sA->m;
2143 	int ldb = sB->m;
2144 	int ldd = sD->m;
2145 	REAL *pA = sA->pA + ai + aj*lda;
2146 	REAL *pB = sB->pA + bi + bj*ldb;
2147 	REAL *pD = sD->pA + di + dj*ldd;
2148 	jj = 0;
2149 	for(; jj<n-1; jj+=2)
2150 		{
2151 		ii = 0;
2152 		for(; ii<m-1; ii+=2)
2153 			{
2154 			c_00 = 0.0; ;
2155 			c_10 = 0.0; ;
2156 			c_01 = 0.0; ;
2157 			c_11 = 0.0; ;
2158 			kk = jj;
2159 			c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2160 			c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
2161 			kk++;
2162 			for(; kk<n; kk++)
2163 				{
2164 				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2165 				c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
2166 				c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
2167 				c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
2168 				}
2169 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2170 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2171 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2172 			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
2173 			}
2174 		for(; ii<m; ii++)
2175 			{
2176 			c_00 = 0.0; ;
2177 			c_01 = 0.0; ;
2178 			kk = jj;
2179 			c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2180 			kk++;
2181 			for(; kk<n; kk++)
2182 				{
2183 				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2184 				c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
2185 				}
2186 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2187 			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2188 			}
2189 		}
2190 	for(; jj<n; jj++)
2191 		{
2192 		ii = 0;
2193 		for(; ii<m-1; ii+=2)
2194 			{
2195 			c_00 = 0.0; ;
2196 			c_10 = 0.0; ;
2197 			for(kk=jj; kk<n; kk++)
2198 				{
2199 				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2200 				c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
2201 				}
2202 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2203 			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2204 			}
2205 		for(; ii<m; ii++)
2206 			{
2207 			c_00 = 0.0; ;
2208 			for(kk=jj; kk<n; kk++)
2209 				{
2210 				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2211 				}
2212 			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2213 			}
2214 		}
2215 	return;
2216 	}
2217 
2218 
2219 
2220 // dsyrk_lower not-transposed
SYRK_LN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2221 void SYRK_LN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2222 	{
2223 	if(m<=0)
2224 		return;
2225 
2226 	// invalidate stored inverse diagonal of result matrix
2227 	sD->use_dA = 0;
2228 
2229 	int ii, jj, kk;
2230 	REAL
2231 		c_00, c_01,
2232 		c_10, c_11;
2233 	int lda = sA->m;
2234 	int ldb = sB->m;
2235 	int ldc = sC->m;
2236 	int ldd = sD->m;
2237 	REAL *pA = sA->pA + ai + aj*lda;
2238 	REAL *pB = sB->pA + bi + bj*ldb;
2239 	REAL *pC = sC->pA + ci + cj*ldc;
2240 	REAL *pD = sD->pA + di + dj*ldd;
2241 	jj = 0;
2242 	for(; jj<m-1; jj+=2)
2243 		{
2244 		// diagonal
2245 		c_00 = 0.0;
2246 		c_10 = 0.0;
2247 		c_11 = 0.0;
2248 		for(kk=0; kk<k; kk++)
2249 			{
2250 			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2251 			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
2252 			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
2253 			}
2254 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2255 		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
2256 		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2257 		// lower
2258 		ii = jj+2;
2259 		for(; ii<m-1; ii+=2)
2260 			{
2261 			c_00 = 0.0;
2262 			c_10 = 0.0;
2263 			c_01 = 0.0;
2264 			c_11 = 0.0;
2265 			for(kk=0; kk<k; kk++)
2266 				{
2267 				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2268 				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2269 				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2270 				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
2271 				}
2272 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2273 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2274 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2275 			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2276 			}
2277 		for(; ii<m; ii++)
2278 			{
2279 			c_00 = 0.0;
2280 			c_01 = 0.0;
2281 			for(kk=0; kk<k; kk++)
2282 				{
2283 				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2284 				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2285 				}
2286 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2287 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2288 			}
2289 		}
2290 	if(jj<m)
2291 		{
2292 		// diagonal
2293 		c_00 = 0.0;
2294 		for(kk=0; kk<k; kk++)
2295 			{
2296 			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
2297 			}
2298 		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
2299 		}
2300 	return;
2301 	}
2302 
2303 
2304 
2305 // dsyrk_lower not-transposed
SYRK_LN_MN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2306 void SYRK_LN_MN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2307 	{
2308 	if(m<=0 | n<=0)
2309 		return;
2310 
2311 	// invalidate stored inverse diagonal of result matrix
2312 	sD->use_dA = 0;
2313 
2314 	int ii, jj, kk;
2315 	REAL
2316 		c_00, c_01,
2317 		c_10, c_11;
2318 	int lda = sA->m;
2319 	int ldb = sB->m;
2320 	int ldc = sC->m;
2321 	int ldd = sD->m;
2322 	REAL *pA = sA->pA + ai + aj*lda;
2323 	REAL *pB = sB->pA + bi + bj*ldb;
2324 	REAL *pC = sC->pA + ci + cj*ldc;
2325 	REAL *pD = sD->pA + di + dj*ldd;
2326 	jj = 0;
2327 	for(; jj<n-1; jj+=2)
2328 		{
2329 		// diagonal
2330 		c_00 = 0.0;
2331 		c_10 = 0.0;
2332 		c_11 = 0.0;
2333 		for(kk=0; kk<k; kk++)
2334 			{
2335 			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2336 			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
2337 			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
2338 			}
2339 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2340 		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
2341 		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2342 		// lower
2343 		ii = jj+2;
2344 		for(; ii<m-1; ii+=2)
2345 			{
2346 			c_00 = 0.0;
2347 			c_10 = 0.0;
2348 			c_01 = 0.0;
2349 			c_11 = 0.0;
2350 			for(kk=0; kk<k; kk++)
2351 				{
2352 				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2353 				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2354 				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2355 				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
2356 				}
2357 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2358 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2359 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2360 			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2361 			}
2362 		for(; ii<m; ii++)
2363 			{
2364 			c_00 = 0.0;
2365 			c_01 = 0.0;
2366 			for(kk=0; kk<k; kk++)
2367 				{
2368 				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2369 				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2370 				}
2371 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2372 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2373 			}
2374 		}
2375 	for(; jj<n; jj++)
2376 		{
2377 		// diagonal
2378 		c_00 = 0.0;
2379 		for(kk=0; kk<k; kk++)
2380 			{
2381 			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
2382 			}
2383 		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
2384 		// lower
2385 		for(ii=jj+1; ii<m; ii++)
2386 			{
2387 			c_00 = 0.0;
2388 			for(kk=0; kk<k; kk++)
2389 				{
2390 				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
2391 				}
2392 			pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
2393 			}
2394 		}
2395 	return;
2396 	}
2397 
2398 
2399 
2400 // dsyrk_lower transposed
SYRK_LT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2401 void SYRK_LT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2402 	{
2403 	if(m<=0)
2404 		return;
2405 
2406 	// invalidate stored inverse diagonal of result matrix
2407 	sD->use_dA = 0;
2408 
2409 	int ii, jj, kk;
2410 	REAL
2411 		c_00, c_01,
2412 		c_10, c_11;
2413 	int lda = sA->m;
2414 	int ldb = sB->m;
2415 	int ldc = sC->m;
2416 	int ldd = sD->m;
2417 	REAL *pA = sA->pA + ai + aj*lda;
2418 	REAL *pB = sB->pA + bi + bj*ldb;
2419 	REAL *pC = sC->pA + ci + cj*ldc;
2420 	REAL *pD = sD->pA + di + dj*ldd;
2421 	jj = 0;
2422 	for(; jj<m-1; jj+=2)
2423 		{
2424 		// diagonal
2425 		c_00 = 0.0;
2426 		c_10 = 0.0;
2427 		c_11 = 0.0;
2428 		for(kk=0; kk<k; kk++)
2429 			{
2430 			c_00 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+0)];
2431 			c_10 += pA[kk+lda*(jj+1)] * pB[kk+ldb*(jj+0)];
2432 			c_11 += pA[kk+lda*(jj+1)] * pB[kk+ldb*(jj+1)];
2433 			}
2434 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2435 		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
2436 		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2437 		// lower
2438 		ii = jj+2;
2439 		for(; ii<m-1; ii+=2)
2440 			{
2441 			c_00 = 0.0;
2442 			c_10 = 0.0;
2443 			c_01 = 0.0;
2444 			c_11 = 0.0;
2445 			for(kk=0; kk<k; kk++)
2446 				{
2447 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2448 				c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
2449 				c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
2450 				c_11 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+1)];
2451 				}
2452 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2453 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2454 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2455 			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2456 			}
2457 		for(; ii<m; ii++)
2458 			{
2459 			c_00 = 0.0;
2460 			c_01 = 0.0;
2461 			for(kk=0; kk<k; kk++)
2462 				{
2463 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2464 				c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
2465 				}
2466 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2467 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2468 			}
2469 		}
2470 	if(jj<m)
2471 		{
2472 		// diagonal
2473 		c_00 = 0.0;
2474 		for(kk=0; kk<k; kk++)
2475 			{
2476 			c_00 += pA[kk+lda*jj] * pB[kk+ldb*jj];
2477 			}
2478 		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
2479 		}
2480 	return;
2481 	}
2482 
2483 
2484 
2485 // dsyrk_upper not-transposed
SYRK_UN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2486 void SYRK_UN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2487 	{
2488 	if(m<=0)
2489 		return;
2490 
2491 	// invalidate stored inverse diagonal of result matrix
2492 	sD->use_dA = 0;
2493 
2494 	int ii, jj, kk;
2495 	REAL
2496 		c_00, c_01,
2497 		c_10, c_11;
2498 	int lda = sA->m;
2499 	int ldb = sB->m;
2500 	int ldc = sC->m;
2501 	int ldd = sD->m;
2502 	REAL *pA = sA->pA + ai + aj*lda;
2503 	REAL *pB = sB->pA + bi + bj*ldb;
2504 	REAL *pC = sC->pA + ci + cj*ldc;
2505 	REAL *pD = sD->pA + di + dj*ldd;
2506 	jj = 0;
2507 	for(; jj<m-1; jj+=2)
2508 		{
2509 		// upper
2510 		ii = 0;
2511 		for(; ii<jj; ii+=2)
2512 			{
2513 			c_00 = 0.0;
2514 			c_10 = 0.0;
2515 			c_01 = 0.0;
2516 			c_11 = 0.0;
2517 			for(kk=0; kk<k; kk++)
2518 				{
2519 				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2520 				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2521 				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2522 				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
2523 				}
2524 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2525 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2526 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2527 			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2528 			}
2529 		// diagonal
2530 		c_00 = 0.0;
2531 		c_01 = 0.0;
2532 		c_11 = 0.0;
2533 		for(kk=0; kk<k; kk++)
2534 			{
2535 			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2536 			c_01 += pA[jj+0+lda*kk] * pB[jj+1+ldb*kk];
2537 			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
2538 			}
2539 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2540 		pD[jj+0+ldd*(jj+1)] = beta * pC[jj+0+ldc*(jj+1)] + alpha * c_01;
2541 		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2542 		}
2543 	if(jj<m)
2544 		{
2545 		// upper
2546 		ii = 0;
2547 		for(; ii<jj; ii+=2)
2548 			{
2549 			c_00 = 0.0;
2550 			c_10 = 0.0;
2551 			for(kk=0; kk<k; kk++)
2552 				{
2553 				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2554 				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2555 				}
2556 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2557 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2558 			}
2559 		// diagonal
2560 		c_00 = 0.0;
2561 		for(kk=0; kk<k; kk++)
2562 			{
2563 			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2564 			}
2565 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2566 		}
2567 	return;
2568 	}
2569 
2570 
2571 
2572 // dsyrk_upper transposed
SYRK_UT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2573 void SYRK_UT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2574 	{
2575 	if(m<=0)
2576 		return;
2577 
2578 	// invalidate stored inverse diagonal of result matrix
2579 	sD->use_dA = 0;
2580 
2581 	int ii, jj, kk;
2582 	REAL
2583 		c_00, c_01,
2584 		c_10, c_11;
2585 	int lda = sA->m;
2586 	int ldb = sB->m;
2587 	int ldc = sC->m;
2588 	int ldd = sD->m;
2589 	REAL *pA = sA->pA + ai + aj*lda;
2590 	REAL *pB = sB->pA + bi + bj*ldb;
2591 	REAL *pC = sC->pA + ci + cj*ldc;
2592 	REAL *pD = sD->pA + di + dj*ldd;
2593 	jj = 0;
2594 	for(; jj<m-1; jj+=2)
2595 		{
2596 		// upper
2597 		ii = 0;
2598 		for(; ii<jj; ii+=2)
2599 			{
2600 			c_00 = 0.0;
2601 			c_10 = 0.0;
2602 			c_01 = 0.0;
2603 			c_11 = 0.0;
2604 			for(kk=0; kk<k; kk++)
2605 				{
2606 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2607 				c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
2608 				c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
2609 				c_11 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+1)];
2610 				}
2611 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2612 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2613 			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2614 			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2615 			}
2616 		// diagonal
2617 		c_00 = 0.0;
2618 		c_01 = 0.0;
2619 		c_11 = 0.0;
2620 		for(kk=0; kk<k; kk++)
2621 			{
2622 			c_00 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+0)];
2623 			c_01 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+1)];
2624 			c_11 += pA[kk+lda*(jj+1)] * pB[kk+ldb*(jj+1)];
2625 			}
2626 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2627 		pD[jj+0+ldd*(jj+1)] = beta * pC[jj+0+ldc*(jj+1)] + alpha * c_01;
2628 		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2629 		}
2630 	if(jj<m)
2631 		{
2632 		// upper
2633 		ii = 0;
2634 		for(; ii<jj; ii+=2)
2635 			{
2636 			c_00 = 0.0;
2637 			c_10 = 0.0;
2638 			for(kk=0; kk<k; kk++)
2639 				{
2640 				c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2641 				c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
2642 				}
2643 			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2644 			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2645 			}
2646 		// diagonal
2647 		c_00 = 0.0;
2648 		for(kk=0; kk<k; kk++)
2649 			{
2650 			c_00 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+0)];
2651 			}
2652 		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2653 		}
2654 	return;
2655 	}
2656 
2657 
2658 
2659 #elif defined(LA_EXTERNAL_BLAS_WRAPPER)
2660 
2661 
2662 
2663 // dgemm nn
GEMM_NN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2664 void GEMM_NN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2665 	{
2666 
2667 	// invalidate stored inverse diagonal of result matrix
2668 	sD->use_dA = 0;
2669 
2670 	int jj;
2671 	char cn = 'n';
2672 	REAL *pA = sA->pA+ai+aj*sA->m;
2673 	REAL *pB = sB->pA+bi+bj*sB->m;
2674 	REAL *pC = sC->pA+ci+cj*sC->m;
2675 	REAL *pD = sD->pA+di+dj*sD->m;
2676 	int i1 = 1;
2677 	int lda = sA->m;
2678 	int ldb = sB->m;
2679 	int ldc = sC->m;
2680 	int ldd = sD->m;
2681 	if(!(beta==0.0 || pC==pD))
2682 		{
2683 		for(jj=0; jj<n; jj++)
2684 			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2685 		}
2686 	GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2687 	return;
2688 	}
2689 
2690 
2691 
2692 // dgemm nt
GEMM_NT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2693 void GEMM_NT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2694 	{
2695 
2696 	// invalidate stored inverse diagonal of result matrix
2697 	sD->use_dA = 0;
2698 
2699 	int jj;
2700 	char cn = 'n';
2701 	char ct = 't';
2702 	REAL *pA = sA->pA+ai+aj*sA->m;
2703 	REAL *pB = sB->pA+bi+bj*sB->m;
2704 	REAL *pC = sC->pA+ci+cj*sC->m;
2705 	REAL *pD = sD->pA+di+dj*sD->m;
2706 	int i1 = 1;
2707 	int lda = sA->m;
2708 	int ldb = sB->m;
2709 	int ldc = sC->m;
2710 	int ldd = sD->m;
2711 	if(!(beta==0.0 || pC==pD))
2712 		{
2713 		for(jj=0; jj<n; jj++)
2714 			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2715 		}
2716 	GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2717 	return;
2718 	}
2719 
2720 
2721 
2722 // dgemm tn
GEMM_TN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2723 void GEMM_TN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2724 	{
2725 
2726 	// invalidate stored inverse diagonal of result matrix
2727 	sD->use_dA = 0;
2728 
2729 	int jj;
2730 	char cn = 'n';
2731 	char ct = 't';
2732 	REAL *pA = sA->pA+ai+aj*sA->m;
2733 	REAL *pB = sB->pA+bi+bj*sB->m;
2734 	REAL *pC = sC->pA+ci+cj*sC->m;
2735 	REAL *pD = sD->pA+di+dj*sD->m;
2736 	int i1 = 1;
2737 	int lda = sA->m;
2738 	int ldb = sB->m;
2739 	int ldc = sC->m;
2740 	int ldd = sD->m;
2741 	if(!(beta==0.0 || pC==pD))
2742 		{
2743 		for(jj=0; jj<n; jj++)
2744 			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2745 		}
2746 	GEMM(&ct, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2747 	return;
2748 	}
2749 
2750 
2751 
2752 // dgemm tt
GEMM_TT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2753 void GEMM_TT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2754 	{
2755 
2756 	// invalidate stored inverse diagonal of result matrix
2757 	sD->use_dA = 0;
2758 
2759 	int jj;
2760 	char cn = 'n';
2761 	char ct = 't';
2762 	REAL *pA = sA->pA+ai+aj*sA->m;
2763 	REAL *pB = sB->pA+bi+bj*sB->m;
2764 	REAL *pC = sC->pA+ci+cj*sC->m;
2765 	REAL *pD = sD->pA+di+dj*sD->m;
2766 	int i1 = 1;
2767 	int lda = sA->m;
2768 	int ldb = sB->m;
2769 	int ldc = sC->m;
2770 	int ldd = sD->m;
2771 	if(!(beta==0.0 || pC==pD))
2772 		{
2773 		for(jj=0; jj<n; jj++)
2774 			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2775 		}
2776 	GEMM(&ct, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2777 	return;
2778 	}
2779 
2780 
2781 
2782 // dtrsm_left_lower_nottransposed_notunit
TRSM_LLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2783 void TRSM_LLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2784 	{
2785 
2786 	// invalidate stored inverse diagonal of result matrix
2787 	sD->use_dA = 0;
2788 
2789 	int jj;
2790 	char cl = 'l';
2791 	char cn = 'n';
2792 	REAL *pA = sA->pA+ai+aj*sA->m;
2793 	REAL *pB = sB->pA+bi+bj*sB->m;
2794 	REAL *pD = sD->pA+di+dj*sD->m;
2795 	int i1 = 1;
2796 	int lda = sA->m;
2797 	int ldb = sB->m;
2798 	int ldd = sD->m;
2799 	if(!(pB==pD))
2800 		{
2801 		for(jj=0; jj<n; jj++)
2802 			COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2803 		}
2804 	TRSM(&cl, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2805 	return;
2806 	}
2807 
2808 
2809 
2810 // dtrsm_left_lower_nottransposed_unit
TRSM_LLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2811 void TRSM_LLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2812 	{
2813 
2814 	// invalidate stored inverse diagonal of result matrix
2815 	sD->use_dA = 0;
2816 
2817 	int jj;
2818 	char cl = 'l';
2819 	char cn = 'n';
2820 	char cu = 'u';
2821 	REAL *pA = sA->pA+ai+aj*sA->m;
2822 	REAL *pB = sB->pA+bi+bj*sB->m;
2823 	REAL *pD = sD->pA+di+dj*sD->m;
2824 	int i1 = 1;
2825 	int lda = sA->m;
2826 	int ldb = sB->m;
2827 	int ldd = sD->m;
2828 	if(!(pB==pD))
2829 		{
2830 		for(jj=0; jj<n; jj++)
2831 			COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2832 		}
2833 	TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
2834 	return;
2835 	}
2836 
2837 
2838 
2839 // dtrsm_left_lower_transposed_notunit
TRSM_LLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2840 void TRSM_LLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2841 	{
2842 
2843 	// invalidate stored inverse diagonal of result matrix
2844 	sD->use_dA = 0;
2845 
2846 	int jj;
2847 	char cl = 'l';
2848 	char cn = 'n';
2849 	char ct = 't';
2850 	REAL *pA = sA->pA+ai+aj*sA->m;
2851 	REAL *pB = sB->pA+bi+bj*sB->m;
2852 	REAL *pD = sD->pA+di+dj*sD->m;
2853 	int i1 = 1;
2854 	int lda = sA->m;
2855 	int ldb = sB->m;
2856 	int ldd = sD->m;
2857 	if(!(pB==pD))
2858 		{
2859 		for(jj=0; jj<n; jj++)
2860 			COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2861 		}
2862 	TRSM(&cl, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2863 	return;
2864 	}
2865 
2866 
2867 
2868 // dtrsm_left_lower_transposed_unit
TRSM_LLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2869 void TRSM_LLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2870 	{
2871 
2872 	// invalidate stored inverse diagonal of result matrix
2873 	sD->use_dA = 0;
2874 
2875 	int jj;
2876 	char cl = 'l';
2877 	char cn = 'n';
2878 	char ct = 't';
2879 	char cu = 'u';
2880 	REAL *pA = sA->pA+ai+aj*sA->m;
2881 	REAL *pB = sB->pA+bi+bj*sB->m;
2882 	REAL *pD = sD->pA+di+dj*sD->m;
2883 	int i1 = 1;
2884 	int lda = sA->m;
2885 	int ldb = sB->m;
2886 	int ldd = sD->m;
2887 	if(!(pB==pD))
2888 		{
2889 		for(jj=0; jj<n; jj++)
2890 			COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2891 		}
2892 	TRSM(&cl, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
2893 	return;
2894 	}
2895 
2896 
2897 
2898 // dtrsm_left_upper_nottransposed_notunit
TRSM_LUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2899 void TRSM_LUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2900 	{
2901 
2902 	// invalidate stored inverse diagonal of result matrix
2903 	sD->use_dA = 0;
2904 
2905 	int jj;
2906 	char cl = 'l';
2907 	char cn = 'n';
2908 	char cu = 'u';
2909 	REAL *pA = sA->pA+ai+aj*sA->m;
2910 	REAL *pB = sB->pA+bi+bj*sB->m;
2911 	REAL *pD = sD->pA+di+dj*sD->m;
2912 	int i1 = 1;
2913 	int lda = sA->m;
2914 	int ldb = sB->m;
2915 	int ldd = sD->m;
2916 	if(!(pB==pD))
2917 		{
2918 		for(jj=0; jj<n; jj++)
2919 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
2920 		}
2921 	TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2922 	return;
2923 	}
2924 
2925 
2926 
2927 // dtrsm_left_upper_nottransposed_unit
TRSM_LUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2928 void TRSM_LUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2929 	{
2930 
2931 	// invalidate stored inverse diagonal of result matrix
2932 	sD->use_dA = 0;
2933 
2934 	int jj;
2935 	char cl = 'l';
2936 	char cn = 'n';
2937 	char cu = 'u';
2938 	REAL *pA = sA->pA+ai+aj*sA->m;
2939 	REAL *pB = sB->pA+bi+bj*sB->m;
2940 	REAL *pD = sD->pA+di+dj*sD->m;
2941 	int i1 = 1;
2942 	int lda = sA->m;
2943 	int ldb = sB->m;
2944 	int ldd = sD->m;
2945 	if(!(pB==pD))
2946 		{
2947 		for(jj=0; jj<n; jj++)
2948 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
2949 		}
2950 	TRSM(&cl, &cu, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
2951 	return;
2952 	}
2953 
2954 
2955 
2956 // dtrsm_left_upper_transposed_notunit
TRSM_LUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2957 void TRSM_LUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2958 	{
2959 
2960 	// invalidate stored inverse diagonal of result matrix
2961 	sD->use_dA = 0;
2962 
2963 	int jj;
2964 	char cl = 'l';
2965 	char cn = 'n';
2966 	char ct = 't';
2967 	char cu = 'u';
2968 	REAL *pA = sA->pA+ai+aj*sA->m;
2969 	REAL *pB = sB->pA+bi+bj*sB->m;
2970 	REAL *pD = sD->pA+di+dj*sD->m;
2971 	int i1 = 1;
2972 	int lda = sA->m;
2973 	int ldb = sB->m;
2974 	int ldd = sD->m;
2975 	if(!(pB==pD))
2976 		{
2977 		for(jj=0; jj<n; jj++)
2978 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
2979 		}
2980 	TRSM(&cl, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2981 	return;
2982 	}
2983 
2984 
2985 
2986 // dtrsm_left_upper_transposed_unit
TRSM_LUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2987 void TRSM_LUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2988 	{
2989 
2990 	// invalidate stored inverse diagonal of result matrix
2991 	sD->use_dA = 0;
2992 
2993 	int jj;
2994 	char cl = 'l';
2995 	char cn = 'n';
2996 	char ct = 't';
2997 	char cu = 'u';
2998 	REAL *pA = sA->pA+ai+aj*sA->m;
2999 	REAL *pB = sB->pA+bi+bj*sB->m;
3000 	REAL *pD = sD->pA+di+dj*sD->m;
3001 	int i1 = 1;
3002 	int lda = sA->m;
3003 	int ldb = sB->m;
3004 	int ldd = sD->m;
3005 	if(!(pB==pD))
3006 		{
3007 		for(jj=0; jj<n; jj++)
3008 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3009 		}
3010 	TRSM(&cl, &cu, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3011 	return;
3012 	}
3013 
3014 
3015 
3016 // dtrsm_right_lower_nottransposed_notunit
TRSM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3017 void TRSM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3018 	{
3019 
3020 	// invalidate stored inverse diagonal of result matrix
3021 	sD->use_dA = 0;
3022 
3023 	int jj;
3024 	char cl = 'l';
3025 	char cn = 'n';
3026 	char cr = 'r';
3027 	char ct = 't';
3028 	char cu = 'u';
3029 	REAL *pA = sA->pA+ai+aj*sA->m;
3030 	REAL *pB = sB->pA+bi+bj*sB->m;
3031 	REAL *pD = sD->pA+di+dj*sD->m;
3032 	int i1 = 1;
3033 	int lda = sA->m;
3034 	int ldb = sB->m;
3035 	int ldd = sD->m;
3036 	if(!(pB==pD))
3037 		{
3038 		for(jj=0; jj<n; jj++)
3039 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3040 		}
3041 	TRSM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3042 	return;
3043 	}
3044 
3045 
3046 
3047 // dtrsm_right_lower_nottransposed_unit
TRSM_RLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3048 void TRSM_RLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3049 	{
3050 
3051 	// invalidate stored inverse diagonal of result matrix
3052 	sD->use_dA = 0;
3053 
3054 	int jj;
3055 	char cl = 'l';
3056 	char cn = 'n';
3057 	char cr = 'r';
3058 	char ct = 't';
3059 	char cu = 'u';
3060 	REAL *pA = sA->pA+ai+aj*sA->m;
3061 	REAL *pB = sB->pA+bi+bj*sB->m;
3062 	REAL *pD = sD->pA+di+dj*sD->m;
3063 	int i1 = 1;
3064 	int lda = sA->m;
3065 	int ldb = sB->m;
3066 	int ldd = sD->m;
3067 	if(!(pB==pD))
3068 		{
3069 		for(jj=0; jj<n; jj++)
3070 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3071 		}
3072 	TRSM(&cr, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3073 	return;
3074 	}
3075 
3076 
3077 
3078 // dtrsm_right_lower_transposed_notunit
TRSM_RLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3079 void TRSM_RLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3080 	{
3081 
3082 	// invalidate stored inverse diagonal of result matrix
3083 	sD->use_dA = 0;
3084 
3085 	int jj;
3086 	char cl = 'l';
3087 	char cn = 'n';
3088 	char cr = 'r';
3089 	char ct = 't';
3090 	char cu = 'u';
3091 	REAL *pA = sA->pA+ai+aj*sA->m;
3092 	REAL *pB = sB->pA+bi+bj*sB->m;
3093 	REAL *pD = sD->pA+di+dj*sD->m;
3094 	int i1 = 1;
3095 	int lda = sA->m;
3096 	int ldb = sB->m;
3097 	int ldd = sD->m;
3098 	if(!(pB==pD))
3099 		{
3100 		for(jj=0; jj<n; jj++)
3101 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3102 		}
3103 	TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3104 	return;
3105 	}
3106 
3107 
3108 
3109 // dtrsm_right_lower_transposed_unit
TRSM_RLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3110 void TRSM_RLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3111 	{
3112 
3113 	// invalidate stored inverse diagonal of result matrix
3114 	sD->use_dA = 0;
3115 
3116 	int jj;
3117 	char cl = 'l';
3118 	char cn = 'n';
3119 	char cr = 'r';
3120 	char ct = 't';
3121 	char cu = 'u';
3122 	REAL *pA = sA->pA+ai+aj*sA->m;
3123 	REAL *pB = sB->pA+bi+bj*sB->m;
3124 	REAL *pD = sD->pA+di+dj*sD->m;
3125 	int i1 = 1;
3126 	int lda = sA->m;
3127 	int ldb = sB->m;
3128 	int ldd = sD->m;
3129 	if(!(pB==pD))
3130 		{
3131 		for(jj=0; jj<n; jj++)
3132 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3133 		}
3134 	TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3135 	return;
3136 	}
3137 
3138 
3139 
3140 // dtrsm_right_upper_nottransposed_notunit
TRSM_RUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3141 void TRSM_RUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3142 	{
3143 
3144 	// invalidate stored inverse diagonal of result matrix
3145 	sD->use_dA = 0;
3146 
3147 	int jj;
3148 	char cl = 'l';
3149 	char cn = 'n';
3150 	char cr = 'r';
3151 	char ct = 't';
3152 	char cu = 'u';
3153 	REAL *pA = sA->pA+ai+aj*sA->m;
3154 	REAL *pB = sB->pA+bi+bj*sB->m;
3155 	REAL *pD = sD->pA+di+dj*sD->m;
3156 	int i1 = 1;
3157 	int lda = sA->m;
3158 	int ldb = sB->m;
3159 	int ldd = sD->m;
3160 	if(!(pB==pD))
3161 		{
3162 		for(jj=0; jj<n; jj++)
3163 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3164 		}
3165 	TRSM(&cr, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3166 	return;
3167 	}
3168 
3169 
3170 
3171 // dtrsm_right_upper_nottransposed_unit
TRSM_RUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3172 void TRSM_RUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3173 	{
3174 
3175 	// invalidate stored inverse diagonal of result matrix
3176 	sD->use_dA = 0;
3177 
3178 	int jj;
3179 	char cl = 'l';
3180 	char cn = 'n';
3181 	char cr = 'r';
3182 	char ct = 't';
3183 	char cu = 'u';
3184 	REAL *pA = sA->pA+ai+aj*sA->m;
3185 	REAL *pB = sB->pA+bi+bj*sB->m;
3186 	REAL *pD = sD->pA+di+dj*sD->m;
3187 	int i1 = 1;
3188 	int lda = sA->m;
3189 	int ldb = sB->m;
3190 	int ldd = sD->m;
3191 	if(!(pB==pD))
3192 		{
3193 		for(jj=0; jj<n; jj++)
3194 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3195 		}
3196 	TRSM(&cr, &cu, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3197 	return;
3198 	}
3199 
3200 
3201 
3202 // dtrsm_right_upper_transposed_notunit
TRSM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3203 void TRSM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3204 	{
3205 
3206 	// invalidate stored inverse diagonal of result matrix
3207 	sD->use_dA = 0;
3208 
3209 	int jj;
3210 	char cl = 'l';
3211 	char cn = 'n';
3212 	char cr = 'r';
3213 	char ct = 't';
3214 	char cu = 'u';
3215 	REAL *pA = sA->pA+ai+aj*sA->m;
3216 	REAL *pB = sB->pA+bi+bj*sB->m;
3217 	REAL *pD = sD->pA+di+dj*sD->m;
3218 	int i1 = 1;
3219 	int lda = sA->m;
3220 	int ldb = sB->m;
3221 	int ldd = sD->m;
3222 	if(!(pB==pD))
3223 		{
3224 		for(jj=0; jj<n; jj++)
3225 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3226 		}
3227 	TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3228 	return;
3229 	}
3230 
3231 
3232 
3233 // dtrsm_right_upper_transposed_unit
TRSM_RUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3234 void TRSM_RUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3235 	{
3236 
3237 	// invalidate stored inverse diagonal of result matrix
3238 	sD->use_dA = 0;
3239 
3240 	int jj;
3241 	char cl = 'l';
3242 	char cn = 'n';
3243 	char cr = 'r';
3244 	char ct = 't';
3245 	char cu = 'u';
3246 	REAL *pA = sA->pA+ai+aj*sA->m;
3247 	REAL *pB = sB->pA+bi+bj*sB->m;
3248 	REAL *pD = sD->pA+di+dj*sD->m;
3249 	int i1 = 1;
3250 	int lda = sA->m;
3251 	int ldb = sB->m;
3252 	int ldd = sD->m;
3253 	if(!(pB==pD))
3254 		{
3255 		for(jj=0; jj<n; jj++)
3256 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3257 		}
3258 	TRSM(&cr, &cu, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3259 	return;
3260 	}
3261 
3262 
3263 
3264 // dtrmm_right_upper_transposed_notunit (A triangular !!!)
TRMM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3265 void TRMM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3266 	{
3267 
3268 	// invalidate stored inverse diagonal of result matrix
3269 	sD->use_dA = 0;
3270 
3271 	int jj;
3272 	char cl = 'l';
3273 	char cn = 'n';
3274 	char cr = 'r';
3275 	char ct = 't';
3276 	char cu = 'u';
3277 	REAL *pA = sA->pA+ai+aj*sA->m;
3278 	REAL *pB = sB->pA+bi+bj*sB->m;
3279 	REAL *pD = sD->pA+di+dj*sD->m;
3280 	int i1 = 1;
3281 	int lda = sA->m;
3282 	int ldb = sB->m;
3283 	int ldd = sD->m;
3284 	if(!(pB==pD))
3285 		{
3286 		for(jj=0; jj<n; jj++)
3287 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3288 		}
3289 	TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3290 	return;
3291 	}
3292 
3293 
3294 
3295 // dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
TRMM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3296 void TRMM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3297 	{
3298 
3299 	// invalidate stored inverse diagonal of result matrix
3300 	sD->use_dA = 0;
3301 
3302 	int jj;
3303 	char cl = 'l';
3304 	char cn = 'n';
3305 	char cr = 'r';
3306 	char ct = 't';
3307 	char cu = 'u';
3308 	REAL *pA = sA->pA+ai+aj*sA->m;
3309 	REAL *pB = sB->pA+bi+bj*sB->m;
3310 	REAL *pD = sD->pA+di+dj*sD->m;
3311 	int i1 = 1;
3312 	int lda = sA->m;
3313 	int ldb = sB->m;
3314 	int ldd = sD->m;
3315 	if(!(pB==pD))
3316 		{
3317 		for(jj=0; jj<n; jj++)
3318 			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3319 		}
3320 	TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3321 	return;
3322 	}
3323 
3324 
3325 
3326 // dsyrk lower not-transposed (allowing for different factors => use dgemm !!!)
SYRK_LN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3327 void SYRK_LN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3328 	{
3329 
3330 	// invalidate stored inverse diagonal of result matrix
3331 	sD->use_dA = 0;
3332 
3333 	int jj;
3334 	char cl = 'l';
3335 	char cn = 'n';
3336 	char cr = 'r';
3337 	char ct = 't';
3338 	char cu = 'u';
3339 	REAL *pA = sA->pA + ai + aj*sA->m;
3340 	REAL *pB = sB->pA + bi + bj*sB->m;
3341 	REAL *pC = sC->pA + ci + cj*sC->m;
3342 	REAL *pD = sD->pA + di + dj*sD->m;
3343 	int i1 = 1;
3344 	int lda = sA->m;
3345 	int ldb = sB->m;
3346 	int ldc = sC->m;
3347 	int ldd = sD->m;
3348 	if(!(beta==0.0 || pC==pD))
3349 		{
3350 		for(jj=0; jj<m; jj++)
3351 			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3352 		}
3353 	if(pA==pB)
3354 		{
3355 		SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3356 		}
3357 	else
3358 		{
3359 		GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3360 		}
3361 	return;
3362 	}
3363 
3364 
3365 
3366 // dsyrk lower not-transposed (allowing for different factors => use dgemm !!!)
SYRK_LN_MN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3367 void SYRK_LN_MN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3368 	{
3369 
3370 	// invalidate stored inverse diagonal of result matrix
3371 	sD->use_dA = 0;
3372 
3373 	int jj;
3374 	char cl = 'l';
3375 	char cn = 'n';
3376 	char cr = 'r';
3377 	char ct = 't';
3378 	char cu = 'u';
3379 	REAL *pA = sA->pA + ai + aj*sA->m;
3380 	REAL *pB = sB->pA + bi + bj*sB->m;
3381 	REAL *pC = sC->pA + ci + cj*sC->m;
3382 	REAL *pD = sD->pA + di + dj*sD->m;
3383 	int i1 = 1;
3384 	int mmn = m-n;
3385 	int lda = sA->m;
3386 	int ldb = sB->m;
3387 	int ldc = sC->m;
3388 	int ldd = sD->m;
3389 	if(!(beta==0.0 || pC==pD))
3390 		{
3391 		for(jj=0; jj<n; jj++)
3392 			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3393 		}
3394 	if(pA==pB)
3395 		{
3396 		SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3397 		GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
3398 		}
3399 	else
3400 		{
3401 		GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3402 		}
3403 	return;
3404 	}
3405 
3406 
3407 
3408 // dsyrk lower transposed (allowing for different factors => use dgemm !!!)
SYRK_LT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3409 void SYRK_LT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3410 	{
3411 
3412 	// invalidate stored inverse diagonal of result matrix
3413 	sD->use_dA = 0;
3414 
3415 	int jj;
3416 	char cl = 'l';
3417 	char cn = 'n';
3418 	char cr = 'r';
3419 	char ct = 't';
3420 	char cu = 'u';
3421 	REAL *pA = sA->pA + ai + aj*sA->m;
3422 	REAL *pB = sB->pA + bi + bj*sB->m;
3423 	REAL *pC = sC->pA + ci + cj*sC->m;
3424 	REAL *pD = sD->pA + di + dj*sD->m;
3425 	int i1 = 1;
3426 	int lda = sA->m;
3427 	int ldb = sB->m;
3428 	int ldc = sC->m;
3429 	int ldd = sD->m;
3430 	if(!(beta==0.0 || pC==pD))
3431 		{
3432 		for(jj=0; jj<m; jj++)
3433 			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3434 		}
3435 	if(pA==pB)
3436 		{
3437 		SYRK(&cl, &ct, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3438 		}
3439 	else
3440 		{
3441 		GEMM(&ct, &cn, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3442 		}
3443 	return;
3444 	}
3445 
3446 
3447 
3448 // dsyrk upper not-transposed (allowing for different factors => use dgemm !!!)
SYRK_UN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3449 void SYRK_UN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3450 	{
3451 
3452 	// invalidate stored inverse diagonal of result matrix
3453 	sD->use_dA = 0;
3454 
3455 	int jj;
3456 	char cl = 'l';
3457 	char cn = 'n';
3458 	char cr = 'r';
3459 	char ct = 't';
3460 	char cu = 'u';
3461 	REAL *pA = sA->pA + ai + aj*sA->m;
3462 	REAL *pB = sB->pA + bi + bj*sB->m;
3463 	REAL *pC = sC->pA + ci + cj*sC->m;
3464 	REAL *pD = sD->pA + di + dj*sD->m;
3465 	int i1 = 1;
3466 	int lda = sA->m;
3467 	int ldb = sB->m;
3468 	int ldc = sC->m;
3469 	int ldd = sD->m;
3470 	if(!(beta==0.0 || pC==pD))
3471 		{
3472 		for(jj=0; jj<m; jj++)
3473 			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3474 		}
3475 	if(pA==pB)
3476 		{
3477 		SYRK(&cu, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3478 		}
3479 	else
3480 		{
3481 		GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3482 		}
3483 	return;
3484 	}
3485 
3486 
3487 
3488 // dsyrk upper transposed (allowing for different factors => use dgemm !!!)
SYRK_UT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3489 void SYRK_UT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3490 	{
3491 
3492 	// invalidate stored inverse diagonal of result matrix
3493 	sD->use_dA = 0;
3494 
3495 	int jj;
3496 	char cl = 'l';
3497 	char cn = 'n';
3498 	char cr = 'r';
3499 	char ct = 't';
3500 	char cu = 'u';
3501 	REAL *pA = sA->pA + ai + aj*sA->m;
3502 	REAL *pB = sB->pA + bi + bj*sB->m;
3503 	REAL *pC = sC->pA + ci + cj*sC->m;
3504 	REAL *pD = sD->pA + di + dj*sD->m;
3505 	int i1 = 1;
3506 	int lda = sA->m;
3507 	int ldb = sB->m;
3508 	int ldc = sC->m;
3509 	int ldd = sD->m;
3510 	if(!(beta==0.0 || pC==pD))
3511 		{
3512 		for(jj=0; jj<m; jj++)
3513 			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3514 		}
3515 	if(pA==pB)
3516 		{
3517 		SYRK(&cu, &ct, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3518 		}
3519 	else
3520 		{
3521 		GEMM(&ct, &cn, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3522 		}
3523 	return;
3524 	}
3525 
3526 
3527 
3528 #else
3529 
3530 #error : wrong LA choice
3531 
3532 #endif
3533