1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS for embedded optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 #if defined(LA_REFERENCE) | defined(TESTING_MODE)
39
40
41 // dgemm nn
GEMM_NN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)42 void GEMM_NN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
43 {
44 if(m<=0 | n<=0)
45 return;
46
47 // invalidate stored inverse diagonal of result matrix
48 sD->use_dA = 0;
49
50 int ii, jj, kk;
51 REAL
52 c_00, c_01,
53 c_10, c_11;
54 int lda = sA->m;
55 int ldb = sB->m;
56 int ldc = sC->m;
57 int ldd = sD->m;
58 REAL *pA = sA->pA + ai + aj*lda;
59 REAL *pB = sB->pA + bi + bj*ldb;
60 REAL *pC = sC->pA + ci + cj*ldc;
61 REAL *pD = sD->pA + di + dj*ldd;
62 jj = 0;
63 for(; jj<n-1; jj+=2)
64 {
65 ii = 0;
66 for(; ii<m-1; ii+=2)
67 {
68 c_00 = 0.0; ;
69 c_10 = 0.0; ;
70 c_01 = 0.0; ;
71 c_11 = 0.0; ;
72 for(kk=0; kk<k; kk++)
73 {
74 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
75 c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
76 c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
77 c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
78 }
79 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
80 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
81 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
82 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
83 }
84 for(; ii<m; ii++)
85 {
86 c_00 = 0.0; ;
87 c_01 = 0.0; ;
88 for(kk=0; kk<k; kk++)
89 {
90 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
91 c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
92 }
93 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
94 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
95 }
96 }
97 for(; jj<n; jj++)
98 {
99 ii = 0;
100 for(; ii<m-1; ii+=2)
101 {
102 c_00 = 0.0; ;
103 c_10 = 0.0; ;
104 for(kk=0; kk<k; kk++)
105 {
106 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
107 c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
108 }
109 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
110 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
111 }
112 for(; ii<m; ii++)
113 {
114 c_00 = 0.0; ;
115 for(kk=0; kk<k; kk++)
116 {
117 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
118 }
119 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
120 }
121 }
122 return;
123 }
124
125
126
127 // dgemm nt
GEMM_NT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)128 void GEMM_NT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
129 {
130 if(m<=0 | n<=0)
131 return;
132
133 // invalidate stored inverse diagonal of result matrix
134 sD->use_dA = 0;
135
136 int ii, jj, kk;
137 REAL
138 c_00, c_01,
139 c_10, c_11;
140 int lda = sA->m;
141 int ldb = sB->m;
142 int ldc = sC->m;
143 int ldd = sD->m;
144 REAL *pA = sA->pA + ai + aj*lda;
145 REAL *pB = sB->pA + bi + bj*ldb;
146 REAL *pC = sC->pA + ci + cj*ldc;
147 REAL *pD = sD->pA + di + dj*ldd;
148 jj = 0;
149 for(; jj<n-1; jj+=2)
150 {
151 ii = 0;
152 for(; ii<m-1; ii+=2)
153 {
154 c_00 = 0.0;
155 c_10 = 0.0;
156 c_01 = 0.0;
157 c_11 = 0.0;
158 for(kk=0; kk<k; kk++)
159 {
160 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
161 c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
162 c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
163 c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
164 }
165 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
166 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
167 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
168 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
169 }
170 for(; ii<m; ii++)
171 {
172 c_00 = 0.0;
173 c_01 = 0.0;
174 for(kk=0; kk<k; kk++)
175 {
176 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
177 c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
178 }
179 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
180 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
181 }
182 }
183 for(; jj<n; jj++)
184 {
185 ii = 0;
186 for(; ii<m-1; ii+=2)
187 {
188 c_00 = 0.0;
189 c_10 = 0.0;
190 for(kk=0; kk<k; kk++)
191 {
192 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
193 c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
194 }
195 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
196 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
197 }
198 for(; ii<m; ii++)
199 {
200 c_00 = 0.0;
201 for(kk=0; kk<k; kk++)
202 {
203 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
204 }
205 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
206 }
207 }
208 return;
209 }
210
211
212
213 // dgemm tn
GEMM_TN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)214 void GEMM_TN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
215 {
216 if(m<=0 | n<=0)
217 return;
218
219 // invalidate stored inverse diagonal of result matrix
220 sD->use_dA = 0;
221
222 int ii, jj, kk;
223 REAL
224 c_00, c_01,
225 c_10, c_11;
226 int lda = sA->m;
227 int ldb = sB->m;
228 int ldc = sC->m;
229 int ldd = sD->m;
230 REAL *pA = sA->pA + ai + aj*lda;
231 REAL *pB = sB->pA + bi + bj*ldb;
232 REAL *pC = sC->pA + ci + cj*ldc;
233 REAL *pD = sD->pA + di + dj*ldd;
234 jj = 0;
235 for(; jj<n-1; jj+=2)
236 {
237 ii = 0;
238 for(; ii<m-1; ii+=2)
239 {
240 c_00 = 0.0; ;
241 c_10 = 0.0; ;
242 c_01 = 0.0; ;
243 c_11 = 0.0; ;
244 for(kk=0; kk<k; kk++)
245 {
246 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
247 c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
248 c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
249 c_11 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+1)];
250 }
251 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
252 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
253 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
254 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
255 }
256 for(; ii<m; ii++)
257 {
258 c_00 = 0.0; ;
259 c_01 = 0.0; ;
260 for(kk=0; kk<k; kk++)
261 {
262 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
263 c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
264 }
265 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
266 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
267 }
268 }
269 for(; jj<n; jj++)
270 {
271 ii = 0;
272 for(; ii<m-1; ii+=2)
273 {
274 c_00 = 0.0; ;
275 c_10 = 0.0; ;
276 for(kk=0; kk<k; kk++)
277 {
278 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
279 c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
280 }
281 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
282 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
283 }
284 for(; ii<m; ii++)
285 {
286 c_00 = 0.0; ;
287 for(kk=0; kk<k; kk++)
288 {
289 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
290 }
291 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
292 }
293 }
294 return;
295 }
296
297
298
299 // dgemm tt
GEMM_TT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)300 void GEMM_TT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
301 {
302 if(m<=0 | n<=0)
303 return;
304
305 // invalidate stored inverse diagonal of result matrix
306 sD->use_dA = 0;
307
308 int ii, jj, kk;
309 REAL
310 c_00, c_01,
311 c_10, c_11;
312 int lda = sA->m;
313 int ldb = sB->m;
314 int ldc = sC->m;
315 int ldd = sD->m;
316 REAL *pA = sA->pA + ai + aj*lda;
317 REAL *pB = sB->pA + bi + bj*ldb;
318 REAL *pC = sC->pA + ci + cj*ldc;
319 REAL *pD = sD->pA + di + dj*ldd;
320 jj = 0;
321 for(; jj<n-1; jj+=2)
322 {
323 ii = 0;
324 for(; ii<m-1; ii+=2)
325 {
326 c_00 = 0.0; ;
327 c_10 = 0.0; ;
328 c_01 = 0.0; ;
329 c_11 = 0.0; ;
330 for(kk=0; kk<k; kk++)
331 {
332 c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
333 c_10 += pA[kk+lda*(ii+1)] * pB[(jj+0)+ldb*kk];
334 c_01 += pA[kk+lda*(ii+0)] * pB[(jj+1)+ldb*kk];
335 c_11 += pA[kk+lda*(ii+1)] * pB[(jj+1)+ldb*kk];
336 }
337 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
338 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
339 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
340 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
341 }
342 for(; ii<m; ii++)
343 {
344 c_00 = 0.0; ;
345 c_01 = 0.0; ;
346 for(kk=0; kk<k; kk++)
347 {
348 c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
349 c_01 += pA[kk+lda*(ii+0)] * pB[(jj+1)+ldb*kk];
350 }
351 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
352 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
353 }
354 }
355 for(; jj<n; jj++)
356 {
357 ii = 0;
358 for(; ii<m-1; ii+=2)
359 {
360 c_00 = 0.0; ;
361 c_10 = 0.0; ;
362 for(kk=0; kk<k; kk++)
363 {
364 c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
365 c_10 += pA[kk+lda*(ii+1)] * pB[(jj+0)+ldb*kk];
366 }
367 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
368 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
369 }
370 for(; ii<m; ii++)
371 {
372 c_00 = 0.0; ;
373 for(kk=0; kk<k; kk++)
374 {
375 c_00 += pA[kk+lda*(ii+0)] * pB[(jj+0)+ldb*kk];
376 }
377 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
378 }
379 }
380 return;
381 }
382
383
384
385 // dtrsm_left_lower_nottransposed_notunit
TRSM_LLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)386 void TRSM_LLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
387 {
388 if(m<=0 | n<=0)
389 return;
390
391 // invalidate stored inverse diagonal of result matrix
392 sD->use_dA = 0;
393
394 int ii, jj, kk;
395 REAL
396 d_00, d_01,
397 d_10, d_11;
398 int lda = sA->m;
399 int ldb = sB->m;
400 int ldd = sD->m;
401 REAL *pA = sA->pA + ai + aj*lda; // triangular
402 REAL *pB = sB->pA + bi + bj*ldb;
403 REAL *pD = sD->pA + di + dj*ldd;
404 REAL *dA = sA->dA;
405 if(ai==0 & aj==0)
406 {
407 if(sA->use_dA<n)
408 {
409 for(ii=0; ii<n; ii++)
410 dA[ii] = 1.0 / pA[ii+lda*ii];
411 sA->use_dA = n;
412 }
413 }
414 else
415 {
416 for(ii=0; ii<n; ii++)
417 dA[ii] = 1.0 / pA[ii+lda*ii];
418 sA->use_dA = 0; // nonzero offset makes diagonal dirty
419 }
420 // solve
421 jj = 0;
422 for(; jj<n-1; jj+=2)
423 {
424 ii = 0;
425 for(; ii<m-1; ii+=2)
426 {
427 d_00 = alpha * pB[ii+0+ldb*(jj+0)];
428 d_10 = alpha * pB[ii+1+ldb*(jj+0)];
429 d_01 = alpha * pB[ii+0+ldb*(jj+1)];
430 d_11 = alpha * pB[ii+1+ldb*(jj+1)];
431 kk = 0;
432 for(; kk<ii; kk++)
433 {
434 d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
435 d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
436 d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
437 d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
438 }
439 d_00 *= dA[ii+0];
440 d_01 *= dA[ii+0];
441 d_10 -= pA[ii+1+lda*ii] * d_00;
442 d_11 -= pA[ii+1+lda*ii] * d_01;
443 d_10 *= dA[ii+1];
444 d_11 *= dA[ii+1];
445 pD[ii+0+ldd*(jj+0)] = d_00;
446 pD[ii+1+ldd*(jj+0)] = d_10;
447 pD[ii+0+ldd*(jj+1)] = d_01;
448 pD[ii+1+ldd*(jj+1)] = d_11;
449 }
450 for(; ii<m; ii++)
451 {
452 d_00 = alpha * pB[ii+ldb*(jj+0)];
453 d_01 = alpha * pB[ii+ldb*(jj+1)];
454 for(kk=0; kk<ii; kk++)
455 {
456 d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
457 d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
458 }
459 d_00 *= dA[ii+0];
460 d_01 *= dA[ii+0];
461 pD[ii+ldd*(jj+0)] = d_00;
462 pD[ii+ldd*(jj+1)] = d_01;
463 }
464 }
465 for(; jj<n; jj++)
466 {
467 ii = 0;
468 for(; ii<m-1; ii+=2)
469 {
470 d_00 = alpha * pB[ii+0+ldb*jj];
471 d_10 = alpha * pB[ii+1+ldb*jj];
472 for(kk=0; kk<ii; kk++)
473 {
474 d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
475 d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
476 }
477 d_00 *= dA[ii+0];
478 d_10 -= pA[ii+1+lda*kk] * d_00;
479 d_10 *= dA[ii+1];
480 pD[ii+0+ldd*jj] = d_00;
481 pD[ii+1+ldd*jj] = d_10;
482 }
483 for(; ii<m; ii++)
484 {
485 d_00 = alpha * pB[ii+ldb*jj];
486 for(kk=0; kk<ii; kk++)
487 {
488 d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
489 }
490 d_00 *= dA[ii+0];
491 pD[ii+ldd*jj] = d_00;
492 }
493 }
494 return;
495 }
496
497
498
499 // dtrsm_left_lower_nottransposed_unit
TRSM_LLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)500 void TRSM_LLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
501 {
502 if(m<=0 | n<=0)
503 return;
504
505 // invalidate stored inverse diagonal of result matrix
506 sD->use_dA = 0;
507
508 int ii, jj, kk;
509 REAL
510 d_00, d_01,
511 d_10, d_11;
512 int lda = sA->m;
513 int ldb = sB->m;
514 int ldd = sD->m;
515 REAL *pA = sA->pA + ai + aj*lda; // triangular
516 REAL *pB = sB->pA + bi + bj*ldb;
517 REAL *pD = sD->pA + di + dj*ldd;
518 // solve
519 jj = 0;
520 for(; jj<n-1; jj+=2)
521 {
522 ii = 0;
523 for(; ii<m-1; ii+=2)
524 {
525 d_00 = alpha * pB[ii+0+ldb*(jj+0)];
526 d_10 = alpha * pB[ii+1+ldb*(jj+0)];
527 d_01 = alpha * pB[ii+0+ldb*(jj+1)];
528 d_11 = alpha * pB[ii+1+ldb*(jj+1)];
529 kk = 0;
530 for(; kk<ii; kk++)
531 {
532 d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
533 d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
534 d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
535 d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
536 }
537 d_10 -= pA[ii+1+lda*kk] * d_00;
538 d_11 -= pA[ii+1+lda*kk] * d_01;
539 pD[ii+0+ldd*(jj+0)] = d_00;
540 pD[ii+1+ldd*(jj+0)] = d_10;
541 pD[ii+0+ldd*(jj+1)] = d_01;
542 pD[ii+1+ldd*(jj+1)] = d_11;
543 }
544 for(; ii<m; ii++)
545 {
546 d_00 = alpha * pB[ii+ldb*(jj+0)];
547 d_01 = alpha * pB[ii+ldb*(jj+1)];
548 for(kk=0; kk<ii; kk++)
549 {
550 d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
551 d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
552 }
553 pD[ii+ldd*(jj+0)] = d_00;
554 pD[ii+ldd*(jj+1)] = d_01;
555 }
556 }
557 for(; jj<n; jj++)
558 {
559 ii = 0;
560 for(; ii<m-1; ii+=2)
561 {
562 d_00 = alpha * pB[ii+0+ldb*jj];
563 d_10 = alpha * pB[ii+1+ldb*jj];
564 for(kk=0; kk<ii; kk++)
565 {
566 d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
567 d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
568 }
569 d_10 -= pA[ii+1+lda*kk] * d_00;
570 pD[ii+0+ldd*jj] = d_00;
571 pD[ii+1+ldd*jj] = d_10;
572 }
573 for(; ii<m; ii++)
574 {
575 d_00 = alpha * pB[ii+ldb*jj];
576 for(kk=0; kk<ii; kk++)
577 {
578 d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
579 }
580 pD[ii+ldd*jj] = d_00;
581 }
582 }
583 return;
584 }
585
586
587
588 // dtrsm_lltn
TRSM_LLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)589 void TRSM_LLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
590 {
591 if(m<=0 | n<=0)
592 return;
593
594 // invalidate stored inverse diagonal of result matrix
595 sD->use_dA = 0;
596
597 int ii, jj, kk, id;
598 REAL
599 d_00, d_01,
600 d_10, d_11;
601 int lda = sA->m;
602 int ldb = sB->m;
603 int ldd = sD->m;
604 REAL *pA = sA->pA + ai + aj*lda; // triangular
605 REAL *pB = sB->pA + bi + bj*ldb;
606 REAL *pD = sD->pA + di + dj*ldd;
607 REAL *dA = sA->dA;
608 if(ai==0 & aj==0)
609 {
610 if (sA->use_dA<m)
611 {
612 // invert diagonal of pA
613 for(ii=0; ii<m; ii++)
614 dA[ii] = 1.0/pA[ii+lda*ii];
615 // use only now
616 sA->use_dA = m;
617 }
618 }
619 else
620 {
621 for(ii=0; ii<m; ii++)
622 dA[ii] = 1.0 / pA[ii+lda*ii];
623 sA->use_dA = 0; // nonzero offset makes diagonal dirty
624 }
625 // solve
626 jj = 0;
627 for(; jj<n-1; jj+=2)
628 {
629 ii = 0;
630 for(; ii<m-1; ii+=2)
631 {
632 id = m-ii-2;
633 d_00 = alpha * pB[id+0+ldb*(jj+0)];
634 d_10 = alpha * pB[id+1+ldb*(jj+0)];
635 d_01 = alpha * pB[id+0+ldb*(jj+1)];
636 d_11 = alpha * pB[id+1+ldb*(jj+1)];
637 kk = id+2;
638 for(; kk<m; kk++)
639 {
640 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
641 d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
642 d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
643 d_11 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+1)];
644 }
645 d_10 *= dA[id+1];
646 d_11 *= dA[id+1];
647 d_00 -= pA[id+1+lda*(id+0)] * d_10;
648 d_01 -= pA[id+1+lda*(id+0)] * d_11;
649 d_00 *= dA[id+0];
650 d_01 *= dA[id+0];
651 pD[id+0+ldd*(jj+0)] = d_00;
652 pD[id+1+ldd*(jj+0)] = d_10;
653 pD[id+0+ldd*(jj+1)] = d_01;
654 pD[id+1+ldd*(jj+1)] = d_11;
655 }
656 for(; ii<m; ii++)
657 {
658 id = m-ii-1;
659 d_00 = alpha * pB[id+0+ldb*(jj+0)];
660 d_01 = alpha * pB[id+0+ldb*(jj+1)];
661 kk = id+1;
662 for(; kk<m; kk++)
663 {
664 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
665 d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
666 }
667 d_00 *= dA[id+0];
668 d_01 *= dA[id+0];
669 pD[id+0+ldd*(jj+0)] = d_00;
670 pD[id+0+ldd*(jj+1)] = d_01;
671 }
672 }
673 for(; jj<n; jj++)
674 {
675 ii = 0;
676 for(; ii<m-1; ii+=2)
677 {
678 id = m-ii-2;
679 d_00 = alpha * pB[id+0+ldb*(jj+0)];
680 d_10 = alpha * pB[id+1+ldb*(jj+0)];
681 kk = id+2;
682 for(; kk<m; kk++)
683 {
684 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
685 d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
686 }
687 d_10 *= dA[id+1];
688 d_00 -= pA[id+1+lda*(id+0)] * d_10;
689 d_00 *= dA[id+0];
690 pD[id+0+ldd*(jj+0)] = d_00;
691 pD[id+1+ldd*(jj+0)] = d_10;
692 }
693 for(; ii<m; ii++)
694 {
695 id = m-ii-1;
696 d_00 = alpha * pB[id+0+ldb*(jj+0)];
697 kk = id+1;
698 for(; kk<m; kk++)
699 {
700 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
701 }
702 d_00 *= dA[id+0];
703 pD[id+0+ldd*(jj+0)] = d_00;
704 }
705 }
706 return;
707 }
708
709
710
711 // dtrsm_lltu
TRSM_LLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)712 void TRSM_LLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
713 {
714 if(m<=0 | n<=0)
715 return;
716
717 // invalidate stored inverse diagonal of result matrix
718 sD->use_dA = 0;
719
720 int ii, jj, kk, id;
721 REAL
722 d_00, d_01,
723 d_10, d_11;
724 int lda = sA->m;
725 int ldb = sB->m;
726 int ldd = sD->m;
727 REAL *pA = sA->pA + ai + aj*lda; // triangular
728 REAL *pB = sB->pA + bi + bj*ldb;
729 REAL *pD = sD->pA + di + dj*ldd;
730 // solve
731 jj = 0;
732 for(; jj<n-1; jj+=2)
733 {
734 ii = 0;
735 for(; ii<m-1; ii+=2)
736 {
737 id = m-ii-2;
738 d_00 = alpha * pB[id+0+ldb*(jj+0)];
739 d_10 = alpha * pB[id+1+ldb*(jj+0)];
740 d_01 = alpha * pB[id+0+ldb*(jj+1)];
741 d_11 = alpha * pB[id+1+ldb*(jj+1)];
742 kk = id+2;
743
744 for(; kk<m; kk++)
745 {
746 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
747 d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
748 d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
749 d_11 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+1)];
750 }
751 d_00 -= pA[id+1+lda*(id+0)] * d_10;
752 d_01 -= pA[id+1+lda*(id+0)] * d_11;
753 pD[id+0+ldd*(jj+0)] = d_00;
754 pD[id+1+ldd*(jj+0)] = d_10;
755 pD[id+0+ldd*(jj+1)] = d_01;
756 pD[id+1+ldd*(jj+1)] = d_11;
757 }
758 for(; ii<m; ii++)
759 {
760 id = m-ii-1;
761 d_00 = alpha * pB[id+0+ldb*(jj+0)];
762 d_01 = alpha * pB[id+0+ldb*(jj+1)];
763 kk = id+1;
764 for(; kk<m; kk++)
765 {
766 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
767 d_01 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+1)];
768 }
769 pD[id+0+ldd*(jj+0)] = d_00;
770 pD[id+0+ldd*(jj+1)] = d_01;
771 }
772 }
773 for(; jj<n; jj++)
774 {
775 ii = 0;
776 for(; ii<m-1; ii+=2)
777 {
778 id = m-ii-2;
779 d_00 = alpha * pB[id+0+ldb*(jj+0)];
780 d_10 = alpha * pB[id+1+ldb*(jj+0)];
781 kk = id+2;
782 for(; kk<m; kk++)
783 {
784 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
785 d_10 -= pA[kk+0+lda*(id+1)] * pD[kk+0+ldd*(jj+0)];
786 }
787 d_00 -= pA[id+1+lda*(id+0)] * d_10;
788 pD[id+0+ldd*(jj+0)] = d_00;
789 pD[id+1+ldd*(jj+0)] = d_10;
790 }
791 for(; ii<m; ii++)
792 {
793 id = m-ii-1;
794 d_00 = alpha * pB[id+0+ldb*(jj+0)];
795 kk = id+1;
796 for(; kk<m; kk++)
797 {
798 d_00 -= pA[kk+0+lda*(id+0)] * pD[kk+0+ldd*(jj+0)];
799 }
800 pD[id+0+ldd*(jj+0)] = d_00;
801 }
802 }
803 return;
804 }
805
806
807
808 // dtrsm_left_upper_nottransposed_notunit
TRSM_LUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)809 void TRSM_LUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
810 {
811 if(m<=0 | n<=0)
812 return;
813
814 // invalidate stored inverse diagonal of result matrix
815 sD->use_dA = 0;
816
817 int ii, jj, kk, id;
818 REAL
819 d_00, d_01,
820 d_10, d_11;
821 int lda = sA->m;
822 int ldb = sB->m;
823 int ldd = sD->m;
824 REAL *pA = sA->pA + ai + aj*lda; // triangular
825 REAL *pB = sB->pA + bi + bj*ldb;
826 REAL *pD = sD->pA + di + dj*ldd;
827 REAL *dA = sA->dA;
828 if(ai==0 & aj==0)
829 {
830 if (sA->use_dA<m)
831 {
832 // invert diagonal of pA
833 for(ii=0; ii<m; ii++)
834 dA[ii] = 1.0/pA[ii+lda*ii];
835 // use only now
836 sA->use_dA = m;
837 }
838 }
839 else
840 {
841 for(ii=0; ii<m; ii++)
842 dA[ii] = 1.0 / pA[ii+lda*ii];
843 sA->use_dA = 0; // nonzero offset makes diagonal dirty
844 }
845 // solve
846 jj = 0;
847 for(; jj<n-1; jj+=2)
848 {
849 ii = 0;
850 for(; ii<m-1; ii+=2)
851 {
852 id = m-ii-2;
853 d_00 = alpha * pB[id+0+ldb*(jj+0)];
854 d_10 = alpha * pB[id+1+ldb*(jj+0)];
855 d_01 = alpha * pB[id+0+ldb*(jj+1)];
856 d_11 = alpha * pB[id+1+ldb*(jj+1)];
857 kk = id+2;
858 for(; kk<m; kk++)
859 {
860 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
861 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
862 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
863 d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
864 }
865 d_10 *= dA[id+1];
866 d_11 *= dA[id+1];
867 d_00 -= pA[id+0+lda*(id+1)] * d_10;
868 d_01 -= pA[id+0+lda*(id+1)] * d_11;
869 d_00 *= dA[id+0];
870 d_01 *= dA[id+0];
871 pD[id+0+ldd*(jj+0)] = d_00;
872 pD[id+1+ldd*(jj+0)] = d_10;
873 pD[id+0+ldd*(jj+1)] = d_01;
874 pD[id+1+ldd*(jj+1)] = d_11;
875 }
876 for(; ii<m; ii++)
877 {
878 id = m-ii-1;
879 d_00 = alpha * pB[id+0+ldb*(jj+0)];
880 d_01 = alpha * pB[id+0+ldb*(jj+1)];
881 kk = id+1;
882 for(; kk<m; kk++)
883 {
884 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
885 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
886 }
887 d_00 *= dA[id+0];
888 d_01 *= dA[id+0];
889 pD[id+0+ldd*(jj+0)] = d_00;
890 pD[id+0+ldd*(jj+1)] = d_01;
891 }
892 }
893 for(; jj<n; jj++)
894 {
895 ii = 0;
896 for(; ii<m-1; ii+=2)
897 {
898 id = m-ii-2;
899 d_00 = alpha * pB[id+0+ldb*(jj+0)];
900 d_10 = alpha * pB[id+1+ldb*(jj+0)];
901 kk = id+2;
902 for(; kk<m; kk++)
903 {
904 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
905 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
906 }
907 d_10 *= dA[id+1];
908 d_00 -= pA[id+0+lda*(id+1)] * d_10;
909 d_00 *= dA[id+0];
910 pD[id+0+ldd*(jj+0)] = d_00;
911 pD[id+1+ldd*(jj+0)] = d_10;
912 }
913 for(; ii<m; ii++)
914 {
915 id = m-ii-1;
916 d_00 = alpha * pB[id+0+ldb*(jj+0)];
917 kk = id+1;
918 for(; kk<m; kk++)
919 {
920 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
921 }
922 d_00 *= dA[id+0];
923 pD[id+0+ldd*(jj+0)] = d_00;
924 }
925 }
926 return;
927 }
928
929
930
931 // dtrsm_lunu
TRSM_LUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)932 void TRSM_LUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
933 {
934 if(m<=0 | n<=0)
935 return;
936
937 // invalidate stored inverse diagonal of result matrix
938 sD->use_dA = 0;
939
940 int ii, jj, kk, id;
941 REAL
942 d_00, d_01,
943 d_10, d_11;
944 int lda = sA->m;
945 int ldb = sB->m;
946 int ldd = sD->m;
947 REAL *pA = sA->pA + ai + aj*lda; // triangular
948 REAL *pB = sB->pA + bi + bj*ldb;
949 REAL *pD = sD->pA + di + dj*ldd;
950 // solve
951 jj = 0;
952 for(; jj<n-1; jj+=2)
953 {
954 ii = 0;
955 for(; ii<m-1; ii+=2)
956 {
957 id = m-ii-2;
958 d_00 = alpha * pB[id+0+ldb*(jj+0)];
959 d_10 = alpha * pB[id+1+ldb*(jj+0)];
960 d_01 = alpha * pB[id+0+ldb*(jj+1)];
961 d_11 = alpha * pB[id+1+ldb*(jj+1)];
962 kk = id+2;
963 for(; kk<m; kk++)
964 {
965 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
966 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
967 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
968 d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
969 }
970 d_00 -= pA[id+0+lda*(id+1)] * d_10;
971 d_01 -= pA[id+0+lda*(id+1)] * d_11;
972 pD[id+0+ldd*(jj+0)] = d_00;
973 pD[id+1+ldd*(jj+0)] = d_10;
974 pD[id+0+ldd*(jj+1)] = d_01;
975 pD[id+1+ldd*(jj+1)] = d_11;
976 }
977 for(; ii<m; ii++)
978 {
979 id = m-ii-1;
980 d_00 = alpha * pB[id+0+ldb*(jj+0)];
981 d_01 = alpha * pB[id+0+ldb*(jj+1)];
982 kk = id+1;
983 for(; kk<m; kk++)
984 {
985 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
986 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
987 }
988 pD[id+0+ldd*(jj+0)] = d_00;
989 pD[id+0+ldd*(jj+1)] = d_01;
990 }
991 }
992 for(; jj<n; jj++)
993 {
994 ii = 0;
995 for(; ii<m-1; ii+=2)
996 {
997 id = m-ii-2;
998 d_00 = alpha * pB[id+0+ldb*(jj+0)];
999 d_10 = alpha * pB[id+1+ldb*(jj+0)];
1000 kk = id+2;
1001 for(; kk<m; kk++)
1002 {
1003 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
1004 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
1005 }
1006 d_00 -= pA[id+0+lda*(id+1)] * d_10;
1007 pD[id+0+ldd*(jj+0)] = d_00;
1008 pD[id+1+ldd*(jj+0)] = d_10;
1009 }
1010 for(; ii<m; ii++)
1011 {
1012 id = m-ii-1;
1013 d_00 = alpha * pB[id+0+ldb*(jj+0)];
1014 kk = id+1;
1015 for(; kk<m; kk++)
1016 {
1017 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
1018 }
1019 pD[id+0+ldd*(jj+0)] = d_00;
1020 }
1021 }
1022 return;
1023 return;
1024 }
1025
1026
1027
1028 // dtrsm_lutn
TRSM_LUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1029 void TRSM_LUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1030 {
1031 if(m<=0 | n<=0)
1032 return;
1033
1034 // invalidate stored inverse diagonal of result matrix
1035 sD->use_dA = 0;
1036
1037 int ii, jj, kk;
1038 REAL
1039 d_00, d_01,
1040 d_10, d_11;
1041 int lda = sA->m;
1042 int ldb = sB->m;
1043 int ldd = sD->m;
1044 REAL *pA = sA->pA + ai + aj*lda; // triangular
1045 REAL *pB = sB->pA + bi + bj*ldb;
1046 REAL *pD = sD->pA + di + dj*ldd;
1047 REAL *dA = sA->dA;
1048 if(ai==0 & aj==0)
1049 {
1050 if(sA->use_dA<n)
1051 {
1052 for(ii=0; ii<n; ii++)
1053 dA[ii] = 1.0 / pA[ii+lda*ii];
1054 sA->use_dA = n;
1055 }
1056 }
1057 else
1058 {
1059 for(ii=0; ii<n; ii++)
1060 dA[ii] = 1.0 / pA[ii+lda*ii];
1061 sA->use_dA = 0; // nonzero offset makes diagonal dirty
1062 }
1063 // solve
1064 jj = 0;
1065 for(; jj<n-1; jj+=2)
1066 {
1067 ii = 0;
1068 for(; ii<m-1; ii+=2)
1069 {
1070 d_00 = alpha * pB[ii+0+ldb*(jj+0)];
1071 d_10 = alpha * pB[ii+1+ldb*(jj+0)];
1072 d_01 = alpha * pB[ii+0+ldb*(jj+1)];
1073 d_11 = alpha * pB[ii+1+ldb*(jj+1)];
1074 kk = 0;
1075 for(; kk<ii; kk++)
1076 {
1077 d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+0)];
1078 d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+0)];
1079 d_01 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+1)];
1080 d_11 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+1)];
1081 }
1082 d_00 *= dA[ii+0];
1083 d_01 *= dA[ii+0];
1084 d_10 -= pA[ii+lda*(ii+1)] * d_00;
1085 d_11 -= pA[ii+lda*(ii+1)] * d_01;
1086 d_10 *= dA[ii+1];
1087 d_11 *= dA[ii+1];
1088 pD[ii+0+ldd*(jj+0)] = d_00;
1089 pD[ii+1+ldd*(jj+0)] = d_10;
1090 pD[ii+0+ldd*(jj+1)] = d_01;
1091 pD[ii+1+ldd*(jj+1)] = d_11;
1092 }
1093 for(; ii<m; ii++)
1094 {
1095 d_00 = alpha * pB[ii+ldb*(jj+0)];
1096 d_01 = alpha * pB[ii+ldb*(jj+1)];
1097 for(kk=0; kk<ii; kk++)
1098 {
1099 d_00 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+0)];
1100 d_01 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+1)];
1101 }
1102 d_00 *= dA[ii+0];
1103 d_01 *= dA[ii+0];
1104 pD[ii+ldd*(jj+0)] = d_00;
1105 pD[ii+ldd*(jj+1)] = d_01;
1106 }
1107 }
1108 for(; jj<n; jj++)
1109 {
1110 ii = 0;
1111 for(; ii<m-1; ii+=2)
1112 {
1113 d_00 = alpha * pB[ii+0+ldb*jj];
1114 d_10 = alpha * pB[ii+1+ldb*jj];
1115 for(kk=0; kk<ii; kk++)
1116 {
1117 d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*jj];
1118 d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*jj];
1119 }
1120 d_00 *= dA[ii+0];
1121 d_10 -= pA[kk+lda*(ii+1)] * d_00;
1122 d_10 *= dA[ii+1];
1123 pD[ii+0+ldd*jj] = d_00;
1124 pD[ii+1+ldd*jj] = d_10;
1125 }
1126 for(; ii<m; ii++)
1127 {
1128 d_00 = alpha * pB[ii+ldb*jj];
1129 for(kk=0; kk<ii; kk++)
1130 {
1131 d_00 -= pA[kk+lda*ii] * pD[kk+ldd*jj];
1132 }
1133 d_00 *= dA[ii+0];
1134 pD[ii+ldd*jj] = d_00;
1135 }
1136 }
1137 return;
1138 }
1139
1140
1141
1142 // dtrsm_lutu
TRSM_LUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1143 void TRSM_LUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1144 {
1145 if(m<=0 | n<=0)
1146 return;
1147
1148 // invalidate stored inverse diagonal of result matrix
1149 sD->use_dA = 0;
1150
1151 int ii, jj, kk;
1152 REAL
1153 d_00, d_01,
1154 d_10, d_11;
1155 int lda = sA->m;
1156 int ldb = sB->m;
1157 int ldd = sD->m;
1158 REAL *pA = sA->pA + ai + aj*lda; // triangular
1159 REAL *pB = sB->pA + bi + bj*ldb;
1160 REAL *pD = sD->pA + di + dj*ldd;
1161 // solve
1162 jj = 0;
1163 for(; jj<n-1; jj+=2)
1164 {
1165 ii = 0;
1166 for(; ii<m-1; ii+=2)
1167 {
1168 d_00 = alpha * pB[ii+0+ldb*(jj+0)];
1169 d_10 = alpha * pB[ii+1+ldb*(jj+0)];
1170 d_01 = alpha * pB[ii+0+ldb*(jj+1)];
1171 d_11 = alpha * pB[ii+1+ldb*(jj+1)];
1172 kk = 0;
1173 for(; kk<ii; kk++)
1174 {
1175 d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+0)];
1176 d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+0)];
1177 d_01 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*(jj+1)];
1178 d_11 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*(jj+1)];
1179 }
1180 d_10 -= pA[ii+lda*(ii+1)] * d_00;
1181 d_11 -= pA[ii+lda*(ii+1)] * d_01;
1182 pD[ii+0+ldd*(jj+0)] = d_00;
1183 pD[ii+1+ldd*(jj+0)] = d_10;
1184 pD[ii+0+ldd*(jj+1)] = d_01;
1185 pD[ii+1+ldd*(jj+1)] = d_11;
1186 }
1187 for(; ii<m; ii++)
1188 {
1189 d_00 = alpha * pB[ii+ldb*(jj+0)];
1190 d_01 = alpha * pB[ii+ldb*(jj+1)];
1191 for(kk=0; kk<ii; kk++)
1192 {
1193 d_00 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+0)];
1194 d_01 -= pA[kk+lda*ii] * pD[kk+ldd*(jj+1)];
1195 }
1196 pD[ii+ldd*(jj+0)] = d_00;
1197 pD[ii+ldd*(jj+1)] = d_01;
1198 }
1199 }
1200 for(; jj<n; jj++)
1201 {
1202 ii = 0;
1203 for(; ii<m-1; ii+=2)
1204 {
1205 d_00 = alpha * pB[ii+0+ldb*jj];
1206 d_10 = alpha * pB[ii+1+ldb*jj];
1207 for(kk=0; kk<ii; kk++)
1208 {
1209 d_00 -= pA[kk+lda*(ii+0)] * pD[kk+ldd*jj];
1210 d_10 -= pA[kk+lda*(ii+1)] * pD[kk+ldd*jj];
1211 }
1212 d_10 -= pA[kk+lda*(ii+1)] * d_00;
1213 pD[ii+0+ldd*jj] = d_00;
1214 pD[ii+1+ldd*jj] = d_10;
1215 }
1216 for(; ii<m; ii++)
1217 {
1218 d_00 = alpha * pB[ii+ldb*jj];
1219 for(kk=0; kk<ii; kk++)
1220 {
1221 d_00 -= pA[kk+lda*ii] * pD[kk+ldd*jj];
1222 }
1223 pD[ii+ldd*jj] = d_00;
1224 }
1225 }
1226 return;
1227 }
1228
1229
1230
1231 // dtrsm_rlnn
TRSM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1232 void TRSM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1233 {
1234 if(m<=0 | n<=0)
1235 return;
1236
1237 // invalidate stored inverse diagonal of result matrix
1238 sD->use_dA = 0;
1239
1240 char cl = 'l';
1241 char cn = 'n';
1242 char cr = 'r';
1243 char ct = 't';
1244 char cu = 'u';
1245 int i1 = 1;
1246 int ii, jj, kk, id;
1247 int lda = sA->m;
1248 int ldb = sB->m;
1249 int ldd = sD->m;
1250 REAL
1251 d_00, d_01,
1252 d_10, d_11;
1253 REAL *pA = sA->pA+ai+aj*lda;
1254 REAL *pB = sB->pA+bi+bj*ldb;
1255 REAL *pD = sD->pA+di+dj*ldd;
1256 REAL *dA = sA->dA;
1257 if(ai==0 & aj==0)
1258 {
1259 if (sA->use_dA<n)
1260 {
1261 // invert diagonal of pA
1262 for(ii=0; ii<n; ii++)
1263 dA[ii] = 1.0/pA[ii+lda*ii];
1264 // use only now
1265 sA->use_dA = n;
1266 }
1267 }
1268 else
1269 {
1270 for(ii=0; ii<n; ii++)
1271 dA[ii] = 1.0 / pA[ii+lda*ii];
1272 sA->use_dA = 0; // nonzero offset makes diagonal dirty
1273 }
1274 // solve
1275 jj = 0;
1276 for(; jj<n-1; jj+=2)
1277 {
1278 ii = 0;
1279 id = n-jj-2;
1280 for(; ii<m-1; ii+=2)
1281 {
1282 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1283 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1284 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1285 d_11 = alpha * pB[ii+1+ldb*(id+1)];
1286 kk = id+2;
1287 for(; kk<n; kk++)
1288 {
1289 d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1290 d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1291 d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1292 d_11 -= pA[kk+0+lda*(id+1)] * pD[ii+1+ldd*(kk+0)];
1293 }
1294 d_01 *= dA[id+1];
1295 d_11 *= dA[id+1];
1296 d_00 -= pA[id+1+lda*(id+0)] * d_01;
1297 d_10 -= pA[id+1+lda*(id+0)] * d_11;
1298 d_00 *= dA[id+0];
1299 d_10 *= dA[id+0];
1300 pD[ii+0+ldd*(id+0)] = d_00;
1301 pD[ii+1+ldd*(id+0)] = d_10;
1302 pD[ii+0+ldd*(id+1)] = d_01;
1303 pD[ii+1+ldd*(id+1)] = d_11;
1304 }
1305 for(; ii<m; ii++)
1306 {
1307 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1308 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1309 kk = id+2;
1310 for(; kk<n; kk++)
1311 {
1312 d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1313 d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1314 }
1315 d_01 *= dA[id+1];
1316 d_00 -= pA[id+1+lda*(id+0)] * d_01;
1317 d_00 *= dA[id+0];
1318 pD[ii+0+ldd*(id+0)] = d_00;
1319 pD[ii+0+ldd*(id+1)] = d_01;
1320 }
1321 }
1322 for(; jj<n; jj++)
1323 {
1324 ii = 0;
1325 id = n-jj-1;
1326 for(; ii<m-1; ii+=2)
1327 {
1328 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1329 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1330 kk = id+1;
1331 for(; kk<n; kk++)
1332 {
1333 d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1334 d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1335 }
1336 d_00 *= dA[id+0];
1337 d_10 *= dA[id+0];
1338 pD[ii+0+ldd*(id+0)] = d_00;
1339 pD[ii+1+ldd*(id+0)] = d_10;
1340 }
1341 for(; ii<m; ii++)
1342 {
1343 d_00 = alpha * pB[ii+ldb*(id)];
1344 kk = id+1;
1345 for(; kk<n; kk++)
1346 d_00 -= pA[kk+lda*(id)] * pD[ii+ldd*(kk)];
1347 pD[ii+ldd*(id)] = d_00 * dA[id];
1348 }
1349 }
1350 return;
1351 }
1352
1353
1354
1355 // dtrsm_rlnu
TRSM_RLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1356 void TRSM_RLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1357 {
1358 if(m<=0 | n<=0)
1359 return;
1360
1361 // invalidate stored inverse diagonal of result matrix
1362 sD->use_dA = 0;
1363
1364 char cl = 'l';
1365 char cn = 'n';
1366 char cr = 'r';
1367 char ct = 't';
1368 char cu = 'u';
1369 int i1 = 1;
1370 int ii, jj, kk, id;
1371 int lda = sA->m;
1372 int ldb = sB->m;
1373 int ldd = sD->m;
1374 REAL
1375 d_00, d_01,
1376 d_10, d_11;
1377 REAL *pA = sA->pA+ai+aj*lda;
1378 REAL *pB = sB->pA+bi+bj*ldb;
1379 REAL *pD = sD->pA+di+dj*ldd;
1380 // solve
1381 jj = 0;
1382 for(; jj<n-1; jj+=2)
1383 {
1384 ii = 0;
1385 id = n-jj-2;
1386 for(; ii<m-1; ii+=2)
1387 {
1388 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1389 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1390 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1391 d_11 = alpha * pB[ii+1+ldb*(id+1)];
1392 kk = id+2;
1393 for(; kk<n; kk++)
1394 {
1395 d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1396 d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1397 d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1398 d_11 -= pA[kk+0+lda*(id+1)] * pD[ii+1+ldd*(kk+0)];
1399 }
1400 d_00 -= pA[id+1+lda*(id+0)] * d_01;
1401 d_10 -= pA[id+1+lda*(id+0)] * d_11;
1402 pD[ii+0+ldd*(id+0)] = d_00;
1403 pD[ii+1+ldd*(id+0)] = d_10;
1404 pD[ii+0+ldd*(id+1)] = d_01;
1405 pD[ii+1+ldd*(id+1)] = d_11;
1406 }
1407 for(; ii<m; ii++)
1408 {
1409 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1410 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1411 kk = id+2;
1412 for(; kk<n; kk++)
1413 {
1414 d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1415 d_01 -= pA[kk+0+lda*(id+1)] * pD[ii+0+ldd*(kk+0)];
1416 }
1417 d_00 -= pA[id+1+lda*(id+0)] * d_01;
1418 pD[ii+0+ldd*(id+0)] = d_00;
1419 pD[ii+0+ldd*(id+1)] = d_01;
1420 }
1421 }
1422 for(; jj<n; jj++)
1423 {
1424 ii = 0;
1425 id = n-jj-1;
1426 for(; ii<m-1; ii+=2)
1427 {
1428 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1429 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1430 kk = id+1;
1431 for(; kk<n; kk++)
1432 {
1433 d_00 -= pA[kk+0+lda*(id+0)] * pD[ii+0+ldd*(kk+0)];
1434 d_10 -= pA[kk+0+lda*(id+0)] * pD[ii+1+ldd*(kk+0)];
1435 }
1436 pD[ii+0+ldd*(id+0)] = d_00;
1437 pD[ii+1+ldd*(id+0)] = d_10;
1438 }
1439 for(; ii<m; ii++)
1440 {
1441 d_00 = alpha * pB[ii+ldb*(id)];
1442 kk = id+1;
1443 for(; kk<n; kk++)
1444 d_00 -= pA[kk+lda*(id)] * pD[ii+ldd*(kk)];
1445 }
1446 }
1447 return;
1448 }
1449
1450
1451
1452 // dtrsm_right_lower_transposed_not-unit
TRSM_RLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1453 void TRSM_RLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1454 {
1455 if(m<=0 | n<=0)
1456 return;
1457
1458 // invalidate stored inverse diagonal of result matrix
1459 sD->use_dA = 0;
1460
1461 int ii, jj, kk;
1462 int lda = sA->m;
1463 int ldb = sB->m;
1464 int ldd = sD->m;
1465 REAL *pA = sA->pA + ai + aj*lda;
1466 REAL *pB = sB->pA + bi + bj*ldb;
1467 REAL *pD = sD->pA + di + dj*ldd;
1468 REAL *dA = sA->dA;
1469 if(ai==0 & aj==0)
1470 {
1471 if(sA->use_dA<n)
1472 {
1473 for(ii=0; ii<n; ii++)
1474 dA[ii] = 1.0 / pA[ii+lda*ii];
1475 sA->use_dA = n;
1476 }
1477 }
1478 else
1479 {
1480 for(ii=0; ii<n; ii++)
1481 dA[ii] = 1.0 / pA[ii+lda*ii];
1482 sA->use_dA = 0; // nonzero offset makes diagonal dirty
1483 }
1484 REAL
1485 f_00_inv,
1486 f_10, f_11_inv,
1487 c_00, c_01,
1488 c_10, c_11;
1489 // solve
1490 jj = 0;
1491 for(; jj<n-1; jj+=2)
1492 {
1493 f_00_inv = dA[jj+0];
1494 f_10 = pA[jj+1+lda*(jj+0)];
1495 f_11_inv = dA[jj+1];
1496 ii = 0;
1497 for(; ii<m-1; ii+=2)
1498 {
1499 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1500 c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1501 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1502 c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1503 for(kk=0; kk<jj; kk++)
1504 {
1505 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1506 c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
1507 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1508 c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
1509 }
1510 c_00 *= f_00_inv;
1511 c_10 *= f_00_inv;
1512 pD[ii+0+ldd*(jj+0)] = c_00;
1513 pD[ii+1+ldd*(jj+0)] = c_10;
1514 c_01 -= c_00 * f_10;
1515 c_11 -= c_10 * f_10;
1516 c_01 *= f_11_inv;
1517 c_11 *= f_11_inv;
1518 pD[ii+0+ldd*(jj+1)] = c_01;
1519 pD[ii+1+ldd*(jj+1)] = c_11;
1520 }
1521 for(; ii<m; ii++)
1522 {
1523 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1524 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1525 for(kk=0; kk<jj; kk++)
1526 {
1527 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1528 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1529 }
1530 c_00 *= f_00_inv;
1531 pD[ii+0+ldd*(jj+0)] = c_00;
1532 c_01 -= c_00 * f_10;
1533 c_01 *= f_11_inv;
1534 pD[ii+0+ldd*(jj+1)] = c_01;
1535 }
1536 }
1537 for(; jj<n; jj++)
1538 {
1539 // factorize diagonal
1540 f_00_inv = dA[jj];
1541 for(ii=0; ii<m; ii++)
1542 {
1543 c_00 = alpha * pB[ii+ldb*jj];
1544 for(kk=0; kk<jj; kk++)
1545 {
1546 c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
1547 }
1548 c_00 *= f_00_inv;
1549 pD[ii+ldd*jj] = c_00;
1550 }
1551 }
1552 return;
1553 }
1554
1555
1556
1557 // dtrsm_right_lower_transposed_unit
TRSM_RLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1558 void TRSM_RLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1559 {
1560 if(m<=0 | n<=0)
1561 return;
1562
1563 // invalidate stored inverse diagonal of result matrix
1564 sD->use_dA = 0;
1565
1566 int ii, jj, kk;
1567 int lda = sA->m;
1568 int ldb = sB->m;
1569 int ldd = sD->m;
1570 REAL *pA = sA->pA + ai + aj*lda;
1571 REAL *pB = sB->pA + bi + bj*ldb;
1572 REAL *pD = sD->pA + di + dj*ldd;
1573 REAL
1574 f_10,
1575 c_00, c_01,
1576 c_10, c_11;
1577 // solve
1578 jj = 0;
1579 for(; jj<n-1; jj+=2)
1580 {
1581 f_10 = pA[jj+1+lda*(jj+0)];
1582 ii = 0;
1583 for(; ii<m-1; ii+=2)
1584 {
1585 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1586 c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1587 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1588 c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1589 for(kk=0; kk<jj; kk++)
1590 {
1591 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1592 c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
1593 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1594 c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
1595 }
1596 pD[ii+0+ldd*(jj+0)] = c_00;
1597 pD[ii+1+ldd*(jj+0)] = c_10;
1598 c_01 -= c_00 * f_10;
1599 c_11 -= c_10 * f_10;
1600 pD[ii+0+ldd*(jj+1)] = c_01;
1601 pD[ii+1+ldd*(jj+1)] = c_11;
1602 }
1603 for(; ii<m; ii++)
1604 {
1605 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1606 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1607 for(kk=0; kk<jj; kk++)
1608 {
1609 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
1610 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
1611 }
1612 pD[ii+0+ldd*(jj+0)] = c_00;
1613 c_01 -= c_00 * f_10;
1614 pD[ii+0+ldd*(jj+1)] = c_01;
1615 }
1616 }
1617 for(; jj<n; jj++)
1618 {
1619 for(ii=0; ii<m; ii++)
1620 {
1621 c_00 = alpha * pB[ii+ldb*jj];
1622 for(kk=0; kk<jj; kk++)
1623 {
1624 c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
1625 }
1626 pD[ii+ldd*jj] = c_00;
1627 }
1628 }
1629 return;
1630 }
1631
1632
1633
1634 // dtrsm_runn
TRSM_RUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1635 void TRSM_RUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1636 {
1637 if(m<=0 | n<=0)
1638 return;
1639
1640 // invalidate stored inverse diagonal of result matrix
1641 sD->use_dA = 0;
1642
1643 int ii, jj, kk;
1644 int lda = sA->m;
1645 int ldb = sB->m;
1646 int ldd = sD->m;
1647 REAL *pA = sA->pA + ai + aj*lda;
1648 REAL *pB = sB->pA + bi + bj*ldb;
1649 REAL *pD = sD->pA + di + dj*ldd;
1650 REAL *dA = sA->dA;
1651 if(ai==0 & aj==0)
1652 {
1653 if(sA->use_dA<n)
1654 {
1655 for(ii=0; ii<n; ii++)
1656 dA[ii] = 1.0 / pA[ii+lda*ii];
1657 sA->use_dA = n;
1658 }
1659 }
1660 else
1661 {
1662 for(ii=0; ii<n; ii++)
1663 dA[ii] = 1.0 / pA[ii+lda*ii];
1664 sA->use_dA = 0; // nonzero offset makes diagonal dirty
1665 }
1666 REAL
1667 f_00_inv,
1668 f_10, f_11_inv,
1669 c_00, c_01,
1670 c_10, c_11;
1671 // solve
1672 jj = 0;
1673 for(; jj<n-1; jj+=2)
1674 {
1675 f_00_inv = dA[jj+0];
1676 f_10 = pA[jj+0+lda*(jj+1)];
1677 f_11_inv = dA[jj+1];
1678 ii = 0;
1679 for(; ii<m-1; ii+=2)
1680 {
1681 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1682 c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1683 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1684 c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1685 for(kk=0; kk<jj; kk++)
1686 {
1687 c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1688 c_10 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+0)];
1689 c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1690 c_11 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+1)];
1691 }
1692 c_00 *= f_00_inv;
1693 c_10 *= f_00_inv;
1694 pD[ii+0+ldd*(jj+0)] = c_00;
1695 pD[ii+1+ldd*(jj+0)] = c_10;
1696 c_01 -= c_00 * f_10;
1697 c_11 -= c_10 * f_10;
1698 c_01 *= f_11_inv;
1699 c_11 *= f_11_inv;
1700 pD[ii+0+ldd*(jj+1)] = c_01;
1701 pD[ii+1+ldd*(jj+1)] = c_11;
1702 }
1703 for(; ii<m; ii++)
1704 {
1705 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1706 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1707 for(kk=0; kk<jj; kk++)
1708 {
1709 c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1710 c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1711 }
1712 c_00 *= f_00_inv;
1713 pD[ii+0+ldd*(jj+0)] = c_00;
1714 c_01 -= c_00 * f_10;
1715 c_01 *= f_11_inv;
1716 pD[ii+0+ldd*(jj+1)] = c_01;
1717 }
1718 }
1719 for(; jj<n; jj++)
1720 {
1721 // factorize diagonal
1722 f_00_inv = dA[jj];
1723 for(ii=0; ii<m; ii++)
1724 {
1725 c_00 = alpha * pB[ii+ldb*jj];
1726 for(kk=0; kk<jj; kk++)
1727 {
1728 c_00 -= pD[ii+ldd*kk] * pA[kk+lda*jj];
1729 }
1730 c_00 *= f_00_inv;
1731 pD[ii+ldd*jj] = c_00;
1732 }
1733 }
1734 return;
1735 }
1736
1737
1738
1739 // dtrsm_runu
TRSM_RUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1740 void TRSM_RUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1741 {
1742 if(m<=0 | n<=0)
1743 return;
1744
1745 // invalidate stored inverse diagonal of result matrix
1746 sD->use_dA = 0;
1747
1748 int ii, jj, kk;
1749 int lda = sA->m;
1750 int ldb = sB->m;
1751 int ldd = sD->m;
1752 REAL *pA = sA->pA + ai + aj*lda;
1753 REAL *pB = sB->pA + bi + bj*ldb;
1754 REAL *pD = sD->pA + di + dj*ldd;
1755 REAL
1756 f_10,
1757 c_00, c_01,
1758 c_10, c_11;
1759 // solve
1760 jj = 0;
1761 for(; jj<n-1; jj+=2)
1762 {
1763 f_10 = pA[jj+0+lda*(jj+1)];
1764 ii = 0;
1765 for(; ii<m-1; ii+=2)
1766 {
1767 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1768 c_10 = alpha * pB[ii+1+ldb*(jj+0)];
1769 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1770 c_11 = alpha * pB[ii+1+ldb*(jj+1)];
1771 for(kk=0; kk<jj; kk++)
1772 {
1773 c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1774 c_10 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+0)];
1775 c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1776 c_11 -= pD[ii+1+ldd*kk] * pA[kk+lda*(jj+1)];
1777 }
1778 pD[ii+0+ldd*(jj+0)] = c_00;
1779 pD[ii+1+ldd*(jj+0)] = c_10;
1780 c_01 -= c_00 * f_10;
1781 c_11 -= c_10 * f_10;
1782 pD[ii+0+ldd*(jj+1)] = c_01;
1783 pD[ii+1+ldd*(jj+1)] = c_11;
1784 }
1785 for(; ii<m; ii++)
1786 {
1787 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
1788 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
1789 for(kk=0; kk<jj; kk++)
1790 {
1791 c_00 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+0)];
1792 c_01 -= pD[ii+0+ldd*kk] * pA[kk+lda*(jj+1)];
1793 }
1794 pD[ii+0+ldd*(jj+0)] = c_00;
1795 c_01 -= c_00 * f_10;
1796 pD[ii+0+ldd*(jj+1)] = c_01;
1797 }
1798 }
1799 for(; jj<n; jj++)
1800 {
1801 // factorize diagonal
1802 for(ii=0; ii<m; ii++)
1803 {
1804 c_00 = alpha * pB[ii+ldb*jj];
1805 for(kk=0; kk<jj; kk++)
1806 {
1807 c_00 -= pD[ii+ldd*kk] * pA[kk+lda*jj];
1808 }
1809 pD[ii+ldd*jj] = c_00;
1810 }
1811 }
1812 return;
1813 }
1814
1815
1816
1817 // dtrsm_right_upper_transposed_notunit
TRSM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1818 void TRSM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1819 {
1820 if(m<=0 | n<=0)
1821 return;
1822
1823 // invalidate stored inverse diagonal of result matrix
1824 sD->use_dA = 0;
1825
1826 char cl = 'l';
1827 char cn = 'n';
1828 char cr = 'r';
1829 char ct = 't';
1830 char cu = 'u';
1831 int i1 = 1;
1832 int ii, jj, kk, id;
1833 int lda = sA->m;
1834 int ldb = sB->m;
1835 int ldd = sD->m;
1836 REAL
1837 d_00, d_01,
1838 d_10, d_11;
1839 REAL *pA = sA->pA+ai+aj*lda;
1840 REAL *pB = sB->pA+bi+bj*ldb;
1841 REAL *pD = sD->pA+di+dj*ldd;
1842 REAL *dA = sA->dA;
1843 if(ai==0 & aj==0)
1844 {
1845 if (sA->use_dA<n)
1846 {
1847 // invert diagonal of pA
1848 for(ii=0; ii<n; ii++)
1849 dA[ii] = 1.0/pA[ii+lda*ii];
1850 // use only now
1851 sA->use_dA = n;
1852 }
1853 }
1854 else
1855 {
1856 for(ii=0; ii<n; ii++)
1857 dA[ii] = 1.0 / pA[ii+lda*ii];
1858 sA->use_dA = 0; // nonzero offset makes diagonal dirty
1859 }
1860 // solve
1861 jj = 0;
1862 for(; jj<n-1; jj+=2)
1863 {
1864 ii = 0;
1865 id = n-jj-2;
1866 for(; ii<m-1; ii+=2)
1867 {
1868 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1869 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1870 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1871 d_11 = alpha * pB[ii+1+ldb*(id+1)];
1872 kk = id+2;
1873 for(; kk<n; kk++)
1874 {
1875 d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1876 d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1877 d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1878 d_11 -= pA[id+1+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1879 }
1880 d_01 *= dA[id+1];
1881 d_11 *= dA[id+1];
1882 d_00 -= pA[id+0+lda*(id+1)] * d_01;
1883 d_10 -= pA[id+0+lda*(id+1)] * d_11;
1884 d_00 *= dA[id+0];
1885 d_10 *= dA[id+0];
1886 pD[ii+0+ldd*(id+0)] = d_00;
1887 pD[ii+1+ldd*(id+0)] = d_10;
1888 pD[ii+0+ldd*(id+1)] = d_01;
1889 pD[ii+1+ldd*(id+1)] = d_11;
1890 }
1891 for(; ii<m; ii++)
1892 {
1893 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1894 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1895 kk = id+2;
1896 for(; kk<n; kk++)
1897 {
1898 d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1899 d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1900 }
1901 d_01 *= dA[id+1];
1902 d_00 -= pA[id+0+lda*(id+1)] * d_01;
1903 d_00 *= dA[id+0];
1904 pD[ii+0+ldd*(id+0)] = d_00;
1905 pD[ii+0+ldd*(id+1)] = d_01;
1906 }
1907 }
1908 for(; jj<n; jj++)
1909 {
1910 ii = 0;
1911 id = n-jj-1;
1912 for(; ii<m-1; ii+=2)
1913 {
1914 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1915 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1916 kk = id+1;
1917 for(; kk<n; kk++)
1918 {
1919 d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1920 d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1921 }
1922 d_00 *= dA[id+0];
1923 d_10 *= dA[id+0];
1924 pD[ii+0+ldd*(id+0)] = d_00;
1925 pD[ii+1+ldd*(id+0)] = d_10;
1926 }
1927 for(; ii<m; ii++)
1928 {
1929 d_00 = alpha * pB[ii+ldb*(id)];
1930 kk = id+1;
1931 for(; kk<n; kk++)
1932 d_00 -= pA[id+lda*(kk)] * pD[ii+ldd*(kk)];
1933 pD[ii+ldd*(id)] = d_00 * dA[id];
1934 }
1935 }
1936 return;
1937 }
1938
1939
1940
1941 // dtrsm_rutu
TRSM_RUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)1942 void TRSM_RUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
1943 {
1944 if(m<=0 | n<=0)
1945 return;
1946
1947 // invalidate stored inverse diagonal of result matrix
1948 sD->use_dA = 0;
1949
1950 char cl = 'l';
1951 char cn = 'n';
1952 char cr = 'r';
1953 char ct = 't';
1954 char cu = 'u';
1955 int i1 = 1;
1956 int ii, jj, kk, id;
1957 int lda = sA->m;
1958 int ldb = sB->m;
1959 int ldd = sD->m;
1960 REAL
1961 d_00, d_01,
1962 d_10, d_11;
1963 REAL *pA = sA->pA+ai+aj*lda;
1964 REAL *pB = sB->pA+bi+bj*ldb;
1965 REAL *pD = sD->pA+di+dj*ldd;
1966 // solve
1967 jj = 0;
1968 for(; jj<n-1; jj+=2)
1969 {
1970 ii = 0;
1971 id = n-jj-2;
1972 for(; ii<m-1; ii+=2)
1973 {
1974 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1975 d_10 = alpha * pB[ii+1+ldb*(id+0)];
1976 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1977 d_11 = alpha * pB[ii+1+ldb*(id+1)];
1978 kk = id+2;
1979 for(; kk<n; kk++)
1980 {
1981 d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1982 d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1983 d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
1984 d_11 -= pA[id+1+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
1985 }
1986 d_00 -= pA[id+0+lda*(id+1)] * d_01;
1987 d_10 -= pA[id+0+lda*(id+1)] * d_11;
1988 pD[ii+0+ldd*(id+0)] = d_00;
1989 pD[ii+1+ldd*(id+0)] = d_10;
1990 pD[ii+0+ldd*(id+1)] = d_01;
1991 pD[ii+1+ldd*(id+1)] = d_11;
1992 }
1993 for(; ii<m; ii++)
1994 {
1995 d_00 = alpha * pB[ii+0+ldb*(id+0)];
1996 d_01 = alpha * pB[ii+0+ldb*(id+1)];
1997 kk = id+2;
1998 for(; kk<n; kk++)
1999 {
2000 d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
2001 d_01 -= pA[id+1+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
2002 }
2003 d_00 -= pA[id+0+lda*(id+1)] * d_01;
2004 pD[ii+0+ldd*(id+0)] = d_00;
2005 pD[ii+0+ldd*(id+1)] = d_01;
2006 }
2007 }
2008 for(; jj<n; jj++)
2009 {
2010 ii = 0;
2011 id = n-jj-1;
2012 for(; ii<m-1; ii+=2)
2013 {
2014 d_00 = alpha * pB[ii+0+ldb*(id+0)];
2015 d_10 = alpha * pB[ii+1+ldb*(id+0)];
2016 kk = id+1;
2017 for(; kk<n; kk++)
2018 {
2019 d_00 -= pA[id+0+lda*(kk+0)] * pD[ii+0+ldd*(kk+0)];
2020 d_10 -= pA[id+0+lda*(kk+0)] * pD[ii+1+ldd*(kk+0)];
2021 }
2022 pD[ii+0+ldd*(id+0)] = d_00;
2023 pD[ii+1+ldd*(id+0)] = d_10;
2024 }
2025 for(; ii<m; ii++)
2026 {
2027 d_00 = alpha * pB[ii+ldb*(id)];
2028 kk = id+1;
2029 for(; kk<n; kk++)
2030 d_00 -= pA[id+lda*(kk)] * pD[ii+ldd*(kk)];
2031 }
2032 }
2033 return;
2034 }
2035
2036
2037
2038 // dtrmm_right_upper_transposed_notunit (A triangular !!!)
TRMM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2039 void TRMM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2040 {
2041 if(m<=0 | n<=0)
2042 return;
2043
2044 // invalidate stored inverse diagonal of result matrix
2045 sD->use_dA = 0;
2046
2047 int ii, jj, kk;
2048 REAL
2049 c_00, c_01,
2050 c_10, c_11;
2051 int lda = sA->m;
2052 int ldb = sB->m;
2053 int ldd = sD->m;
2054 REAL *pA = sA->pA + ai + aj*lda;
2055 REAL *pB = sB->pA + bi + bj*ldb;
2056 REAL *pD = sD->pA + di + dj*ldd;
2057 jj = 0;
2058 for(; jj<n-1; jj+=2)
2059 {
2060 ii = 0;
2061 for(; ii<m-1; ii+=2)
2062 {
2063 c_00 = 0.0;
2064 c_10 = 0.0;
2065 c_01 = 0.0;
2066 c_11 = 0.0;
2067 kk = jj;
2068 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2069 c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
2070 kk++;
2071 for(; kk<n; kk++)
2072 {
2073 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2074 c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
2075 c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
2076 c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
2077 }
2078 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2079 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2080 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2081 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
2082 }
2083 for(; ii<m; ii++)
2084 {
2085 c_00 = 0.0;
2086 c_01 = 0.0;
2087 kk = jj;
2088 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2089 kk++;
2090 for(; kk<n; kk++)
2091 {
2092 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2093 c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
2094 }
2095 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2096 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2097 }
2098 }
2099 for(; jj<n; jj++)
2100 {
2101 ii = 0;
2102 for(; ii<m-1; ii+=2)
2103 {
2104 c_00 = 0.0;
2105 c_10 = 0.0;
2106 for(kk=jj; kk<n; kk++)
2107 {
2108 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2109 c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
2110 }
2111 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2112 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2113 }
2114 for(; ii<m; ii++)
2115 {
2116 c_00 = 0.0;
2117 for(kk=jj; kk<n; kk++)
2118 {
2119 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
2120 }
2121 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2122 }
2123 }
2124 return;
2125 }
2126
2127
2128
2129 // dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
TRMM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2130 void TRMM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2131 {
2132 if(m<=0 | n<=0)
2133 return;
2134
2135 // invalidate stored inverse diagonal of result matrix
2136 sD->use_dA = 0;
2137
2138 int ii, jj, kk;
2139 REAL
2140 c_00, c_01,
2141 c_10, c_11;
2142 int lda = sA->m;
2143 int ldb = sB->m;
2144 int ldd = sD->m;
2145 REAL *pA = sA->pA + ai + aj*lda;
2146 REAL *pB = sB->pA + bi + bj*ldb;
2147 REAL *pD = sD->pA + di + dj*ldd;
2148 jj = 0;
2149 for(; jj<n-1; jj+=2)
2150 {
2151 ii = 0;
2152 for(; ii<m-1; ii+=2)
2153 {
2154 c_00 = 0.0; ;
2155 c_10 = 0.0; ;
2156 c_01 = 0.0; ;
2157 c_11 = 0.0; ;
2158 kk = jj;
2159 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2160 c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
2161 kk++;
2162 for(; kk<n; kk++)
2163 {
2164 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2165 c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
2166 c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
2167 c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
2168 }
2169 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2170 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2171 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2172 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
2173 }
2174 for(; ii<m; ii++)
2175 {
2176 c_00 = 0.0; ;
2177 c_01 = 0.0; ;
2178 kk = jj;
2179 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2180 kk++;
2181 for(; kk<n; kk++)
2182 {
2183 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2184 c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
2185 }
2186 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2187 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
2188 }
2189 }
2190 for(; jj<n; jj++)
2191 {
2192 ii = 0;
2193 for(; ii<m-1; ii+=2)
2194 {
2195 c_00 = 0.0; ;
2196 c_10 = 0.0; ;
2197 for(kk=jj; kk<n; kk++)
2198 {
2199 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2200 c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
2201 }
2202 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2203 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
2204 }
2205 for(; ii<m; ii++)
2206 {
2207 c_00 = 0.0; ;
2208 for(kk=jj; kk<n; kk++)
2209 {
2210 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
2211 }
2212 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
2213 }
2214 }
2215 return;
2216 }
2217
2218
2219
2220 // dsyrk_lower not-transposed
SYRK_LN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2221 void SYRK_LN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2222 {
2223 if(m<=0)
2224 return;
2225
2226 // invalidate stored inverse diagonal of result matrix
2227 sD->use_dA = 0;
2228
2229 int ii, jj, kk;
2230 REAL
2231 c_00, c_01,
2232 c_10, c_11;
2233 int lda = sA->m;
2234 int ldb = sB->m;
2235 int ldc = sC->m;
2236 int ldd = sD->m;
2237 REAL *pA = sA->pA + ai + aj*lda;
2238 REAL *pB = sB->pA + bi + bj*ldb;
2239 REAL *pC = sC->pA + ci + cj*ldc;
2240 REAL *pD = sD->pA + di + dj*ldd;
2241 jj = 0;
2242 for(; jj<m-1; jj+=2)
2243 {
2244 // diagonal
2245 c_00 = 0.0;
2246 c_10 = 0.0;
2247 c_11 = 0.0;
2248 for(kk=0; kk<k; kk++)
2249 {
2250 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2251 c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
2252 c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
2253 }
2254 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2255 pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
2256 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2257 // lower
2258 ii = jj+2;
2259 for(; ii<m-1; ii+=2)
2260 {
2261 c_00 = 0.0;
2262 c_10 = 0.0;
2263 c_01 = 0.0;
2264 c_11 = 0.0;
2265 for(kk=0; kk<k; kk++)
2266 {
2267 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2268 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2269 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2270 c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
2271 }
2272 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2273 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2274 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2275 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2276 }
2277 for(; ii<m; ii++)
2278 {
2279 c_00 = 0.0;
2280 c_01 = 0.0;
2281 for(kk=0; kk<k; kk++)
2282 {
2283 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2284 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2285 }
2286 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2287 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2288 }
2289 }
2290 if(jj<m)
2291 {
2292 // diagonal
2293 c_00 = 0.0;
2294 for(kk=0; kk<k; kk++)
2295 {
2296 c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
2297 }
2298 pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
2299 }
2300 return;
2301 }
2302
2303
2304
2305 // dsyrk_lower not-transposed
SYRK_LN_MN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2306 void SYRK_LN_MN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2307 {
2308 if(m<=0 | n<=0)
2309 return;
2310
2311 // invalidate stored inverse diagonal of result matrix
2312 sD->use_dA = 0;
2313
2314 int ii, jj, kk;
2315 REAL
2316 c_00, c_01,
2317 c_10, c_11;
2318 int lda = sA->m;
2319 int ldb = sB->m;
2320 int ldc = sC->m;
2321 int ldd = sD->m;
2322 REAL *pA = sA->pA + ai + aj*lda;
2323 REAL *pB = sB->pA + bi + bj*ldb;
2324 REAL *pC = sC->pA + ci + cj*ldc;
2325 REAL *pD = sD->pA + di + dj*ldd;
2326 jj = 0;
2327 for(; jj<n-1; jj+=2)
2328 {
2329 // diagonal
2330 c_00 = 0.0;
2331 c_10 = 0.0;
2332 c_11 = 0.0;
2333 for(kk=0; kk<k; kk++)
2334 {
2335 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2336 c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
2337 c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
2338 }
2339 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2340 pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
2341 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2342 // lower
2343 ii = jj+2;
2344 for(; ii<m-1; ii+=2)
2345 {
2346 c_00 = 0.0;
2347 c_10 = 0.0;
2348 c_01 = 0.0;
2349 c_11 = 0.0;
2350 for(kk=0; kk<k; kk++)
2351 {
2352 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2353 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2354 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2355 c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
2356 }
2357 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2358 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2359 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2360 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2361 }
2362 for(; ii<m; ii++)
2363 {
2364 c_00 = 0.0;
2365 c_01 = 0.0;
2366 for(kk=0; kk<k; kk++)
2367 {
2368 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2369 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2370 }
2371 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2372 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2373 }
2374 }
2375 for(; jj<n; jj++)
2376 {
2377 // diagonal
2378 c_00 = 0.0;
2379 for(kk=0; kk<k; kk++)
2380 {
2381 c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
2382 }
2383 pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
2384 // lower
2385 for(ii=jj+1; ii<m; ii++)
2386 {
2387 c_00 = 0.0;
2388 for(kk=0; kk<k; kk++)
2389 {
2390 c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
2391 }
2392 pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
2393 }
2394 }
2395 return;
2396 }
2397
2398
2399
2400 // dsyrk_lower transposed
SYRK_LT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2401 void SYRK_LT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2402 {
2403 if(m<=0)
2404 return;
2405
2406 // invalidate stored inverse diagonal of result matrix
2407 sD->use_dA = 0;
2408
2409 int ii, jj, kk;
2410 REAL
2411 c_00, c_01,
2412 c_10, c_11;
2413 int lda = sA->m;
2414 int ldb = sB->m;
2415 int ldc = sC->m;
2416 int ldd = sD->m;
2417 REAL *pA = sA->pA + ai + aj*lda;
2418 REAL *pB = sB->pA + bi + bj*ldb;
2419 REAL *pC = sC->pA + ci + cj*ldc;
2420 REAL *pD = sD->pA + di + dj*ldd;
2421 jj = 0;
2422 for(; jj<m-1; jj+=2)
2423 {
2424 // diagonal
2425 c_00 = 0.0;
2426 c_10 = 0.0;
2427 c_11 = 0.0;
2428 for(kk=0; kk<k; kk++)
2429 {
2430 c_00 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+0)];
2431 c_10 += pA[kk+lda*(jj+1)] * pB[kk+ldb*(jj+0)];
2432 c_11 += pA[kk+lda*(jj+1)] * pB[kk+ldb*(jj+1)];
2433 }
2434 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2435 pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
2436 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2437 // lower
2438 ii = jj+2;
2439 for(; ii<m-1; ii+=2)
2440 {
2441 c_00 = 0.0;
2442 c_10 = 0.0;
2443 c_01 = 0.0;
2444 c_11 = 0.0;
2445 for(kk=0; kk<k; kk++)
2446 {
2447 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2448 c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
2449 c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
2450 c_11 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+1)];
2451 }
2452 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2453 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2454 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2455 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2456 }
2457 for(; ii<m; ii++)
2458 {
2459 c_00 = 0.0;
2460 c_01 = 0.0;
2461 for(kk=0; kk<k; kk++)
2462 {
2463 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2464 c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
2465 }
2466 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2467 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2468 }
2469 }
2470 if(jj<m)
2471 {
2472 // diagonal
2473 c_00 = 0.0;
2474 for(kk=0; kk<k; kk++)
2475 {
2476 c_00 += pA[kk+lda*jj] * pB[kk+ldb*jj];
2477 }
2478 pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
2479 }
2480 return;
2481 }
2482
2483
2484
2485 // dsyrk_upper not-transposed
SYRK_UN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2486 void SYRK_UN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2487 {
2488 if(m<=0)
2489 return;
2490
2491 // invalidate stored inverse diagonal of result matrix
2492 sD->use_dA = 0;
2493
2494 int ii, jj, kk;
2495 REAL
2496 c_00, c_01,
2497 c_10, c_11;
2498 int lda = sA->m;
2499 int ldb = sB->m;
2500 int ldc = sC->m;
2501 int ldd = sD->m;
2502 REAL *pA = sA->pA + ai + aj*lda;
2503 REAL *pB = sB->pA + bi + bj*ldb;
2504 REAL *pC = sC->pA + ci + cj*ldc;
2505 REAL *pD = sD->pA + di + dj*ldd;
2506 jj = 0;
2507 for(; jj<m-1; jj+=2)
2508 {
2509 // upper
2510 ii = 0;
2511 for(; ii<jj; ii+=2)
2512 {
2513 c_00 = 0.0;
2514 c_10 = 0.0;
2515 c_01 = 0.0;
2516 c_11 = 0.0;
2517 for(kk=0; kk<k; kk++)
2518 {
2519 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2520 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2521 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
2522 c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
2523 }
2524 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2525 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2526 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2527 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2528 }
2529 // diagonal
2530 c_00 = 0.0;
2531 c_01 = 0.0;
2532 c_11 = 0.0;
2533 for(kk=0; kk<k; kk++)
2534 {
2535 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2536 c_01 += pA[jj+0+lda*kk] * pB[jj+1+ldb*kk];
2537 c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
2538 }
2539 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2540 pD[jj+0+ldd*(jj+1)] = beta * pC[jj+0+ldc*(jj+1)] + alpha * c_01;
2541 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2542 }
2543 if(jj<m)
2544 {
2545 // upper
2546 ii = 0;
2547 for(; ii<jj; ii+=2)
2548 {
2549 c_00 = 0.0;
2550 c_10 = 0.0;
2551 for(kk=0; kk<k; kk++)
2552 {
2553 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
2554 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
2555 }
2556 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2557 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2558 }
2559 // diagonal
2560 c_00 = 0.0;
2561 for(kk=0; kk<k; kk++)
2562 {
2563 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
2564 }
2565 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2566 }
2567 return;
2568 }
2569
2570
2571
2572 // dsyrk_upper transposed
SYRK_UT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2573 void SYRK_UT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2574 {
2575 if(m<=0)
2576 return;
2577
2578 // invalidate stored inverse diagonal of result matrix
2579 sD->use_dA = 0;
2580
2581 int ii, jj, kk;
2582 REAL
2583 c_00, c_01,
2584 c_10, c_11;
2585 int lda = sA->m;
2586 int ldb = sB->m;
2587 int ldc = sC->m;
2588 int ldd = sD->m;
2589 REAL *pA = sA->pA + ai + aj*lda;
2590 REAL *pB = sB->pA + bi + bj*ldb;
2591 REAL *pC = sC->pA + ci + cj*ldc;
2592 REAL *pD = sD->pA + di + dj*ldd;
2593 jj = 0;
2594 for(; jj<m-1; jj+=2)
2595 {
2596 // upper
2597 ii = 0;
2598 for(; ii<jj; ii+=2)
2599 {
2600 c_00 = 0.0;
2601 c_10 = 0.0;
2602 c_01 = 0.0;
2603 c_11 = 0.0;
2604 for(kk=0; kk<k; kk++)
2605 {
2606 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2607 c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
2608 c_01 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+1)];
2609 c_11 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+1)];
2610 }
2611 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2612 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2613 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
2614 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
2615 }
2616 // diagonal
2617 c_00 = 0.0;
2618 c_01 = 0.0;
2619 c_11 = 0.0;
2620 for(kk=0; kk<k; kk++)
2621 {
2622 c_00 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+0)];
2623 c_01 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+1)];
2624 c_11 += pA[kk+lda*(jj+1)] * pB[kk+ldb*(jj+1)];
2625 }
2626 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2627 pD[jj+0+ldd*(jj+1)] = beta * pC[jj+0+ldc*(jj+1)] + alpha * c_01;
2628 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
2629 }
2630 if(jj<m)
2631 {
2632 // upper
2633 ii = 0;
2634 for(; ii<jj; ii+=2)
2635 {
2636 c_00 = 0.0;
2637 c_10 = 0.0;
2638 for(kk=0; kk<k; kk++)
2639 {
2640 c_00 += pA[kk+lda*(ii+0)] * pB[kk+ldb*(jj+0)];
2641 c_10 += pA[kk+lda*(ii+1)] * pB[kk+ldb*(jj+0)];
2642 }
2643 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
2644 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
2645 }
2646 // diagonal
2647 c_00 = 0.0;
2648 for(kk=0; kk<k; kk++)
2649 {
2650 c_00 += pA[kk+lda*(jj+0)] * pB[kk+ldb*(jj+0)];
2651 }
2652 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
2653 }
2654 return;
2655 }
2656
2657
2658
2659 #elif defined(LA_EXTERNAL_BLAS_WRAPPER)
2660
2661
2662
2663 // dgemm nn
GEMM_NN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2664 void GEMM_NN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2665 {
2666
2667 // invalidate stored inverse diagonal of result matrix
2668 sD->use_dA = 0;
2669
2670 int jj;
2671 char cn = 'n';
2672 REAL *pA = sA->pA+ai+aj*sA->m;
2673 REAL *pB = sB->pA+bi+bj*sB->m;
2674 REAL *pC = sC->pA+ci+cj*sC->m;
2675 REAL *pD = sD->pA+di+dj*sD->m;
2676 int i1 = 1;
2677 int lda = sA->m;
2678 int ldb = sB->m;
2679 int ldc = sC->m;
2680 int ldd = sD->m;
2681 if(!(beta==0.0 || pC==pD))
2682 {
2683 for(jj=0; jj<n; jj++)
2684 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2685 }
2686 GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2687 return;
2688 }
2689
2690
2691
2692 // dgemm nt
GEMM_NT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2693 void GEMM_NT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2694 {
2695
2696 // invalidate stored inverse diagonal of result matrix
2697 sD->use_dA = 0;
2698
2699 int jj;
2700 char cn = 'n';
2701 char ct = 't';
2702 REAL *pA = sA->pA+ai+aj*sA->m;
2703 REAL *pB = sB->pA+bi+bj*sB->m;
2704 REAL *pC = sC->pA+ci+cj*sC->m;
2705 REAL *pD = sD->pA+di+dj*sD->m;
2706 int i1 = 1;
2707 int lda = sA->m;
2708 int ldb = sB->m;
2709 int ldc = sC->m;
2710 int ldd = sD->m;
2711 if(!(beta==0.0 || pC==pD))
2712 {
2713 for(jj=0; jj<n; jj++)
2714 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2715 }
2716 GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2717 return;
2718 }
2719
2720
2721
2722 // dgemm tn
GEMM_TN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2723 void GEMM_TN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2724 {
2725
2726 // invalidate stored inverse diagonal of result matrix
2727 sD->use_dA = 0;
2728
2729 int jj;
2730 char cn = 'n';
2731 char ct = 't';
2732 REAL *pA = sA->pA+ai+aj*sA->m;
2733 REAL *pB = sB->pA+bi+bj*sB->m;
2734 REAL *pC = sC->pA+ci+cj*sC->m;
2735 REAL *pD = sD->pA+di+dj*sD->m;
2736 int i1 = 1;
2737 int lda = sA->m;
2738 int ldb = sB->m;
2739 int ldc = sC->m;
2740 int ldd = sD->m;
2741 if(!(beta==0.0 || pC==pD))
2742 {
2743 for(jj=0; jj<n; jj++)
2744 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2745 }
2746 GEMM(&ct, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2747 return;
2748 }
2749
2750
2751
2752 // dgemm tt
GEMM_TT(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)2753 void GEMM_TT(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
2754 {
2755
2756 // invalidate stored inverse diagonal of result matrix
2757 sD->use_dA = 0;
2758
2759 int jj;
2760 char cn = 'n';
2761 char ct = 't';
2762 REAL *pA = sA->pA+ai+aj*sA->m;
2763 REAL *pB = sB->pA+bi+bj*sB->m;
2764 REAL *pC = sC->pA+ci+cj*sC->m;
2765 REAL *pD = sD->pA+di+dj*sD->m;
2766 int i1 = 1;
2767 int lda = sA->m;
2768 int ldb = sB->m;
2769 int ldc = sC->m;
2770 int ldd = sD->m;
2771 if(!(beta==0.0 || pC==pD))
2772 {
2773 for(jj=0; jj<n; jj++)
2774 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2775 }
2776 GEMM(&ct, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
2777 return;
2778 }
2779
2780
2781
2782 // dtrsm_left_lower_nottransposed_notunit
TRSM_LLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2783 void TRSM_LLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2784 {
2785
2786 // invalidate stored inverse diagonal of result matrix
2787 sD->use_dA = 0;
2788
2789 int jj;
2790 char cl = 'l';
2791 char cn = 'n';
2792 REAL *pA = sA->pA+ai+aj*sA->m;
2793 REAL *pB = sB->pA+bi+bj*sB->m;
2794 REAL *pD = sD->pA+di+dj*sD->m;
2795 int i1 = 1;
2796 int lda = sA->m;
2797 int ldb = sB->m;
2798 int ldd = sD->m;
2799 if(!(pB==pD))
2800 {
2801 for(jj=0; jj<n; jj++)
2802 COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2803 }
2804 TRSM(&cl, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2805 return;
2806 }
2807
2808
2809
2810 // dtrsm_left_lower_nottransposed_unit
TRSM_LLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2811 void TRSM_LLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2812 {
2813
2814 // invalidate stored inverse diagonal of result matrix
2815 sD->use_dA = 0;
2816
2817 int jj;
2818 char cl = 'l';
2819 char cn = 'n';
2820 char cu = 'u';
2821 REAL *pA = sA->pA+ai+aj*sA->m;
2822 REAL *pB = sB->pA+bi+bj*sB->m;
2823 REAL *pD = sD->pA+di+dj*sD->m;
2824 int i1 = 1;
2825 int lda = sA->m;
2826 int ldb = sB->m;
2827 int ldd = sD->m;
2828 if(!(pB==pD))
2829 {
2830 for(jj=0; jj<n; jj++)
2831 COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2832 }
2833 TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
2834 return;
2835 }
2836
2837
2838
2839 // dtrsm_left_lower_transposed_notunit
TRSM_LLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2840 void TRSM_LLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2841 {
2842
2843 // invalidate stored inverse diagonal of result matrix
2844 sD->use_dA = 0;
2845
2846 int jj;
2847 char cl = 'l';
2848 char cn = 'n';
2849 char ct = 't';
2850 REAL *pA = sA->pA+ai+aj*sA->m;
2851 REAL *pB = sB->pA+bi+bj*sB->m;
2852 REAL *pD = sD->pA+di+dj*sD->m;
2853 int i1 = 1;
2854 int lda = sA->m;
2855 int ldb = sB->m;
2856 int ldd = sD->m;
2857 if(!(pB==pD))
2858 {
2859 for(jj=0; jj<n; jj++)
2860 COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2861 }
2862 TRSM(&cl, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2863 return;
2864 }
2865
2866
2867
2868 // dtrsm_left_lower_transposed_unit
TRSM_LLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2869 void TRSM_LLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2870 {
2871
2872 // invalidate stored inverse diagonal of result matrix
2873 sD->use_dA = 0;
2874
2875 int jj;
2876 char cl = 'l';
2877 char cn = 'n';
2878 char ct = 't';
2879 char cu = 'u';
2880 REAL *pA = sA->pA+ai+aj*sA->m;
2881 REAL *pB = sB->pA+bi+bj*sB->m;
2882 REAL *pD = sD->pA+di+dj*sD->m;
2883 int i1 = 1;
2884 int lda = sA->m;
2885 int ldb = sB->m;
2886 int ldd = sD->m;
2887 if(!(pB==pD))
2888 {
2889 for(jj=0; jj<n; jj++)
2890 COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
2891 }
2892 TRSM(&cl, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
2893 return;
2894 }
2895
2896
2897
2898 // dtrsm_left_upper_nottransposed_notunit
TRSM_LUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2899 void TRSM_LUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2900 {
2901
2902 // invalidate stored inverse diagonal of result matrix
2903 sD->use_dA = 0;
2904
2905 int jj;
2906 char cl = 'l';
2907 char cn = 'n';
2908 char cu = 'u';
2909 REAL *pA = sA->pA+ai+aj*sA->m;
2910 REAL *pB = sB->pA+bi+bj*sB->m;
2911 REAL *pD = sD->pA+di+dj*sD->m;
2912 int i1 = 1;
2913 int lda = sA->m;
2914 int ldb = sB->m;
2915 int ldd = sD->m;
2916 if(!(pB==pD))
2917 {
2918 for(jj=0; jj<n; jj++)
2919 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
2920 }
2921 TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2922 return;
2923 }
2924
2925
2926
2927 // dtrsm_left_upper_nottransposed_unit
TRSM_LUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2928 void TRSM_LUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2929 {
2930
2931 // invalidate stored inverse diagonal of result matrix
2932 sD->use_dA = 0;
2933
2934 int jj;
2935 char cl = 'l';
2936 char cn = 'n';
2937 char cu = 'u';
2938 REAL *pA = sA->pA+ai+aj*sA->m;
2939 REAL *pB = sB->pA+bi+bj*sB->m;
2940 REAL *pD = sD->pA+di+dj*sD->m;
2941 int i1 = 1;
2942 int lda = sA->m;
2943 int ldb = sB->m;
2944 int ldd = sD->m;
2945 if(!(pB==pD))
2946 {
2947 for(jj=0; jj<n; jj++)
2948 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
2949 }
2950 TRSM(&cl, &cu, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
2951 return;
2952 }
2953
2954
2955
2956 // dtrsm_left_upper_transposed_notunit
TRSM_LUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2957 void TRSM_LUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2958 {
2959
2960 // invalidate stored inverse diagonal of result matrix
2961 sD->use_dA = 0;
2962
2963 int jj;
2964 char cl = 'l';
2965 char cn = 'n';
2966 char ct = 't';
2967 char cu = 'u';
2968 REAL *pA = sA->pA+ai+aj*sA->m;
2969 REAL *pB = sB->pA+bi+bj*sB->m;
2970 REAL *pD = sD->pA+di+dj*sD->m;
2971 int i1 = 1;
2972 int lda = sA->m;
2973 int ldb = sB->m;
2974 int ldd = sD->m;
2975 if(!(pB==pD))
2976 {
2977 for(jj=0; jj<n; jj++)
2978 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
2979 }
2980 TRSM(&cl, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
2981 return;
2982 }
2983
2984
2985
2986 // dtrsm_left_upper_transposed_unit
TRSM_LUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)2987 void TRSM_LUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
2988 {
2989
2990 // invalidate stored inverse diagonal of result matrix
2991 sD->use_dA = 0;
2992
2993 int jj;
2994 char cl = 'l';
2995 char cn = 'n';
2996 char ct = 't';
2997 char cu = 'u';
2998 REAL *pA = sA->pA+ai+aj*sA->m;
2999 REAL *pB = sB->pA+bi+bj*sB->m;
3000 REAL *pD = sD->pA+di+dj*sD->m;
3001 int i1 = 1;
3002 int lda = sA->m;
3003 int ldb = sB->m;
3004 int ldd = sD->m;
3005 if(!(pB==pD))
3006 {
3007 for(jj=0; jj<n; jj++)
3008 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3009 }
3010 TRSM(&cl, &cu, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3011 return;
3012 }
3013
3014
3015
3016 // dtrsm_right_lower_nottransposed_notunit
TRSM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3017 void TRSM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3018 {
3019
3020 // invalidate stored inverse diagonal of result matrix
3021 sD->use_dA = 0;
3022
3023 int jj;
3024 char cl = 'l';
3025 char cn = 'n';
3026 char cr = 'r';
3027 char ct = 't';
3028 char cu = 'u';
3029 REAL *pA = sA->pA+ai+aj*sA->m;
3030 REAL *pB = sB->pA+bi+bj*sB->m;
3031 REAL *pD = sD->pA+di+dj*sD->m;
3032 int i1 = 1;
3033 int lda = sA->m;
3034 int ldb = sB->m;
3035 int ldd = sD->m;
3036 if(!(pB==pD))
3037 {
3038 for(jj=0; jj<n; jj++)
3039 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3040 }
3041 TRSM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3042 return;
3043 }
3044
3045
3046
3047 // dtrsm_right_lower_nottransposed_unit
TRSM_RLNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3048 void TRSM_RLNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3049 {
3050
3051 // invalidate stored inverse diagonal of result matrix
3052 sD->use_dA = 0;
3053
3054 int jj;
3055 char cl = 'l';
3056 char cn = 'n';
3057 char cr = 'r';
3058 char ct = 't';
3059 char cu = 'u';
3060 REAL *pA = sA->pA+ai+aj*sA->m;
3061 REAL *pB = sB->pA+bi+bj*sB->m;
3062 REAL *pD = sD->pA+di+dj*sD->m;
3063 int i1 = 1;
3064 int lda = sA->m;
3065 int ldb = sB->m;
3066 int ldd = sD->m;
3067 if(!(pB==pD))
3068 {
3069 for(jj=0; jj<n; jj++)
3070 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3071 }
3072 TRSM(&cr, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3073 return;
3074 }
3075
3076
3077
3078 // dtrsm_right_lower_transposed_notunit
TRSM_RLTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3079 void TRSM_RLTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3080 {
3081
3082 // invalidate stored inverse diagonal of result matrix
3083 sD->use_dA = 0;
3084
3085 int jj;
3086 char cl = 'l';
3087 char cn = 'n';
3088 char cr = 'r';
3089 char ct = 't';
3090 char cu = 'u';
3091 REAL *pA = sA->pA+ai+aj*sA->m;
3092 REAL *pB = sB->pA+bi+bj*sB->m;
3093 REAL *pD = sD->pA+di+dj*sD->m;
3094 int i1 = 1;
3095 int lda = sA->m;
3096 int ldb = sB->m;
3097 int ldd = sD->m;
3098 if(!(pB==pD))
3099 {
3100 for(jj=0; jj<n; jj++)
3101 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3102 }
3103 TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3104 return;
3105 }
3106
3107
3108
3109 // dtrsm_right_lower_transposed_unit
TRSM_RLTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3110 void TRSM_RLTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3111 {
3112
3113 // invalidate stored inverse diagonal of result matrix
3114 sD->use_dA = 0;
3115
3116 int jj;
3117 char cl = 'l';
3118 char cn = 'n';
3119 char cr = 'r';
3120 char ct = 't';
3121 char cu = 'u';
3122 REAL *pA = sA->pA+ai+aj*sA->m;
3123 REAL *pB = sB->pA+bi+bj*sB->m;
3124 REAL *pD = sD->pA+di+dj*sD->m;
3125 int i1 = 1;
3126 int lda = sA->m;
3127 int ldb = sB->m;
3128 int ldd = sD->m;
3129 if(!(pB==pD))
3130 {
3131 for(jj=0; jj<n; jj++)
3132 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3133 }
3134 TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3135 return;
3136 }
3137
3138
3139
3140 // dtrsm_right_upper_nottransposed_notunit
TRSM_RUNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3141 void TRSM_RUNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3142 {
3143
3144 // invalidate stored inverse diagonal of result matrix
3145 sD->use_dA = 0;
3146
3147 int jj;
3148 char cl = 'l';
3149 char cn = 'n';
3150 char cr = 'r';
3151 char ct = 't';
3152 char cu = 'u';
3153 REAL *pA = sA->pA+ai+aj*sA->m;
3154 REAL *pB = sB->pA+bi+bj*sB->m;
3155 REAL *pD = sD->pA+di+dj*sD->m;
3156 int i1 = 1;
3157 int lda = sA->m;
3158 int ldb = sB->m;
3159 int ldd = sD->m;
3160 if(!(pB==pD))
3161 {
3162 for(jj=0; jj<n; jj++)
3163 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3164 }
3165 TRSM(&cr, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3166 return;
3167 }
3168
3169
3170
3171 // dtrsm_right_upper_nottransposed_unit
TRSM_RUNU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3172 void TRSM_RUNU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3173 {
3174
3175 // invalidate stored inverse diagonal of result matrix
3176 sD->use_dA = 0;
3177
3178 int jj;
3179 char cl = 'l';
3180 char cn = 'n';
3181 char cr = 'r';
3182 char ct = 't';
3183 char cu = 'u';
3184 REAL *pA = sA->pA+ai+aj*sA->m;
3185 REAL *pB = sB->pA+bi+bj*sB->m;
3186 REAL *pD = sD->pA+di+dj*sD->m;
3187 int i1 = 1;
3188 int lda = sA->m;
3189 int ldb = sB->m;
3190 int ldd = sD->m;
3191 if(!(pB==pD))
3192 {
3193 for(jj=0; jj<n; jj++)
3194 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3195 }
3196 TRSM(&cr, &cu, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3197 return;
3198 }
3199
3200
3201
3202 // dtrsm_right_upper_transposed_notunit
TRSM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3203 void TRSM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3204 {
3205
3206 // invalidate stored inverse diagonal of result matrix
3207 sD->use_dA = 0;
3208
3209 int jj;
3210 char cl = 'l';
3211 char cn = 'n';
3212 char cr = 'r';
3213 char ct = 't';
3214 char cu = 'u';
3215 REAL *pA = sA->pA+ai+aj*sA->m;
3216 REAL *pB = sB->pA+bi+bj*sB->m;
3217 REAL *pD = sD->pA+di+dj*sD->m;
3218 int i1 = 1;
3219 int lda = sA->m;
3220 int ldb = sB->m;
3221 int ldd = sD->m;
3222 if(!(pB==pD))
3223 {
3224 for(jj=0; jj<n; jj++)
3225 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3226 }
3227 TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3228 return;
3229 }
3230
3231
3232
3233 // dtrsm_right_upper_transposed_unit
TRSM_RUTU(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3234 void TRSM_RUTU(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3235 {
3236
3237 // invalidate stored inverse diagonal of result matrix
3238 sD->use_dA = 0;
3239
3240 int jj;
3241 char cl = 'l';
3242 char cn = 'n';
3243 char cr = 'r';
3244 char ct = 't';
3245 char cu = 'u';
3246 REAL *pA = sA->pA+ai+aj*sA->m;
3247 REAL *pB = sB->pA+bi+bj*sB->m;
3248 REAL *pD = sD->pA+di+dj*sD->m;
3249 int i1 = 1;
3250 int lda = sA->m;
3251 int ldb = sB->m;
3252 int ldd = sD->m;
3253 if(!(pB==pD))
3254 {
3255 for(jj=0; jj<n; jj++)
3256 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3257 }
3258 TRSM(&cr, &cu, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
3259 return;
3260 }
3261
3262
3263
3264 // dtrmm_right_upper_transposed_notunit (A triangular !!!)
TRMM_RUTN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3265 void TRMM_RUTN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3266 {
3267
3268 // invalidate stored inverse diagonal of result matrix
3269 sD->use_dA = 0;
3270
3271 int jj;
3272 char cl = 'l';
3273 char cn = 'n';
3274 char cr = 'r';
3275 char ct = 't';
3276 char cu = 'u';
3277 REAL *pA = sA->pA+ai+aj*sA->m;
3278 REAL *pB = sB->pA+bi+bj*sB->m;
3279 REAL *pD = sD->pA+di+dj*sD->m;
3280 int i1 = 1;
3281 int lda = sA->m;
3282 int ldb = sB->m;
3283 int ldd = sD->m;
3284 if(!(pB==pD))
3285 {
3286 for(jj=0; jj<n; jj++)
3287 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3288 }
3289 TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3290 return;
3291 }
3292
3293
3294
3295 // dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
TRMM_RLNN(int m,int n,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,struct XMAT * sD,int di,int dj)3296 void TRMM_RLNN(int m, int n, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, struct XMAT *sD, int di, int dj)
3297 {
3298
3299 // invalidate stored inverse diagonal of result matrix
3300 sD->use_dA = 0;
3301
3302 int jj;
3303 char cl = 'l';
3304 char cn = 'n';
3305 char cr = 'r';
3306 char ct = 't';
3307 char cu = 'u';
3308 REAL *pA = sA->pA+ai+aj*sA->m;
3309 REAL *pB = sB->pA+bi+bj*sB->m;
3310 REAL *pD = sD->pA+di+dj*sD->m;
3311 int i1 = 1;
3312 int lda = sA->m;
3313 int ldb = sB->m;
3314 int ldd = sD->m;
3315 if(!(pB==pD))
3316 {
3317 for(jj=0; jj<n; jj++)
3318 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
3319 }
3320 TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
3321 return;
3322 }
3323
3324
3325
3326 // dsyrk lower not-transposed (allowing for different factors => use dgemm !!!)
SYRK_LN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3327 void SYRK_LN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3328 {
3329
3330 // invalidate stored inverse diagonal of result matrix
3331 sD->use_dA = 0;
3332
3333 int jj;
3334 char cl = 'l';
3335 char cn = 'n';
3336 char cr = 'r';
3337 char ct = 't';
3338 char cu = 'u';
3339 REAL *pA = sA->pA + ai + aj*sA->m;
3340 REAL *pB = sB->pA + bi + bj*sB->m;
3341 REAL *pC = sC->pA + ci + cj*sC->m;
3342 REAL *pD = sD->pA + di + dj*sD->m;
3343 int i1 = 1;
3344 int lda = sA->m;
3345 int ldb = sB->m;
3346 int ldc = sC->m;
3347 int ldd = sD->m;
3348 if(!(beta==0.0 || pC==pD))
3349 {
3350 for(jj=0; jj<m; jj++)
3351 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3352 }
3353 if(pA==pB)
3354 {
3355 SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3356 }
3357 else
3358 {
3359 GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3360 }
3361 return;
3362 }
3363
3364
3365
3366 // dsyrk lower not-transposed (allowing for different factors => use dgemm !!!)
SYRK_LN_MN(int m,int n,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3367 void SYRK_LN_MN(int m, int n, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3368 {
3369
3370 // invalidate stored inverse diagonal of result matrix
3371 sD->use_dA = 0;
3372
3373 int jj;
3374 char cl = 'l';
3375 char cn = 'n';
3376 char cr = 'r';
3377 char ct = 't';
3378 char cu = 'u';
3379 REAL *pA = sA->pA + ai + aj*sA->m;
3380 REAL *pB = sB->pA + bi + bj*sB->m;
3381 REAL *pC = sC->pA + ci + cj*sC->m;
3382 REAL *pD = sD->pA + di + dj*sD->m;
3383 int i1 = 1;
3384 int mmn = m-n;
3385 int lda = sA->m;
3386 int ldb = sB->m;
3387 int ldc = sC->m;
3388 int ldd = sD->m;
3389 if(!(beta==0.0 || pC==pD))
3390 {
3391 for(jj=0; jj<n; jj++)
3392 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3393 }
3394 if(pA==pB)
3395 {
3396 SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3397 GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
3398 }
3399 else
3400 {
3401 GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3402 }
3403 return;
3404 }
3405
3406
3407
3408 // dsyrk lower transposed (allowing for different factors => use dgemm !!!)
SYRK_LT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3409 void SYRK_LT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3410 {
3411
3412 // invalidate stored inverse diagonal of result matrix
3413 sD->use_dA = 0;
3414
3415 int jj;
3416 char cl = 'l';
3417 char cn = 'n';
3418 char cr = 'r';
3419 char ct = 't';
3420 char cu = 'u';
3421 REAL *pA = sA->pA + ai + aj*sA->m;
3422 REAL *pB = sB->pA + bi + bj*sB->m;
3423 REAL *pC = sC->pA + ci + cj*sC->m;
3424 REAL *pD = sD->pA + di + dj*sD->m;
3425 int i1 = 1;
3426 int lda = sA->m;
3427 int ldb = sB->m;
3428 int ldc = sC->m;
3429 int ldd = sD->m;
3430 if(!(beta==0.0 || pC==pD))
3431 {
3432 for(jj=0; jj<m; jj++)
3433 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3434 }
3435 if(pA==pB)
3436 {
3437 SYRK(&cl, &ct, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3438 }
3439 else
3440 {
3441 GEMM(&ct, &cn, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3442 }
3443 return;
3444 }
3445
3446
3447
3448 // dsyrk upper not-transposed (allowing for different factors => use dgemm !!!)
SYRK_UN(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3449 void SYRK_UN(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3450 {
3451
3452 // invalidate stored inverse diagonal of result matrix
3453 sD->use_dA = 0;
3454
3455 int jj;
3456 char cl = 'l';
3457 char cn = 'n';
3458 char cr = 'r';
3459 char ct = 't';
3460 char cu = 'u';
3461 REAL *pA = sA->pA + ai + aj*sA->m;
3462 REAL *pB = sB->pA + bi + bj*sB->m;
3463 REAL *pC = sC->pA + ci + cj*sC->m;
3464 REAL *pD = sD->pA + di + dj*sD->m;
3465 int i1 = 1;
3466 int lda = sA->m;
3467 int ldb = sB->m;
3468 int ldc = sC->m;
3469 int ldd = sD->m;
3470 if(!(beta==0.0 || pC==pD))
3471 {
3472 for(jj=0; jj<m; jj++)
3473 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3474 }
3475 if(pA==pB)
3476 {
3477 SYRK(&cu, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3478 }
3479 else
3480 {
3481 GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3482 }
3483 return;
3484 }
3485
3486
3487
3488 // dsyrk upper transposed (allowing for different factors => use dgemm !!!)
SYRK_UT(int m,int k,REAL alpha,struct XMAT * sA,int ai,int aj,struct XMAT * sB,int bi,int bj,REAL beta,struct XMAT * sC,int ci,int cj,struct XMAT * sD,int di,int dj)3489 void SYRK_UT(int m, int k, REAL alpha, struct XMAT *sA, int ai, int aj, struct XMAT *sB, int bi, int bj, REAL beta, struct XMAT *sC, int ci, int cj, struct XMAT *sD, int di, int dj)
3490 {
3491
3492 // invalidate stored inverse diagonal of result matrix
3493 sD->use_dA = 0;
3494
3495 int jj;
3496 char cl = 'l';
3497 char cn = 'n';
3498 char cr = 'r';
3499 char ct = 't';
3500 char cu = 'u';
3501 REAL *pA = sA->pA + ai + aj*sA->m;
3502 REAL *pB = sB->pA + bi + bj*sB->m;
3503 REAL *pC = sC->pA + ci + cj*sC->m;
3504 REAL *pD = sD->pA + di + dj*sD->m;
3505 int i1 = 1;
3506 int lda = sA->m;
3507 int ldb = sB->m;
3508 int ldc = sC->m;
3509 int ldd = sD->m;
3510 if(!(beta==0.0 || pC==pD))
3511 {
3512 for(jj=0; jj<m; jj++)
3513 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
3514 }
3515 if(pA==pB)
3516 {
3517 SYRK(&cu, &ct, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
3518 }
3519 else
3520 {
3521 GEMM(&ct, &cn, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
3522 }
3523 return;
3524 }
3525
3526
3527
3528 #else
3529
3530 #error : wrong LA choice
3531
3532 #endif
3533