1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36 #include <math.h>
37 #include <stdio.h>
38
39 #include "../../include/blasfeo_common.h"
40 #include "../../include/blasfeo_s_aux.h"
41
42
43
44 // C numbering, starting from 0
45 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
sidamax_lib4(int n,int offset,float * pA,int sda,int * p_idamax,float * p_amax)46 void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax)
47 {
48
49 int ii;
50 float tmp, amax;
51
52 int idamax = -1;
53
54 p_idamax[0] = idamax;
55 if(n<1)
56 return;
57
58 const int bs = 4;
59
60 int na = (bs - offset%bs)%bs;
61 na = n<na ? n : na;
62
63 amax = -1.0;
64 ii = 0;
65 if(na>0)
66 {
67 for( ; ii<na; ii++)
68 {
69 tmp = fabs(pA[0]);
70 if(tmp>amax)
71 {
72 idamax = ii+0;
73 amax = tmp;
74 }
75 pA += 1;
76 }
77 pA += bs*(sda-1);
78 }
79 for( ; ii<n-3; ii+=4)
80 {
81 tmp = fabs(pA[0]);
82 if(tmp>amax)
83 {
84 idamax = ii+0;
85 amax = tmp;
86 }
87 tmp = fabs(pA[1]);
88 if(tmp>amax)
89 {
90 idamax = ii+1;
91 amax = tmp;
92 }
93 tmp = fabs(pA[2]);
94 if(tmp>amax)
95 {
96 idamax = ii+2;
97 amax = tmp;
98 }
99 tmp = fabs(pA[3]);
100 if(tmp>amax)
101 {
102 idamax = ii+3;
103 amax = tmp;
104 }
105 pA += bs*sda;
106 }
107 for( ; ii<n; ii++)
108 {
109 tmp = fabs(pA[0]);
110 if(tmp>amax)
111 {
112 idamax = ii+0;
113 amax = tmp;
114 }
115 pA += 1;
116 }
117
118 p_amax[0] = amax;
119 p_idamax[0] = idamax;
120
121 return;
122
123 }
124 #endif
125
126
127
128 // C numering (starting from zero) in the ipiv
129 // it process m>=4 rows and 4 cols
130 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_sgetrf_pivot_4_lib4(int m,float * pA,int sda,float * inv_diag_A,int * ipiv)131 void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv)
132 {
133
134 const int bs = 4;
135
136 // assume m>=4
137 int ma = m-4;
138
139 float
140 tmp0, tmp1, tmp2, tmp3,
141 u_00, u_01, u_02, u_03,
142 u_11, u_12, u_13,
143 u_22, u_23,
144 u_33;
145
146 float
147 *pB;
148
149 int
150 k, idamax;
151
152 // first column
153 sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
154 ipiv[0] = idamax;
155 if(tmp0!=0.0)
156 {
157 if(ipiv[0]!=0)
158 srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
159
160 tmp0 = 1.0 / pA[0+bs*0];
161 inv_diag_A[0] = tmp0;
162 pA[1+bs*0] *= tmp0;
163 pA[2+bs*0] *= tmp0;
164 pA[3+bs*0] *= tmp0;
165 pB = pA + bs*sda;
166 for(k=0; k<ma-3; k+=4)
167 {
168 pB[0+bs*0] *= tmp0;
169 pB[1+bs*0] *= tmp0;
170 pB[2+bs*0] *= tmp0;
171 pB[3+bs*0] *= tmp0;
172 pB += bs*sda;
173 }
174 for( ; k<ma; k++)
175 {
176 pB[0+bs*0] *= tmp0;
177 pB += 1;
178 }
179 }
180 else
181 {
182 inv_diag_A[0] = 0.0;
183 }
184
185 // second column
186 u_01 = pA[0+bs*1];
187 tmp1 = pA[1+bs*1];
188 tmp2 = pA[2+bs*1];
189 tmp3 = pA[3+bs*1];
190 tmp1 -= pA[1+bs*0] * u_01;
191 tmp2 -= pA[2+bs*0] * u_01;
192 tmp3 -= pA[3+bs*0] * u_01;
193 pA[1+bs*1] = tmp1;
194 pA[2+bs*1] = tmp2;
195 pA[3+bs*1] = tmp3;
196 pB = pA + bs*sda;
197 for(k=0; k<ma-3; k+=4)
198 {
199 tmp0 = pB[0+bs*1];
200 tmp1 = pB[1+bs*1];
201 tmp2 = pB[2+bs*1];
202 tmp3 = pB[3+bs*1];
203 tmp0 -= pB[0+bs*0] * u_01;
204 tmp1 -= pB[1+bs*0] * u_01;
205 tmp2 -= pB[2+bs*0] * u_01;
206 tmp3 -= pB[3+bs*0] * u_01;
207 pB[0+bs*1] = tmp0;
208 pB[1+bs*1] = tmp1;
209 pB[2+bs*1] = tmp2;
210 pB[3+bs*1] = tmp3;
211 pB += bs*sda;
212 }
213 for( ; k<ma; k++)
214 {
215 tmp0 = pB[0+bs*1];
216 tmp0 -= pB[0+bs*0] * u_01;
217 pB[0+bs*1] = tmp0;
218 pB += 1;
219 }
220
221 sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
222 ipiv[1] = idamax+1;
223 if(tmp1!=0)
224 {
225 if(ipiv[1]!=1)
226 srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
227
228 tmp1 = 1.0 / pA[1+bs*1];
229 inv_diag_A[1] = tmp1;
230 pA[2+bs*1] *= tmp1;
231 pA[3+bs*1] *= tmp1;
232 pB = pA + bs*sda;
233 for(k=0; k<ma-3; k+=4)
234 {
235 pB[0+bs*1] *= tmp1;
236 pB[1+bs*1] *= tmp1;
237 pB[2+bs*1] *= tmp1;
238 pB[3+bs*1] *= tmp1;
239 pB += bs*sda;
240 }
241 for( ; k<ma; k++)
242 {
243 pB[0+bs*1] *= tmp1;
244 pB += 1;
245 }
246 }
247 else
248 {
249 inv_diag_A[1] = 0.0;
250 }
251
252 // third column
253 u_02 = pA[0+bs*2];
254 u_12 = pA[1+bs*2];
255 u_12 -= pA[1+bs*0] * u_02;
256 pA[1+bs*2] = u_12;
257 tmp2 = pA[2+bs*2];
258 tmp3 = pA[3+bs*2];
259 tmp2 -= pA[2+bs*0] * u_02;
260 tmp3 -= pA[3+bs*0] * u_02;
261 tmp2 -= pA[2+bs*1] * u_12;
262 tmp3 -= pA[3+bs*1] * u_12;
263 pA[2+bs*2] = tmp2;
264 pA[3+bs*2] = tmp3;
265 pB = pA + bs*sda;
266 for(k=0; k<ma-3; k+=4)
267 {
268 tmp0 = pB[0+bs*2];
269 tmp1 = pB[1+bs*2];
270 tmp2 = pB[2+bs*2];
271 tmp3 = pB[3+bs*2];
272 tmp0 -= pB[0+bs*0] * u_02;
273 tmp1 -= pB[1+bs*0] * u_02;
274 tmp2 -= pB[2+bs*0] * u_02;
275 tmp3 -= pB[3+bs*0] * u_02;
276 tmp0 -= pB[0+bs*1] * u_12;
277 tmp1 -= pB[1+bs*1] * u_12;
278 tmp2 -= pB[2+bs*1] * u_12;
279 tmp3 -= pB[3+bs*1] * u_12;
280 pB[0+bs*2] = tmp0;
281 pB[1+bs*2] = tmp1;
282 pB[2+bs*2] = tmp2;
283 pB[3+bs*2] = tmp3;
284 pB += bs*sda;
285 }
286 for( ; k<ma; k++)
287 {
288 tmp0 = pB[0+bs*2];
289 tmp0 -= pB[0+bs*0] * u_02;
290 tmp0 -= pB[0+bs*1] * u_12;
291 pB[0+bs*2] = tmp0;
292 pB += 1;
293 }
294
295 sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
296 ipiv[2] = idamax+2;
297 if(tmp2!=0)
298 {
299 if(ipiv[2]!=2)
300 srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
301
302 tmp2 = 1.0 / pA[2+bs*2];
303 inv_diag_A[2] = tmp2;
304 pA[3+bs*2] *= tmp2;
305 pB = pA + bs*sda;
306 for(k=0; k<ma-3; k+=4)
307 {
308 pB[0+bs*2] *= tmp2;
309 pB[1+bs*2] *= tmp2;
310 pB[2+bs*2] *= tmp2;
311 pB[3+bs*2] *= tmp2;
312 pB += bs*sda;
313 }
314 for( ; k<ma; k++)
315 {
316 pB[0+bs*2] *= tmp2;
317 pB += 1;
318 }
319 }
320 else
321 {
322 inv_diag_A[2] = 0.0;
323 }
324
325 // fourth column
326 u_03 = pA[0+bs*3];
327 u_13 = pA[1+bs*3];
328 u_13 -= pA[1+bs*0] * u_03;
329 pA[1+bs*3] = u_13;
330 u_23 = pA[2+bs*3];
331 u_23 -= pA[2+bs*0] * u_03;
332 u_23 -= pA[2+bs*1] * u_13;
333 pA[2+bs*3] = u_23;
334 tmp3 = pA[3+bs*3];
335 tmp3 -= pA[3+bs*0] * u_03;
336 tmp3 -= pA[3+bs*1] * u_13;
337 tmp3 -= pA[3+bs*2] * u_23;
338 pA[3+bs*3] = tmp3;
339 pB = pA + bs*sda;
340 for(k=0; k<ma-3; k+=4)
341 {
342 tmp0 = pB[0+bs*3];
343 tmp1 = pB[1+bs*3];
344 tmp2 = pB[2+bs*3];
345 tmp3 = pB[3+bs*3];
346 tmp0 -= pB[0+bs*0] * u_03;
347 tmp1 -= pB[1+bs*0] * u_03;
348 tmp2 -= pB[2+bs*0] * u_03;
349 tmp3 -= pB[3+bs*0] * u_03;
350 tmp0 -= pB[0+bs*1] * u_13;
351 tmp1 -= pB[1+bs*1] * u_13;
352 tmp2 -= pB[2+bs*1] * u_13;
353 tmp3 -= pB[3+bs*1] * u_13;
354 tmp0 -= pB[0+bs*2] * u_23;
355 tmp1 -= pB[1+bs*2] * u_23;
356 tmp2 -= pB[2+bs*2] * u_23;
357 tmp3 -= pB[3+bs*2] * u_23;
358 pB[0+bs*3] = tmp0;
359 pB[1+bs*3] = tmp1;
360 pB[2+bs*3] = tmp2;
361 pB[3+bs*3] = tmp3;
362 pB += bs*sda;
363 }
364 for( ; k<ma; k++)
365 {
366 tmp0 = pB[0+bs*3];
367 tmp0 -= pB[0+bs*0] * u_03;
368 tmp0 -= pB[0+bs*1] * u_13;
369 tmp0 -= pB[0+bs*2] * u_23;
370 pB[0+bs*3] = tmp0;
371 pB += 1;
372 }
373
374 sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
375 ipiv[3] = idamax+3;
376 if(tmp3!=0)
377 {
378 if(ipiv[3]!=3)
379 srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
380
381 tmp3 = 1.0 / pA[3+bs*3];
382 inv_diag_A[3] = tmp3;
383 pB = pA + bs*sda;
384 for(k=0; k<ma-3; k+=4)
385 {
386 pB[0+bs*3] *= tmp3;
387 pB[1+bs*3] *= tmp3;
388 pB[2+bs*3] *= tmp3;
389 pB[3+bs*3] *= tmp3;
390 pB += bs*sda;
391 }
392 for( ; k<ma; k++)
393 {
394 pB[0+bs*3] *= tmp3;
395 pB += 1;
396 }
397 }
398 else
399 {
400 inv_diag_A[3] = 0.0;
401 }
402
403 return;
404
405 }
406 #endif
407
408
409
410 // it process m>0 rows and 0<n<=4 cols
411 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_sgetrf_pivot_4_vs_lib4(int m,int n,float * pA,int sda,float * inv_diag_A,int * ipiv)412 void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv)
413 {
414
415 if(m<=0 || n<=0)
416 return;
417
418 const int bs = 4;
419
420 // assume m>=4
421 int ma = m-4;
422
423 float
424 tmp0, tmp1, tmp2, tmp3,
425 u_00, u_01, u_02, u_03,
426 u_11, u_12, u_13,
427 u_22, u_23,
428 u_33;
429
430 float
431 *pB;
432
433 int
434 k, idamax;
435
436 // first column
437
438 // find pivot & scale
439 sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
440 ipiv[0] = idamax;
441 if(tmp0!=0.0)
442 {
443 if(ipiv[0]!=0)
444 srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
445
446 tmp0 = 1.0 / pA[0+bs*0];
447 inv_diag_A[0] = tmp0;
448 if(m>=4)
449 {
450 pA[1+bs*0] *= tmp0;
451 pA[2+bs*0] *= tmp0;
452 pA[3+bs*0] *= tmp0;
453 pB = pA + bs*sda;
454 for(k=0; k<ma-3; k+=4)
455 {
456 pB[0+bs*0] *= tmp0;
457 pB[1+bs*0] *= tmp0;
458 pB[2+bs*0] *= tmp0;
459 pB[3+bs*0] *= tmp0;
460 pB += bs*sda;
461 }
462 for( ; k<ma; k++)
463 {
464 pB[0+bs*0] *= tmp0;
465 pB += 1;
466 }
467 }
468 else // m = {1,2,3}
469 {
470 if(m>1)
471 {
472 pA[1+bs*0] *= tmp0;
473 if(m>2)
474 pA[2+bs*0] *= tmp0;
475 }
476 }
477 }
478 else
479 {
480 inv_diag_A[0] = 0.0;
481 }
482
483 if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
484 return;
485
486 // second column
487
488 // correct
489 if(m>=4)
490 {
491 u_01 = pA[0+bs*1];
492 tmp1 = pA[1+bs*1];
493 tmp2 = pA[2+bs*1];
494 tmp3 = pA[3+bs*1];
495 tmp1 -= pA[1+bs*0] * u_01;
496 tmp2 -= pA[2+bs*0] * u_01;
497 tmp3 -= pA[3+bs*0] * u_01;
498 pA[1+bs*1] = tmp1;
499 pA[2+bs*1] = tmp2;
500 pA[3+bs*1] = tmp3;
501 pB = pA + bs*sda;
502 for(k=0; k<ma-3; k+=4)
503 {
504 tmp0 = pB[0+bs*1];
505 tmp1 = pB[1+bs*1];
506 tmp2 = pB[2+bs*1];
507 tmp3 = pB[3+bs*1];
508 tmp0 -= pB[0+bs*0] * u_01;
509 tmp1 -= pB[1+bs*0] * u_01;
510 tmp2 -= pB[2+bs*0] * u_01;
511 tmp3 -= pB[3+bs*0] * u_01;
512 pB[0+bs*1] = tmp0;
513 pB[1+bs*1] = tmp1;
514 pB[2+bs*1] = tmp2;
515 pB[3+bs*1] = tmp3;
516 pB += bs*sda;
517 }
518 for( ; k<ma; k++)
519 {
520 tmp0 = pB[0+bs*1];
521 tmp0 -= pB[0+bs*0] * u_01;
522 pB[0+bs*1] = tmp0;
523 pB += 1;
524 }
525 }
526 else // m = {2,3}
527 {
528 u_01 = pA[0+bs*1];
529 tmp1 = pA[1+bs*1];
530 tmp1 -= pA[1+bs*0] * u_01;
531 pA[1+bs*1] = tmp1;
532 if(m>2)
533 {
534 tmp2 = pA[2+bs*1];
535 tmp2 -= pA[2+bs*0] * u_01;
536 pA[2+bs*1] = tmp2;
537 }
538 }
539
540 // find pivot & scale
541 sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
542 ipiv[1] = idamax+1;
543 if(tmp1!=0)
544 {
545 if(ipiv[1]!=1)
546 srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
547
548 tmp1 = 1.0 / pA[1+bs*1];
549 inv_diag_A[1] = tmp1;
550 if(m>=4)
551 {
552 pA[2+bs*1] *= tmp1;
553 pA[3+bs*1] *= tmp1;
554 pB = pA + bs*sda;
555 for(k=0; k<ma-3; k+=4)
556 {
557 pB[0+bs*1] *= tmp1;
558 pB[1+bs*1] *= tmp1;
559 pB[2+bs*1] *= tmp1;
560 pB[3+bs*1] *= tmp1;
561 pB += bs*sda;
562 }
563 for( ; k<ma; k++)
564 {
565 pB[0+bs*1] *= tmp1;
566 pB += 1;
567 }
568 }
569 else // m = {2,3}
570 {
571 if(m>2)
572 pA[2+bs*1] *= tmp1;
573 }
574 }
575 else
576 {
577 inv_diag_A[1] = 0.0;
578 }
579
580 if(n==2)
581 return;
582
583 // third column
584
585 // correct
586 if(m>=4)
587 {
588 u_02 = pA[0+bs*2];
589 u_12 = pA[1+bs*2];
590 u_12 -= pA[1+bs*0] * u_02;
591 pA[1+bs*2] = u_12;
592 tmp2 = pA[2+bs*2];
593 tmp3 = pA[3+bs*2];
594 tmp2 -= pA[2+bs*0] * u_02;
595 tmp3 -= pA[3+bs*0] * u_02;
596 tmp2 -= pA[2+bs*1] * u_12;
597 tmp3 -= pA[3+bs*1] * u_12;
598 pA[2+bs*2] = tmp2;
599 pA[3+bs*2] = tmp3;
600 pB = pA + bs*sda;
601 for(k=0; k<ma-3; k+=4)
602 {
603 tmp0 = pB[0+bs*2];
604 tmp1 = pB[1+bs*2];
605 tmp2 = pB[2+bs*2];
606 tmp3 = pB[3+bs*2];
607 tmp0 -= pB[0+bs*0] * u_02;
608 tmp1 -= pB[1+bs*0] * u_02;
609 tmp2 -= pB[2+bs*0] * u_02;
610 tmp3 -= pB[3+bs*0] * u_02;
611 tmp0 -= pB[0+bs*1] * u_12;
612 tmp1 -= pB[1+bs*1] * u_12;
613 tmp2 -= pB[2+bs*1] * u_12;
614 tmp3 -= pB[3+bs*1] * u_12;
615 pB[0+bs*2] = tmp0;
616 pB[1+bs*2] = tmp1;
617 pB[2+bs*2] = tmp2;
618 pB[3+bs*2] = tmp3;
619 pB += bs*sda;
620 }
621 for( ; k<ma; k++)
622 {
623 tmp0 = pB[0+bs*2];
624 tmp0 -= pB[0+bs*0] * u_02;
625 tmp0 -= pB[0+bs*1] * u_12;
626 pB[0+bs*2] = tmp0;
627 pB += 1;
628 }
629 }
630 else // m = {2,3}
631 {
632 u_02 = pA[0+bs*2];
633 u_12 = pA[1+bs*2];
634 u_12 -= pA[1+bs*0] * u_02;
635 pA[1+bs*2] = u_12;
636 if(m>2)
637 {
638 tmp2 = pA[2+bs*2];
639 tmp2 -= pA[2+bs*0] * u_02;
640 tmp2 -= pA[2+bs*1] * u_12;
641 pA[2+bs*2] = tmp2;
642 }
643 }
644
645 // find pivot & scale
646 if(m>2)
647 {
648 sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
649 ipiv[2] = idamax+2;
650 if(tmp2!=0)
651 {
652 if(ipiv[2]!=2)
653 srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
654
655 tmp2 = 1.0 / pA[2+bs*2];
656 inv_diag_A[2] = tmp2;
657 if(m>=4)
658 {
659 pA[3+bs*2] *= tmp2;
660 pB = pA + bs*sda;
661 for(k=0; k<ma-3; k+=4)
662 {
663 pB[0+bs*2] *= tmp2;
664 pB[1+bs*2] *= tmp2;
665 pB[2+bs*2] *= tmp2;
666 pB[3+bs*2] *= tmp2;
667 pB += bs*sda;
668 }
669 for( ; k<ma; k++)
670 {
671 pB[0+bs*2] *= tmp2;
672 pB += 1;
673 }
674 }
675 }
676 else
677 {
678 inv_diag_A[2] = 0.0;
679 }
680 }
681
682 if(n<4)
683 return;
684
685 // fourth column
686
687 // correct
688 if(m>=4)
689 {
690 u_03 = pA[0+bs*3];
691 u_13 = pA[1+bs*3];
692 u_13 -= pA[1+bs*0] * u_03;
693 pA[1+bs*3] = u_13;
694 u_23 = pA[2+bs*3];
695 u_23 -= pA[2+bs*0] * u_03;
696 u_23 -= pA[2+bs*1] * u_13;
697 pA[2+bs*3] = u_23;
698 tmp3 = pA[3+bs*3];
699 tmp3 -= pA[3+bs*0] * u_03;
700 tmp3 -= pA[3+bs*1] * u_13;
701 tmp3 -= pA[3+bs*2] * u_23;
702 pA[3+bs*3] = tmp3;
703 pB = pA + bs*sda;
704 for(k=0; k<ma-3; k+=4)
705 {
706 tmp0 = pB[0+bs*3];
707 tmp1 = pB[1+bs*3];
708 tmp2 = pB[2+bs*3];
709 tmp3 = pB[3+bs*3];
710 tmp0 -= pB[0+bs*0] * u_03;
711 tmp1 -= pB[1+bs*0] * u_03;
712 tmp2 -= pB[2+bs*0] * u_03;
713 tmp3 -= pB[3+bs*0] * u_03;
714 tmp0 -= pB[0+bs*1] * u_13;
715 tmp1 -= pB[1+bs*1] * u_13;
716 tmp2 -= pB[2+bs*1] * u_13;
717 tmp3 -= pB[3+bs*1] * u_13;
718 tmp0 -= pB[0+bs*2] * u_23;
719 tmp1 -= pB[1+bs*2] * u_23;
720 tmp2 -= pB[2+bs*2] * u_23;
721 tmp3 -= pB[3+bs*2] * u_23;
722 pB[0+bs*3] = tmp0;
723 pB[1+bs*3] = tmp1;
724 pB[2+bs*3] = tmp2;
725 pB[3+bs*3] = tmp3;
726 pB += bs*sda;
727 }
728 for( ; k<ma; k++)
729 {
730 tmp0 = pB[0+bs*3];
731 tmp0 -= pB[0+bs*0] * u_03;
732 tmp0 -= pB[0+bs*1] * u_13;
733 tmp0 -= pB[0+bs*2] * u_23;
734 pB[0+bs*3] = tmp0;
735 pB += 1;
736 }
737 }
738 else // m = {2,3}
739 {
740 u_03 = pA[0+bs*3];
741 u_13 = pA[1+bs*3];
742 u_13 -= pA[1+bs*0] * u_03;
743 pA[1+bs*3] = u_13;
744 if(m>2)
745 {
746 u_23 = pA[2+bs*3];
747 u_23 -= pA[2+bs*0] * u_03;
748 u_23 -= pA[2+bs*1] * u_13;
749 pA[2+bs*3] = u_23;
750 }
751 }
752
753 if(m>3)
754 {
755 // find pivot & scale
756 sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
757 ipiv[3] = idamax+3;
758 if(tmp3!=0)
759 {
760 if(ipiv[3]!=3)
761 srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
762
763 tmp3 = 1.0 / pA[3+bs*3];
764 inv_diag_A[3] = tmp3;
765 pB = pA + bs*sda;
766 for(k=0; k<ma-3; k+=4)
767 {
768 pB[0+bs*3] *= tmp3;
769 pB[1+bs*3] *= tmp3;
770 pB[2+bs*3] *= tmp3;
771 pB[3+bs*3] *= tmp3;
772 pB += bs*sda;
773 }
774 for( ; k<ma; k++)
775 {
776 pB[0+bs*3] *= tmp3;
777 pB += 1;
778 }
779 }
780 else
781 {
782 inv_diag_A[3] = 0.0;
783 }
784 }
785
786 return;
787
788 }
789 #endif
790
791
792
793
794
795
796