1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 #include <math.h>
37 #include <stdio.h>
38 
39 #include "../../include/blasfeo_common.h"
40 #include "../../include/blasfeo_s_aux.h"
41 
42 
43 
44 // C numbering, starting from 0
45 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
sidamax_lib4(int n,int offset,float * pA,int sda,int * p_idamax,float * p_amax)46 void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax)
47 	{
48 
49 	int ii;
50 	float tmp, amax;
51 
52 	int idamax = -1;
53 
54 	p_idamax[0] = idamax;
55 	if(n<1)
56 		return;
57 
58 	const int bs = 4;
59 
60 	int na = (bs - offset%bs)%bs;
61 	na = n<na ? n : na;
62 
63 	amax = -1.0;
64 	ii = 0;
65 	if(na>0)
66 		{
67 		for( ; ii<na; ii++)
68 			{
69 			tmp = fabs(pA[0]);
70 			if(tmp>amax)
71 				{
72 				idamax = ii+0;
73 				amax = tmp;
74 				}
75 			pA += 1;
76 			}
77 		pA += bs*(sda-1);
78 		}
79 	for( ; ii<n-3; ii+=4)
80 		{
81 		tmp = fabs(pA[0]);
82 		if(tmp>amax)
83 			{
84 			idamax = ii+0;
85 			amax = tmp;
86 			}
87 		tmp = fabs(pA[1]);
88 		if(tmp>amax)
89 			{
90 			idamax = ii+1;
91 			amax = tmp;
92 			}
93 		tmp = fabs(pA[2]);
94 		if(tmp>amax)
95 			{
96 			idamax = ii+2;
97 			amax = tmp;
98 			}
99 		tmp = fabs(pA[3]);
100 		if(tmp>amax)
101 			{
102 			idamax = ii+3;
103 			amax = tmp;
104 			}
105 		pA += bs*sda;
106 		}
107 	for( ; ii<n; ii++)
108 		{
109 		tmp = fabs(pA[0]);
110 		if(tmp>amax)
111 			{
112 			idamax = ii+0;
113 			amax = tmp;
114 			}
115 		pA += 1;
116 		}
117 
118 	p_amax[0] = amax;
119 	p_idamax[0] = idamax;
120 
121 	return;
122 
123 	}
124 #endif
125 
126 
127 
128 // C numering (starting from zero) in the ipiv
129 // it process m>=4 rows and 4 cols
130 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_sgetrf_pivot_4_lib4(int m,float * pA,int sda,float * inv_diag_A,int * ipiv)131 void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv)
132 	{
133 
134 	const int bs = 4;
135 
136 	// assume m>=4
137 	int ma = m-4;
138 
139 	float
140 		tmp0, tmp1, tmp2, tmp3,
141 		u_00, u_01, u_02, u_03,
142 		      u_11, u_12, u_13,
143 		            u_22, u_23,
144 		                  u_33;
145 
146 	float
147 		*pB;
148 
149 	int
150 		k, idamax;
151 
152 	// first column
153 	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
154 	ipiv[0] = idamax;
155 	if(tmp0!=0.0)
156 		{
157 		if(ipiv[0]!=0)
158 			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
159 
160 		tmp0 = 1.0 / pA[0+bs*0];
161 		inv_diag_A[0] = tmp0;
162 		pA[1+bs*0] *= tmp0;
163 		pA[2+bs*0] *= tmp0;
164 		pA[3+bs*0] *= tmp0;
165 		pB = pA + bs*sda;
166 		for(k=0; k<ma-3; k+=4)
167 			{
168 			pB[0+bs*0] *= tmp0;
169 			pB[1+bs*0] *= tmp0;
170 			pB[2+bs*0] *= tmp0;
171 			pB[3+bs*0] *= tmp0;
172 			pB += bs*sda;
173 			}
174 		for( ; k<ma; k++)
175 			{
176 			pB[0+bs*0] *= tmp0;
177 			pB += 1;
178 			}
179 		}
180 	else
181 		{
182 		inv_diag_A[0] = 0.0;
183 		}
184 
185 	// second column
186 	u_01  = pA[0+bs*1];
187 	tmp1  = pA[1+bs*1];
188 	tmp2  = pA[2+bs*1];
189 	tmp3  = pA[3+bs*1];
190 	tmp1 -= pA[1+bs*0] * u_01;
191 	tmp2 -= pA[2+bs*0] * u_01;
192 	tmp3 -= pA[3+bs*0] * u_01;
193 	pA[1+bs*1] = tmp1;
194 	pA[2+bs*1] = tmp2;
195 	pA[3+bs*1] = tmp3;
196 	pB = pA + bs*sda;
197 	for(k=0; k<ma-3; k+=4)
198 		{
199 		tmp0  = pB[0+bs*1];
200 		tmp1  = pB[1+bs*1];
201 		tmp2  = pB[2+bs*1];
202 		tmp3  = pB[3+bs*1];
203 		tmp0 -= pB[0+bs*0] * u_01;
204 		tmp1 -= pB[1+bs*0] * u_01;
205 		tmp2 -= pB[2+bs*0] * u_01;
206 		tmp3 -= pB[3+bs*0] * u_01;
207 		pB[0+bs*1] = tmp0;
208 		pB[1+bs*1] = tmp1;
209 		pB[2+bs*1] = tmp2;
210 		pB[3+bs*1] = tmp3;
211 		pB += bs*sda;
212 		}
213 	for( ; k<ma; k++)
214 		{
215 		tmp0 = pB[0+bs*1];
216 		tmp0 -= pB[0+bs*0] * u_01;
217 		pB[0+bs*1] = tmp0;
218 		pB += 1;
219 		}
220 
221 	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
222 	ipiv[1] = idamax+1;
223 	if(tmp1!=0)
224 		{
225 		if(ipiv[1]!=1)
226 			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
227 
228 		tmp1 = 1.0 / pA[1+bs*1];
229 		inv_diag_A[1] = tmp1;
230 		pA[2+bs*1] *= tmp1;
231 		pA[3+bs*1] *= tmp1;
232 		pB = pA + bs*sda;
233 		for(k=0; k<ma-3; k+=4)
234 			{
235 			pB[0+bs*1] *= tmp1;
236 			pB[1+bs*1] *= tmp1;
237 			pB[2+bs*1] *= tmp1;
238 			pB[3+bs*1] *= tmp1;
239 			pB += bs*sda;
240 			}
241 		for( ; k<ma; k++)
242 			{
243 			pB[0+bs*1] *= tmp1;
244 			pB += 1;
245 			}
246 		}
247 	else
248 		{
249 		inv_diag_A[1] = 0.0;
250 		}
251 
252 	// third column
253 	u_02  = pA[0+bs*2];
254 	u_12  = pA[1+bs*2];
255 	u_12 -= pA[1+bs*0] * u_02;
256 	pA[1+bs*2] = u_12;
257 	tmp2  = pA[2+bs*2];
258 	tmp3  = pA[3+bs*2];
259 	tmp2 -= pA[2+bs*0] * u_02;
260 	tmp3 -= pA[3+bs*0] * u_02;
261 	tmp2 -= pA[2+bs*1] * u_12;
262 	tmp3 -= pA[3+bs*1] * u_12;
263 	pA[2+bs*2] = tmp2;
264 	pA[3+bs*2] = tmp3;
265 	pB = pA + bs*sda;
266 	for(k=0; k<ma-3; k+=4)
267 		{
268 		tmp0  = pB[0+bs*2];
269 		tmp1  = pB[1+bs*2];
270 		tmp2  = pB[2+bs*2];
271 		tmp3  = pB[3+bs*2];
272 		tmp0 -= pB[0+bs*0] * u_02;
273 		tmp1 -= pB[1+bs*0] * u_02;
274 		tmp2 -= pB[2+bs*0] * u_02;
275 		tmp3 -= pB[3+bs*0] * u_02;
276 		tmp0 -= pB[0+bs*1] * u_12;
277 		tmp1 -= pB[1+bs*1] * u_12;
278 		tmp2 -= pB[2+bs*1] * u_12;
279 		tmp3 -= pB[3+bs*1] * u_12;
280 		pB[0+bs*2] = tmp0;
281 		pB[1+bs*2] = tmp1;
282 		pB[2+bs*2] = tmp2;
283 		pB[3+bs*2] = tmp3;
284 		pB += bs*sda;
285 		}
286 	for( ; k<ma; k++)
287 		{
288 		tmp0  = pB[0+bs*2];
289 		tmp0 -= pB[0+bs*0] * u_02;
290 		tmp0 -= pB[0+bs*1] * u_12;
291 		pB[0+bs*2] = tmp0;
292 		pB += 1;
293 		}
294 
295 	sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
296 	ipiv[2] = idamax+2;
297 	if(tmp2!=0)
298 		{
299 		if(ipiv[2]!=2)
300 			srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
301 
302 		tmp2 = 1.0 / pA[2+bs*2];
303 		inv_diag_A[2] = tmp2;
304 		pA[3+bs*2] *= tmp2;
305 		pB = pA + bs*sda;
306 		for(k=0; k<ma-3; k+=4)
307 			{
308 			pB[0+bs*2] *= tmp2;
309 			pB[1+bs*2] *= tmp2;
310 			pB[2+bs*2] *= tmp2;
311 			pB[3+bs*2] *= tmp2;
312 			pB += bs*sda;
313 			}
314 		for( ; k<ma; k++)
315 			{
316 			pB[0+bs*2] *= tmp2;
317 			pB += 1;
318 			}
319 		}
320 	else
321 		{
322 		inv_diag_A[2] = 0.0;
323 		}
324 
325 	// fourth column
326 	u_03  = pA[0+bs*3];
327 	u_13  = pA[1+bs*3];
328 	u_13 -= pA[1+bs*0] * u_03;
329 	pA[1+bs*3] = u_13;
330 	u_23  = pA[2+bs*3];
331 	u_23 -= pA[2+bs*0] * u_03;
332 	u_23 -= pA[2+bs*1] * u_13;
333 	pA[2+bs*3] = u_23;
334 	tmp3  = pA[3+bs*3];
335 	tmp3 -= pA[3+bs*0] * u_03;
336 	tmp3 -= pA[3+bs*1] * u_13;
337 	tmp3 -= pA[3+bs*2] * u_23;
338 	pA[3+bs*3] = tmp3;
339 	pB = pA + bs*sda;
340 	for(k=0; k<ma-3; k+=4)
341 		{
342 		tmp0  = pB[0+bs*3];
343 		tmp1  = pB[1+bs*3];
344 		tmp2  = pB[2+bs*3];
345 		tmp3  = pB[3+bs*3];
346 		tmp0 -= pB[0+bs*0] * u_03;
347 		tmp1 -= pB[1+bs*0] * u_03;
348 		tmp2 -= pB[2+bs*0] * u_03;
349 		tmp3 -= pB[3+bs*0] * u_03;
350 		tmp0 -= pB[0+bs*1] * u_13;
351 		tmp1 -= pB[1+bs*1] * u_13;
352 		tmp2 -= pB[2+bs*1] * u_13;
353 		tmp3 -= pB[3+bs*1] * u_13;
354 		tmp0 -= pB[0+bs*2] * u_23;
355 		tmp1 -= pB[1+bs*2] * u_23;
356 		tmp2 -= pB[2+bs*2] * u_23;
357 		tmp3 -= pB[3+bs*2] * u_23;
358 		pB[0+bs*3] = tmp0;
359 		pB[1+bs*3] = tmp1;
360 		pB[2+bs*3] = tmp2;
361 		pB[3+bs*3] = tmp3;
362 		pB += bs*sda;
363 		}
364 	for( ; k<ma; k++)
365 		{
366 		tmp0  = pB[0+bs*3];
367 		tmp0 -= pB[0+bs*0] * u_03;
368 		tmp0 -= pB[0+bs*1] * u_13;
369 		tmp0 -= pB[0+bs*2] * u_23;
370 		pB[0+bs*3] = tmp0;
371 		pB += 1;
372 		}
373 
374 	sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
375 	ipiv[3] = idamax+3;
376 	if(tmp3!=0)
377 		{
378 		if(ipiv[3]!=3)
379 			srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
380 
381 		tmp3 = 1.0 / pA[3+bs*3];
382 		inv_diag_A[3] = tmp3;
383 		pB = pA + bs*sda;
384 		for(k=0; k<ma-3; k+=4)
385 			{
386 			pB[0+bs*3] *= tmp3;
387 			pB[1+bs*3] *= tmp3;
388 			pB[2+bs*3] *= tmp3;
389 			pB[3+bs*3] *= tmp3;
390 			pB += bs*sda;
391 			}
392 		for( ; k<ma; k++)
393 			{
394 			pB[0+bs*3] *= tmp3;
395 			pB += 1;
396 			}
397 		}
398 	else
399 		{
400 		inv_diag_A[3] = 0.0;
401 		}
402 
403 	return;
404 
405 	}
406 #endif
407 
408 
409 
410 // it process m>0 rows and 0<n<=4 cols
411 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_sgetrf_pivot_4_vs_lib4(int m,int n,float * pA,int sda,float * inv_diag_A,int * ipiv)412 void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv)
413 	{
414 
415 	if(m<=0 || n<=0)
416 		return;
417 
418 	const int bs = 4;
419 
420 	// assume m>=4
421 	int ma = m-4;
422 
423 	float
424 		tmp0, tmp1, tmp2, tmp3,
425 		u_00, u_01, u_02, u_03,
426 		      u_11, u_12, u_13,
427 		            u_22, u_23,
428 		                  u_33;
429 
430 	float
431 		*pB;
432 
433 	int
434 		k, idamax;
435 
436 	// first column
437 
438 	// find pivot & scale
439 	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
440 	ipiv[0] = idamax;
441 	if(tmp0!=0.0)
442 		{
443 		if(ipiv[0]!=0)
444 			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
445 
446 		tmp0 = 1.0 / pA[0+bs*0];
447 		inv_diag_A[0] = tmp0;
448 		if(m>=4)
449 			{
450 			pA[1+bs*0] *= tmp0;
451 			pA[2+bs*0] *= tmp0;
452 			pA[3+bs*0] *= tmp0;
453 			pB = pA + bs*sda;
454 			for(k=0; k<ma-3; k+=4)
455 				{
456 				pB[0+bs*0] *= tmp0;
457 				pB[1+bs*0] *= tmp0;
458 				pB[2+bs*0] *= tmp0;
459 				pB[3+bs*0] *= tmp0;
460 				pB += bs*sda;
461 				}
462 			for( ; k<ma; k++)
463 				{
464 				pB[0+bs*0] *= tmp0;
465 				pB += 1;
466 				}
467 			}
468 		else // m = {1,2,3}
469 			{
470 			if(m>1)
471 				{
472 				pA[1+bs*0] *= tmp0;
473 				if(m>2)
474 					pA[2+bs*0] *= tmp0;
475 				}
476 			}
477 		}
478 	else
479 		{
480 		inv_diag_A[0] = 0.0;
481 		}
482 
483 	if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
484 		return;
485 
486 	// second column
487 
488 	// correct
489 	if(m>=4)
490 		{
491 		u_01  = pA[0+bs*1];
492 		tmp1  = pA[1+bs*1];
493 		tmp2  = pA[2+bs*1];
494 		tmp3  = pA[3+bs*1];
495 		tmp1 -= pA[1+bs*0] * u_01;
496 		tmp2 -= pA[2+bs*0] * u_01;
497 		tmp3 -= pA[3+bs*0] * u_01;
498 		pA[1+bs*1] = tmp1;
499 		pA[2+bs*1] = tmp2;
500 		pA[3+bs*1] = tmp3;
501 		pB = pA + bs*sda;
502 		for(k=0; k<ma-3; k+=4)
503 			{
504 			tmp0  = pB[0+bs*1];
505 			tmp1  = pB[1+bs*1];
506 			tmp2  = pB[2+bs*1];
507 			tmp3  = pB[3+bs*1];
508 			tmp0 -= pB[0+bs*0] * u_01;
509 			tmp1 -= pB[1+bs*0] * u_01;
510 			tmp2 -= pB[2+bs*0] * u_01;
511 			tmp3 -= pB[3+bs*0] * u_01;
512 			pB[0+bs*1] = tmp0;
513 			pB[1+bs*1] = tmp1;
514 			pB[2+bs*1] = tmp2;
515 			pB[3+bs*1] = tmp3;
516 			pB += bs*sda;
517 			}
518 		for( ; k<ma; k++)
519 			{
520 			tmp0 = pB[0+bs*1];
521 			tmp0 -= pB[0+bs*0] * u_01;
522 			pB[0+bs*1] = tmp0;
523 			pB += 1;
524 			}
525 		}
526 	else // m = {2,3}
527 		{
528 		u_01  = pA[0+bs*1];
529 		tmp1  = pA[1+bs*1];
530 		tmp1 -= pA[1+bs*0] * u_01;
531 		pA[1+bs*1] = tmp1;
532 		if(m>2)
533 			{
534 			tmp2  = pA[2+bs*1];
535 			tmp2 -= pA[2+bs*0] * u_01;
536 			pA[2+bs*1] = tmp2;
537 			}
538 		}
539 
540 	// find pivot & scale
541 	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
542 	ipiv[1] = idamax+1;
543 	if(tmp1!=0)
544 		{
545 		if(ipiv[1]!=1)
546 			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
547 
548 		tmp1 = 1.0 / pA[1+bs*1];
549 		inv_diag_A[1] = tmp1;
550 		if(m>=4)
551 			{
552 			pA[2+bs*1] *= tmp1;
553 			pA[3+bs*1] *= tmp1;
554 			pB = pA + bs*sda;
555 			for(k=0; k<ma-3; k+=4)
556 				{
557 				pB[0+bs*1] *= tmp1;
558 				pB[1+bs*1] *= tmp1;
559 				pB[2+bs*1] *= tmp1;
560 				pB[3+bs*1] *= tmp1;
561 				pB += bs*sda;
562 				}
563 			for( ; k<ma; k++)
564 				{
565 				pB[0+bs*1] *= tmp1;
566 				pB += 1;
567 				}
568 			}
569 		else // m = {2,3}
570 			{
571 			if(m>2)
572 				pA[2+bs*1] *= tmp1;
573 			}
574 		}
575 	else
576 		{
577 		inv_diag_A[1] = 0.0;
578 		}
579 
580 	if(n==2)
581 		return;
582 
583 	// third column
584 
585 	// correct
586 	if(m>=4)
587 		{
588 		u_02  = pA[0+bs*2];
589 		u_12  = pA[1+bs*2];
590 		u_12 -= pA[1+bs*0] * u_02;
591 		pA[1+bs*2] = u_12;
592 		tmp2  = pA[2+bs*2];
593 		tmp3  = pA[3+bs*2];
594 		tmp2 -= pA[2+bs*0] * u_02;
595 		tmp3 -= pA[3+bs*0] * u_02;
596 		tmp2 -= pA[2+bs*1] * u_12;
597 		tmp3 -= pA[3+bs*1] * u_12;
598 		pA[2+bs*2] = tmp2;
599 		pA[3+bs*2] = tmp3;
600 		pB = pA + bs*sda;
601 		for(k=0; k<ma-3; k+=4)
602 			{
603 			tmp0  = pB[0+bs*2];
604 			tmp1  = pB[1+bs*2];
605 			tmp2  = pB[2+bs*2];
606 			tmp3  = pB[3+bs*2];
607 			tmp0 -= pB[0+bs*0] * u_02;
608 			tmp1 -= pB[1+bs*0] * u_02;
609 			tmp2 -= pB[2+bs*0] * u_02;
610 			tmp3 -= pB[3+bs*0] * u_02;
611 			tmp0 -= pB[0+bs*1] * u_12;
612 			tmp1 -= pB[1+bs*1] * u_12;
613 			tmp2 -= pB[2+bs*1] * u_12;
614 			tmp3 -= pB[3+bs*1] * u_12;
615 			pB[0+bs*2] = tmp0;
616 			pB[1+bs*2] = tmp1;
617 			pB[2+bs*2] = tmp2;
618 			pB[3+bs*2] = tmp3;
619 			pB += bs*sda;
620 			}
621 		for( ; k<ma; k++)
622 			{
623 			tmp0  = pB[0+bs*2];
624 			tmp0 -= pB[0+bs*0] * u_02;
625 			tmp0 -= pB[0+bs*1] * u_12;
626 			pB[0+bs*2] = tmp0;
627 			pB += 1;
628 			}
629 		}
630 	else // m = {2,3}
631 		{
632 		u_02  = pA[0+bs*2];
633 		u_12  = pA[1+bs*2];
634 		u_12 -= pA[1+bs*0] * u_02;
635 		pA[1+bs*2] = u_12;
636 		if(m>2)
637 			{
638 			tmp2  = pA[2+bs*2];
639 			tmp2 -= pA[2+bs*0] * u_02;
640 			tmp2 -= pA[2+bs*1] * u_12;
641 			pA[2+bs*2] = tmp2;
642 			}
643 		}
644 
645 	// find pivot & scale
646 	if(m>2)
647 		{
648 		sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
649 		ipiv[2] = idamax+2;
650 		if(tmp2!=0)
651 			{
652 			if(ipiv[2]!=2)
653 				srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
654 
655 			tmp2 = 1.0 / pA[2+bs*2];
656 			inv_diag_A[2] = tmp2;
657 			if(m>=4)
658 				{
659 				pA[3+bs*2] *= tmp2;
660 				pB = pA + bs*sda;
661 				for(k=0; k<ma-3; k+=4)
662 					{
663 					pB[0+bs*2] *= tmp2;
664 					pB[1+bs*2] *= tmp2;
665 					pB[2+bs*2] *= tmp2;
666 					pB[3+bs*2] *= tmp2;
667 					pB += bs*sda;
668 					}
669 				for( ; k<ma; k++)
670 					{
671 					pB[0+bs*2] *= tmp2;
672 					pB += 1;
673 					}
674 				}
675 			}
676 		else
677 			{
678 			inv_diag_A[2] = 0.0;
679 			}
680 		}
681 
682 	if(n<4)
683 		return;
684 
685 	// fourth column
686 
687 	// correct
688 	if(m>=4)
689 		{
690 		u_03  = pA[0+bs*3];
691 		u_13  = pA[1+bs*3];
692 		u_13 -= pA[1+bs*0] * u_03;
693 		pA[1+bs*3] = u_13;
694 		u_23  = pA[2+bs*3];
695 		u_23 -= pA[2+bs*0] * u_03;
696 		u_23 -= pA[2+bs*1] * u_13;
697 		pA[2+bs*3] = u_23;
698 		tmp3  = pA[3+bs*3];
699 		tmp3 -= pA[3+bs*0] * u_03;
700 		tmp3 -= pA[3+bs*1] * u_13;
701 		tmp3 -= pA[3+bs*2] * u_23;
702 		pA[3+bs*3] = tmp3;
703 		pB = pA + bs*sda;
704 		for(k=0; k<ma-3; k+=4)
705 			{
706 			tmp0  = pB[0+bs*3];
707 			tmp1  = pB[1+bs*3];
708 			tmp2  = pB[2+bs*3];
709 			tmp3  = pB[3+bs*3];
710 			tmp0 -= pB[0+bs*0] * u_03;
711 			tmp1 -= pB[1+bs*0] * u_03;
712 			tmp2 -= pB[2+bs*0] * u_03;
713 			tmp3 -= pB[3+bs*0] * u_03;
714 			tmp0 -= pB[0+bs*1] * u_13;
715 			tmp1 -= pB[1+bs*1] * u_13;
716 			tmp2 -= pB[2+bs*1] * u_13;
717 			tmp3 -= pB[3+bs*1] * u_13;
718 			tmp0 -= pB[0+bs*2] * u_23;
719 			tmp1 -= pB[1+bs*2] * u_23;
720 			tmp2 -= pB[2+bs*2] * u_23;
721 			tmp3 -= pB[3+bs*2] * u_23;
722 			pB[0+bs*3] = tmp0;
723 			pB[1+bs*3] = tmp1;
724 			pB[2+bs*3] = tmp2;
725 			pB[3+bs*3] = tmp3;
726 			pB += bs*sda;
727 			}
728 		for( ; k<ma; k++)
729 			{
730 			tmp0  = pB[0+bs*3];
731 			tmp0 -= pB[0+bs*0] * u_03;
732 			tmp0 -= pB[0+bs*1] * u_13;
733 			tmp0 -= pB[0+bs*2] * u_23;
734 			pB[0+bs*3] = tmp0;
735 			pB += 1;
736 			}
737 		}
738 	else // m = {2,3}
739 		{
740 		u_03  = pA[0+bs*3];
741 		u_13  = pA[1+bs*3];
742 		u_13 -= pA[1+bs*0] * u_03;
743 		pA[1+bs*3] = u_13;
744 		if(m>2)
745 			{
746 			u_23  = pA[2+bs*3];
747 			u_23 -= pA[2+bs*0] * u_03;
748 			u_23 -= pA[2+bs*1] * u_13;
749 			pA[2+bs*3] = u_23;
750 			}
751 		}
752 
753 	if(m>3)
754 		{
755 		// find pivot & scale
756 		sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
757 		ipiv[3] = idamax+3;
758 		if(tmp3!=0)
759 			{
760 			if(ipiv[3]!=3)
761 				srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
762 
763 			tmp3 = 1.0 / pA[3+bs*3];
764 			inv_diag_A[3] = tmp3;
765 			pB = pA + bs*sda;
766 			for(k=0; k<ma-3; k+=4)
767 				{
768 				pB[0+bs*3] *= tmp3;
769 				pB[1+bs*3] *= tmp3;
770 				pB[2+bs*3] *= tmp3;
771 				pB[3+bs*3] *= tmp3;
772 				pB += bs*sda;
773 				}
774 			for( ; k<ma; k++)
775 				{
776 				pB[0+bs*3] *= tmp3;
777 				pB += 1;
778 				}
779 			}
780 		else
781 			{
782 			inv_diag_A[3] = 0.0;
783 			}
784 		}
785 
786 	return;
787 
788 	}
789 #endif
790 
791 
792 
793 
794 
795 
796