/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/
/*
 * auxiliary functions for LA:HIGH_PERFORMANCE PS_4 (panel major)
 *
 * auxiliary/d_aux_lib*.c
 *
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
#include <mmintrin.h>
#include <xmmintrin.h>  // SSE
#include <emmintrin.h>  // SSE2
#include <pmmintrin.h>  // SSE3
#include <smmintrin.h>  // SSE4
#include <immintrin.h>  // AVX
#endif

#include "../include/blasfeo_common.h"
#include "../include/blasfeo_block_size.h"
#include "../include/blasfeo_d_aux.h"
#include "../include/blasfeo_d_kernel.h"


/*
 * Auxiliary functions for PS:4 (panel-major)
 *
 * auxiliary/d_aux_lib4.c
 * TOFIX: move inside str* interface
 *
 */
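
/*
 * Panel-major storage recap (a sketch of the layout this file assumes):
 * the matrix is stored in horizontal panels of bs=4 rows; inside a panel
 * the elements are column-major with leading dimension bs, and consecutive
 * panels are bs*sda doubles apart (sda being the padded number of columns).
 * Element (i,j) of a panel-major matrix pA therefore lives at
 *
 *   pA[(i/bs)*bs*sda + i%bs + j*bs]
 *
 * which is the address computation used below, e.g. in blasfeo_pack_dmat()
 * and in the sparse ddia*_libsp / dcol*_libsp routines.
 */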



// scales and adds a packed matrix into a packed matrix: B = B + alpha*A
void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
	{

	if(m<=0 || n<=0)
		return;

	const int bs = 4;

	int mna, ii;

	int offA = offsetA%bs;
	int offB = offsetB%bs;

	// A at the beginning of the block
	A -= offA;

	// B at the beginning of the block
	B -= offB;

	// same alignment
	if(offA==offB)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgead_8_0_lib4(n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgead_4_0_lib4(n, alpha, A, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgead_1_0_lib4(n, alpha, A, B);
			else if(m-ii==2)
				kernel_dgead_2_0_lib4(n, alpha, A, B);
			else // if(m-ii==3)
				kernel_dgead_3_0_lib4(n, alpha, A, B);
			}
		}
	// skip one element of A
	else if(offA==(offB+1)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
				//A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgead_2_3_lib4(n, alpha, A, sda, B+2);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgead_3_2_lib4(n, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for( ; ii<m-7; ii+=8)
			{
			kernel_dgead_8_1_lib4(n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for( ; ii<m-3; ii+=4)
			{
			kernel_dgead_4_1_lib4(n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgead_1_0_lib4(n, alpha, A+1, B);
			else if(m-ii==2)
				kernel_dgead_2_0_lib4(n, alpha, A+1, B);
			else // if(m-ii==3)
				kernel_dgead_3_0_lib4(n, alpha, A+1, B);
			}
		}
	// skip 2 elements of A
	else if(offA==(offB+2)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgead_2_3_lib4(n, alpha, A, sda, B+1);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgead_1_0_lib4(n, alpha, A+1, B+3);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgead_2_0_lib4(n, alpha, A, B+2);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgead_3_3_lib4(n, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgead_8_2_lib4(n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgead_4_2_lib4(n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgead_1_0_lib4(n, alpha, A+2, B);
			else if(m-ii==2)
				kernel_dgead_2_0_lib4(n, alpha, A+2, B);
			else // if(m-ii==3)
				kernel_dgead_3_2_lib4(n, alpha, A, sda, B);
			}
		}
	// skip 3 elements of A
	else // if(offA==(offB+3)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgead_8_3_lib4(n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgead_4_3_lib4(n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgead_1_0_lib4(n, alpha, A+3, B);
			else if(m-ii==2)
				kernel_dgead_2_3_lib4(n, alpha, A, sda, B);
			else // if(m-ii==3)
				kernel_dgead_3_3_lib4(n, alpha, A, sda, B);
			}
		}

	}
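
/*
 * The four branches above are selected by the relative panel alignment of
 * A and B: when offA==offB the rows of A and B line up and the _x_0
 * kernels suffice; otherwise A is read with a shift of 1, 2 or 3 rows
 * across the panel boundary, and the _x_1, _x_2 and _x_3 kernel variants
 * take the extra panel of A (the sda argument) to handle the wrapped rows.
 */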



// scales and adds a strvec into a strvec
void blasfeo_dvecad(int m, double alpha, struct blasfeo_dvec *sa, int ai, struct blasfeo_dvec *sc, int ci)
	{
	double *pa = sa->pa + ai;
	double *pc = sc->pa + ci;
	int ii;
	ii = 0;
	for(; ii<m-3; ii+=4)
		{
		pc[ii+0] += alpha*pa[ii+0];
		pc[ii+1] += alpha*pa[ii+1];
		pc[ii+2] += alpha*pa[ii+2];
		pc[ii+3] += alpha*pa[ii+3];
		}
	for(; ii<m; ii++)
		{
		pc[ii+0] += alpha*pa[ii+0];
		}
	return;
	}
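
/*
 * Usage sketch (illustrative): with sa and sc created via
 * blasfeo_create_dvec() and holding at least ai+m and ci+m elements,
 *
 *   blasfeo_dvecad(m, alpha, &sa, ai, &sc, ci);
 *
 * computes sc[ci+i] += alpha * sa[ai+i] for i = 0, ..., m-1.
 */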


// --------- Transpose

// transpose general matrix; m and n refer to the original matrix
void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
	{

/*

m = 5
n = 3
offsetA = 1
offsetC = 2

A =
 x x x
 -
 x x x
 x x x
 x x x
 x x x

C =
 x x x x x
 x x x x x
 -
 x x x x x

*/


	if(m<=0 || n<=0)
		return;

	const int bs = 4;

	int mna = (bs-offsetA%bs)%bs;
	mna = m<mna ? m : mna;
	int nna = (bs-offsetC%bs)%bs;
	nna = n<nna ? n : nna;

	int ii;

	ii = 0;

	if(mna>0)
		{
		if(mna==1)
			kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
		else if(mna==2)
			kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
		else //if(mna==3)
			kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
		ii += mna;
		pA += mna + bs*(sda-1);
		pC += mna*bs;
		}
#if defined(TARGET_X64_INTEL_HASWELL)
	for( ; ii<m-7; ii+=8)
		{
		kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
		pA += 2*bs*sda;
		pC += 2*bs*bs;
		}
#endif
	for( ; ii<m-3; ii+=4)
//	for( ; ii<m; ii+=4)
		{
		kernel_dgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
		pA += bs*sda;
		pC += bs*bs;
		}

	// clean-up at the end using smaller kernels
	if(ii==m)
		return;

	if(m-ii==1)
		kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
	else if(m-ii==2)
		kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
	else if(m-ii==3)
		kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);

	return;

	}



// transpose lower triangular matrix
void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
	{

/*

A =
 x
 x x
 x x x
 x x x x

 x x x x x
 x x x x x x
 x x x x x x x
 x x x x x x x x

C =
 x x x x x x x x

   x x x x x x x
     x x x x x x
       x x x x x
         x x x x

           x x x
             x x
               x

*/

	int n = m;

	if(m<=0 || n<=0)
		return;

	const int bs = 4;

	int mna = (bs-offsetA%bs)%bs;
	mna = m<mna ? m : mna;
	int nna = (bs-offsetC%bs)%bs;
	nna = n<nna ? n : nna;

	int ii;

	ii = 0;

	if(mna>0)
		{
		if(mna==1)
			{
			pC[0] = alpha * pA[0];
			}
		else if(mna==2)
			{
			if(nna==1)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[0+bs*1] = alpha * pA[1+bs*0];
				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
				}
			else
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[0+bs*1] = alpha * pA[1+bs*0];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				}
			}
		else //if(mna==3)
			{
			if(nna==1)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[0+bs*1] = alpha * pA[1+bs*0];
				pC[0+bs*2] = alpha * pA[2+bs*0];
				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
				pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
				}
			else if(nna==2)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[0+bs*1] = alpha * pA[1+bs*0];
				pC[0+bs*2] = alpha * pA[2+bs*0];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pC[1+bs*2] = alpha * pA[2+bs*1];
				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
				}
			else
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[0+bs*1] = alpha * pA[1+bs*0];
				pC[0+bs*2] = alpha * pA[2+bs*0];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pC[1+bs*2] = alpha * pA[2+bs*1];
				pC[2+bs*2] = alpha * pA[2+bs*2];
				}
			}
		ii += mna;
		pA += mna + bs*(sda-1);
		pC += mna*bs;
		}
#if 0 //defined(TARGET_X64_INTEL_HASWELL)
	for( ; ii<m-7; ii+=8)
		{
		kernel_dgetr_8_lib4(1, n, nna, alpha, pA, sda, pC, sdc);
		pA += 2*bs*sda;
		pC += 2*bs*bs;
		}
#endif
	for( ; ii<m-3; ii+=4)
		{
		kernel_dgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
		pA += bs*sda;
		pC += bs*bs;
		}

	// clean-up at the end using smaller kernels
	if(ii==m)
		return;

	if(m-ii==1)
		kernel_dgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
	else if(m-ii==2)
		kernel_dgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
	else if(m-ii==3)
		kernel_dgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);

	return;

	}



// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
	{

/*

A =
 x x x x x x x x
   x x x x x x x

     x x x x x x
       x x x x x
         x x x x
           x x x
             x x
               x

C =
 x

 x x
 x x x
 x x x x
 x x x x x
 x x x x x x
 x x x x x x x
 x x x x x x x x

*/

	int n = m;

	if(m<=0 || n<=0)
		return;

	const int bs = 4;

	int mna = (bs-offsetA%bs)%bs;
	mna = m<mna ? m : mna;
	int nna = (bs-offsetC%bs)%bs;
	nna = n<nna ? n : nna;
	int tna = nna;

	int ii;

	ii = 0;

	if(mna>0)
		{
		if(mna==1)
			{
			kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
			if(nna!=1)
				{
//				pC[0+bs*0] = alpha * pA[0+bs*0];
				pA += 1*bs;
				pC += 1;
				tna = (bs-(offsetC+1)%bs)%bs;
				}
			else //if(nna==1)
				{
//				pC[0+bs*0] = alpha * pA[0+bs*0];
				pA += 1*bs;
				pC += 1 + (sdc-1)*bs;
				tna = 0; //(bs-(offsetC+1)%bs)%bs;
				}
//			kernel_dgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
			}
		else if(mna==2)
			{
			if(nna==0 || nna==3)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[1+bs*0] = alpha * pA[0+bs*1];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pA += 2*bs;
				pC += 2;
				tna = (bs-(offsetC+2)%bs)%bs;
				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
				}
			else if(nna==1)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pA += 1*bs;
				pC += 1 + (sdc-1)*bs;
//				pC[0+bs*0] = alpha * pA[0+bs*0];
//				pC[0+bs*1] = alpha * pA[1+bs*0];
				kernel_dgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
				pA += 1*bs;
				pC += 1;
				tna = 3; //(bs-(offsetC+2)%bs)%bs;
//				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
				}
			else if(nna==2)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[1+bs*0] = alpha * pA[0+bs*1];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pA += 2*bs;
				pC += 2 + (sdc-1)*bs;
				tna = 0; //(bs-(offsetC+2)%bs)%bs;
				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
				}
			}
		else //if(mna==3)
			{
			if(nna==0)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[1+bs*0] = alpha * pA[0+bs*1];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pC[2+bs*0] = alpha * pA[0+bs*2];
				pC[2+bs*1] = alpha * pA[1+bs*2];
				pC[2+bs*2] = alpha * pA[2+bs*2];
				pA += 3*bs;
				pC += 3;
				tna = 1;
				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
				}
			else if(nna==1)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pA += bs;
				pC += 1 + (sdc-1)*bs;
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[0+bs*1] = alpha * pA[1+bs*0];
				pC[1+bs*0] = alpha * pA[0+bs*1];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pC[1+bs*2] = alpha * pA[2+bs*1];
				pA += 2*bs;
				pC += 2;
				tna = 2;
				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
				}
			else if(nna==2)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[1+bs*0] = alpha * pA[0+bs*1];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pA += 2*bs;
				pC += 2 + (sdc-1)*bs;
//				pC[0+bs*0] = alpha * pA[0+bs*0];
//				pC[0+bs*1] = alpha * pA[1+bs*0];
//				pC[0+bs*2] = alpha * pA[2+bs*0];
				kernel_dgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
				pA += 1*bs;
				pC += 1;
				tna = 3;
//				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
				}
			else //if(nna==3)
				{
				pC[0+bs*0] = alpha * pA[0+bs*0];
				pC[1+bs*0] = alpha * pA[0+bs*1];
				pC[1+bs*1] = alpha * pA[1+bs*1];
				pC[2+bs*0] = alpha * pA[0+bs*2];
				pC[2+bs*1] = alpha * pA[1+bs*2];
				pC[2+bs*2] = alpha * pA[2+bs*2];
				pA += 3*bs;
				pC += 3 + (sdc-1)*bs;
				tna = 0;
				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
				}
			}
		ii += mna;
		pA += mna + bs*(sda-1);
		pC += mna*bs;
		}
#if 0 //defined(TARGET_X64_AVX2)
	for( ; ii<m-7; ii+=8)
		{
		kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
		pA += 2*bs*sda;
		pC += 2*bs*bs;
		}
#endif
	for( ; ii<m-3; ii+=4)
		{
		if(tna==0)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pC[2+bs*0] = alpha * pA[0+bs*2];
			pC[2+bs*1] = alpha * pA[1+bs*2];
			pC[2+bs*2] = alpha * pA[2+bs*2];
			pC[3+bs*0] = alpha * pA[0+bs*3];
			pC[3+bs*1] = alpha * pA[1+bs*3];
			pC[3+bs*2] = alpha * pA[2+bs*3];
			pC[3+bs*3] = alpha * pA[3+bs*3];
			pA += 4*bs;
			pC += sdc*bs;
			kernel_dgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
			}
		else if(tna==1)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pA += bs;
			pC += 1 + (sdc-1)*bs;
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[0+bs*1] = alpha * pA[1+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pC[1+bs*2] = alpha * pA[2+bs*1];
			pC[2+bs*0] = alpha * pA[0+bs*2];
			pC[2+bs*1] = alpha * pA[1+bs*2];
			pC[2+bs*2] = alpha * pA[2+bs*2];
			pC[2+bs*3] = alpha * pA[3+bs*2];
			pA += 3*bs;
			pC += 3;
			kernel_dgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
			}
		else if(tna==2)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pA += 2*bs;
			pC += 2 + (sdc-1)*bs;
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[0+bs*1] = alpha * pA[1+bs*0];
			pC[0+bs*2] = alpha * pA[2+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pC[1+bs*2] = alpha * pA[2+bs*1];
			pC[1+bs*3] = alpha * pA[3+bs*1];
			pA += 2*bs;
			pC += 2;
			kernel_dgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
			}
		else //if(tna==3)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pC[2+bs*0] = alpha * pA[0+bs*2];
			pC[2+bs*1] = alpha * pA[1+bs*2];
			pC[2+bs*2] = alpha * pA[2+bs*2];
			pA += 3*bs;
			pC += 3 + (sdc-1)*bs;
			kernel_dgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
//			pC[0+bs*0] = alpha * pA[0+bs*0];
//			pC[0+bs*1] = alpha * pA[1+bs*0];
//			pC[0+bs*2] = alpha * pA[2+bs*0];
//			pC[0+bs*3] = alpha * pA[3+bs*0];
			pA += bs;
			pC += 1;
//			kernel_dgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
			}
		pA += bs*sda;
		pC += bs*bs;
		}

	// clean-up at the end
	if(ii==m)
		return;

	if(m-ii==1)
		{
		pC[0+bs*0] = alpha * pA[0+bs*0];
		}
	else if(m-ii==2)
		{
		if(tna!=1)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			}
		else //if(tna==1)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pA += bs;
			pC += 1 + (sdc-1)*bs;
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[0+bs*1] = alpha * pA[1+bs*0];
			}
		}
	else if(m-ii==3)
		{
		if(tna==0 || tna==3)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pC[2+bs*0] = alpha * pA[0+bs*2];
			pC[2+bs*1] = alpha * pA[1+bs*2];
			pC[2+bs*2] = alpha * pA[2+bs*2];
			}
		else if(tna==1)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pA += bs;
			pC += 1 + (sdc-1)*bs;
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[0+bs*1] = alpha * pA[1+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pC[1+bs*2] = alpha * pA[2+bs*1];
			}
		else //if(tna==2)
			{
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[1+bs*0] = alpha * pA[0+bs*1];
			pC[1+bs*1] = alpha * pA[1+bs*1];
			pA += 2*bs;
			pC += 2 + (sdc-1)*bs;
			pC[0+bs*0] = alpha * pA[0+bs*0];
			pC[0+bs*1] = alpha * pA[1+bs*0];
			pC[0+bs*2] = alpha * pA[2+bs*0];
			}
		}

	return;

	}



// regularize diagonal
void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pD[ll+bs*ll] += reg;
			}
		pD += kna + bs*(sdd-1) + kna*bs;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[jj*sdd+(jj+0)*bs+0] += reg;
		pD[jj*sdd+(jj+1)*bs+1] += reg;
		pD[jj*sdd+(jj+2)*bs+2] += reg;
		pD[jj*sdd+(jj+3)*bs+3] += reg;
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
		}

	}
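
/*
 * Note on the diagonal addressing shared by the ddia* routines: after the
 * initial corner of kna = (bs-offset%bs)%bs elements completes the current
 * panel (and pD is advanced by kna + bs*(sdd-1) + kna*bs), diagonal entry
 * jj+ll sits at pD[jj*sdd + (jj+ll)*bs + ll]: since jj grows in steps of
 * bs=4, jj*sdd jumps whole panels of bs*sdd doubles, (jj+ll)*bs selects
 * the column inside the panel and ll the row.
 */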



// insert sqrt of vector to diagonal
void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pD[ll+bs*ll] = sqrt(x[ll]);
			}
		pD += kna + bs*(sdd-1) + kna*bs;
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
		pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
		pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
		pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
		}

	}



// extract diagonal to vector
void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			x[ll] = alpha * pD[ll+bs*ll];
			}
		pD += kna + bs*(sdd-1) + kna*bs;
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
		x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
		x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
		x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
		}

	}



// add scaled vector to diagonal
void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pD[ll+bs*ll] += alpha * x[ll];
			}
		pD += kna + bs*(sdd-1) + kna*bs;
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
		pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
		pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
		pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
		}

	}



// insert vector to diagonal, sparse formulation
void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
		}

	}



// extract diagonal to vector, sparse formulation
void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
		}

	}



// add scaled vector to diagonal, sparse formulation
void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
		}

	}



// add scaled vector to another vector and insert to diagonal, sparse formulation
void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
		}

	}



// insert vector to row
void drowin_lib(int kmax, double alpha, double *x, double *pD)
	{

	const int bs = 4;

	int jj, ll;

	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[(jj+0)*bs] = alpha*x[jj+0];
		pD[(jj+1)*bs] = alpha*x[jj+1];
		pD[(jj+2)*bs] = alpha*x[jj+2];
		pD[(jj+3)*bs] = alpha*x[jj+3];
		}
	for(; jj<kmax; jj++)
		{
		pD[(jj)*bs] = alpha*x[jj];
		}

	}



// extract row to vector
void drowex_lib(int kmax, double alpha, double *pD, double *x)
	{

	const int bs = 4;

	int jj, ll;

	for(jj=0; jj<kmax-3; jj+=4)
		{
		x[jj+0] = alpha*pD[(jj+0)*bs];
		x[jj+1] = alpha*pD[(jj+1)*bs];
		x[jj+2] = alpha*pD[(jj+2)*bs];
		x[jj+3] = alpha*pD[(jj+3)*bs];
		}
	for(; jj<kmax; jj++)
		{
		x[jj] = alpha*pD[(jj)*bs];
		}

	}



// add scaled vector to row
void drowad_lib(int kmax, double alpha, double *x, double *pD)
	{

	const int bs = 4;

	int jj, ll;

	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[(jj+0)*bs] += alpha * x[jj+0];
		pD[(jj+1)*bs] += alpha * x[jj+1];
		pD[(jj+2)*bs] += alpha * x[jj+2];
		pD[(jj+3)*bs] += alpha * x[jj+3];
		}
	for(; jj<kmax; jj++)
		{
		pD[(jj)*bs] += alpha * x[jj];
		}

	}



// insert vector to row, sparse formulation
void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii*bs] = alpha*x[jj];
		}

	}



// add scaled vector to row, sparse formulation
void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii*bs] += alpha * x[jj];
		}

	}



// add scaled vector to another vector and insert to row, sparse formulation
void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii*bs] = y[jj] + alpha * x[jj];
		}

	}



// extract vector from column
void dcolex_lib(int kmax, int offset, double *pD, int sdd, double *x)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			x[ll] = pD[ll];
			}

		pD += kna + bs*(sdd-1);
		x  += kna;
		kmax -= kna;
		}

	for(jj=0; jj<kmax-3; jj+=4)
		{
		x[jj+0] = pD[jj*sdd+0];
		x[jj+1] = pD[jj*sdd+1];
		x[jj+2] = pD[jj*sdd+2];
		x[jj+3] = pD[jj*sdd+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		x[jj+ll] = pD[jj*sdd+ll];
		}

	}



// insert vector to column
void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pD[ll] = x[ll];
			}
		pD += kna + bs*(sdd-1);
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[jj*sdd+0] = x[jj+0];
		pD[jj*sdd+1] = x[jj+1];
		pD[jj*sdd+2] = x[jj+2];
		pD[jj*sdd+3] = x[jj+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pD[jj*sdd+ll] = x[jj+ll];
		}

	}



// add scaled vector to column
void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
	{

	const int bs = 4;

	int kna = (bs-offset%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pD[ll] += alpha * x[ll];
			}
		pD += kna + bs*(sdd-1);
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pD[jj*sdd+0] += alpha * x[jj+0];
		pD[jj*sdd+1] += alpha * x[jj+1];
		pD[jj*sdd+2] += alpha * x[jj+2];
		pD[jj*sdd+3] += alpha * x[jj+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pD[jj*sdd+ll] += alpha * x[jj+ll];
		}

	}



// insert vector to column, sparse formulation
void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
		}

	}



// add scaled vector to column, sparse formulation
void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd)
	{

	const int bs = 4;

	int ii, jj;

	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
		}

	}



// swaps two cols
void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
	{

	const int bs = 4;

	int ii;

	double tmp;

	if(offsetA==offsetC)
		{
		if(offsetA>0)
			{
			ii = 0;
			for(; ii<bs-offsetA; ii++)
				{
				tmp = pA[0+bs*0];
				pA[0+bs*0] = pC[0+bs*0];
				pC[0+bs*0] = tmp;
				pA += 1;
				pC += 1;
				}
			pA += bs*(sda-1);
			pC += bs*(sdc-1);
			kmax -= bs-offsetA;
			}
		ii = 0;
		for(; ii<kmax-3; ii+=4)
			{
			tmp = pA[0+bs*0];
			pA[0+bs*0] = pC[0+bs*0];
			pC[0+bs*0] = tmp;
			tmp = pA[1+bs*0];
			pA[1+bs*0] = pC[1+bs*0];
			pC[1+bs*0] = tmp;
			tmp = pA[2+bs*0];
			pA[2+bs*0] = pC[2+bs*0];
			pC[2+bs*0] = tmp;
			tmp = pA[3+bs*0];
			pA[3+bs*0] = pC[3+bs*0];
			pC[3+bs*0] = tmp;
			pA += bs*sda;
			pC += bs*sdc;
			}
		for(; ii<kmax; ii++)
			{
			tmp = pA[0+bs*0];
			pA[0+bs*0] = pC[0+bs*0];
			pC[0+bs*0] = tmp;
			pA += 1;
			pC += 1;
			}
		}
	else
		{
		printf("\ndcolsw: feature not implemented yet: offsetA!=offsetC\n\n");
		exit(1);
		}

	return;

	}



// insert vector to vector, sparse formulation
void dvecin_libsp(int kmax, int *idx, double *x, double *y)
	{

	int jj;

	for(jj=0; jj<kmax; jj++)
		{
		y[idx[jj]] = x[jj];
		}

	}



// adds vector to vector, sparse formulation
void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y)
	{

	int jj;

	for(jj=0; jj<kmax; jj++)
		{
		y[idx[jj]] += alpha * x[jj];
		}

	}



/*
 * STRMAT interface (General to specific interface)
 *
 */

#if defined(LA_HIGH_PERFORMANCE)



// return the memory size (in bytes) needed for a strmat
int blasfeo_memsize_dmat(int m, int n)
	{
	const int bs = 4;
	int nc = D_NC;
	int al = bs*nc;
	int pm = (m+bs-1)/bs*bs;
	int cn = (n+nc-1)/nc*nc;
	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
	int memsize = (pm*cn+tmp)*sizeof(double);
	return memsize;
	}



// return the memory size (in bytes) needed for the diagonal of a strmat
int blasfeo_memsize_diag_dmat(int m, int n)
	{
	const int bs = 4;
	int nc = D_NC;
	int al = bs*nc;
	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
	int memsize = tmp*sizeof(double);
	return memsize;
	}



// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
void blasfeo_create_dmat(int m, int n, struct blasfeo_dmat *sA, void *memory)
	{
	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int nc = D_NC;
	int al = bs*nc;
	sA->m = m;
	sA->n = n;
	int pm = (m+bs-1)/bs*bs;
	int cn = (n+nc-1)/nc*nc;
	sA->pm = pm;
	sA->cn = cn;
	double *ptr = (double *) memory;
	sA->pA = ptr;
	ptr += pm*cn;
	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
	sA->dA = ptr;
	ptr += tmp;
	sA->memsize = (pm*cn+tmp)*sizeof(double);
	return;
	}
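
/*
 * Typical creation sequence (illustrative sketch, not part of the API):
 * the chunk must hold blasfeo_memsize_dmat(m, n) bytes, and for the AVX
 * paths in this file the panels are assumed to be 32-byte aligned, so an
 * aligned allocator is advisable:
 *
 *   struct blasfeo_dmat sA;
 *   void *mem;
 *   posix_memalign(&mem, 64, blasfeo_memsize_dmat(m, n));
 *   blasfeo_create_dmat(m, n, &sA, mem);
 *   // ... use sA ...
 *   free(mem);
 */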



// return memory size (in bytes) needed for a strvec
int blasfeo_memsize_dvec(int m)
	{
	const int bs = 4;
//	int nc = D_NC;
//	int al = bs*nc;
	int pm = (m+bs-1)/bs*bs;
	int memsize = pm*sizeof(double);
	return memsize;
	}



// create a vector structure for a vector of size m by using memory passed by a pointer
void blasfeo_create_dvec(int m, struct blasfeo_dvec *sa, void *memory)
	{
	const int bs = 4;
//	int nc = D_NC;
//	int al = bs*nc;
	sa->m = m;
	int pm = (m+bs-1)/bs*bs;
	sa->pm = pm;
	double *ptr = (double *) memory;
	sa->pa = ptr;
//	ptr += pm;
	sa->memsize = pm*sizeof(double);
	return;
	}
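
/*
 * Vectors follow the same pattern (sketch):
 *
 *   struct blasfeo_dvec sx;
 *   void *mem;
 *   posix_memalign(&mem, 64, blasfeo_memsize_dvec(m));
 *   blasfeo_create_dvec(m, &sx, mem);
 */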



// convert a matrix into a matrix structure
void blasfeo_pack_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
	{
	if(m<=0 || n<=0)
		return;

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
	int i, ii, j, jj, m0, m1, m2;
	double *B, *pB;
	sA->use_dA = 0;

	// row vector in sA
	if(m==1)
		{
		for(jj=0; jj<n; jj++)
			{
			pA[jj*bs] = A[jj*lda];
			}
		return;
		}

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	__m256d
		tmp;
#endif
	m0 = (bs-ai%bs)%bs;
	if(m0>m)
		m0 = m;
	m1 = m - m0;
	jj = 0;
	for( ; jj<n-3; jj+=4)
		{
		B  =  A + jj*lda;
		pB = pA + jj*bs;
		ii = 0;
		if(m0>0)
			{
			for( ; ii<m0; ii++)
				{
				pB[ii+bs*0] = B[ii+lda*0];
				pB[ii+bs*1] = B[ii+lda*1];
				pB[ii+bs*2] = B[ii+lda*2];
				pB[ii+bs*3] = B[ii+lda*3];
				}
			B  += m0;
			pB += m0 + bs*(sda-1);
			}
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for( ; ii<m-3; ii+=4)
			{
			tmp = _mm256_loadu_pd( &B[0+lda*0] );
			_mm256_store_pd( &pB[0+bs*0], tmp );
			tmp = _mm256_loadu_pd( &B[0+lda*1] );
			_mm256_store_pd( &pB[0+bs*1], tmp );
			tmp = _mm256_loadu_pd( &B[0+lda*2] );
			_mm256_store_pd( &pB[0+bs*2], tmp );
			tmp = _mm256_loadu_pd( &B[0+lda*3] );
			_mm256_store_pd( &pB[0+bs*3], tmp );
			B  += 4;
			pB += bs*sda;
			}
#else
		for( ; ii<m-3; ii+=4)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			pB[1+bs*0] = B[1+lda*0];
			pB[2+bs*0] = B[2+lda*0];
			pB[3+bs*0] = B[3+lda*0];
			// col 1
			pB[0+bs*1] = B[0+lda*1];
			pB[1+bs*1] = B[1+lda*1];
			pB[2+bs*1] = B[2+lda*1];
			pB[3+bs*1] = B[3+lda*1];
			// col 2
			pB[0+bs*2] = B[0+lda*2];
			pB[1+bs*2] = B[1+lda*2];
			pB[2+bs*2] = B[2+lda*2];
			pB[3+bs*2] = B[3+lda*2];
			// col 3
			pB[0+bs*3] = B[0+lda*3];
			pB[1+bs*3] = B[1+lda*3];
			pB[2+bs*3] = B[2+lda*3];
			pB[3+bs*3] = B[3+lda*3];
			// update
			B  += 4;
			pB += bs*sda;
			}
#endif
		for( ; ii<m; ii++)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			// col 1
			pB[0+bs*1] = B[0+lda*1];
			// col 2
			pB[0+bs*2] = B[0+lda*2];
			// col 3
			pB[0+bs*3] = B[0+lda*3];
			// update
			B  += 1;
			pB += 1;
			}
		}
	for( ; jj<n; jj++)
		{

		B  =  A + jj*lda;
		pB = pA + jj*bs;

		ii = 0;
		if(m0>0)
			{
			for( ; ii<m0; ii++)
				{
				pB[ii+bs*0] = B[ii+lda*0];
				}
			B  += m0;
			pB += m0 + bs*(sda-1);
			}
		for( ; ii<m-3; ii+=4)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			pB[1+bs*0] = B[1+lda*0];
			pB[2+bs*0] = B[2+lda*0];
			pB[3+bs*0] = B[3+lda*0];
			// update
			B  += 4;
			pB += bs*sda;
			}
		for( ; ii<m; ii++)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			// update
			B  += 1;
			pB += 1;
			}
		}
	return;
	}
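
/*
 * Packing sketch: copy the m-by-n column-major matrix A (leading dimension
 * lda) into the panel-major submatrix of sA starting at element (ai, aj);
 * sA must have been created with at least ai+m rows and aj+n columns:
 *
 *   blasfeo_pack_dmat(m, n, A, lda, &sA, ai, aj);
 *
 * blasfeo_unpack_dmat() at the end of this file performs the inverse copy.
 */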



// convert a lower triangular matrix into a matrix structure
// TODO vectorize triangle in the 4x4 diag blocks
void blasfeo_pack_l_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
	{
	if(m<=0 || n<=0)
		return;

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
	int i, ii, j, jj, m0, m1, m2;
	double *B, *pB;
	sA->use_dA = 0;

	// row vector in sA
	if(m==1)
		{
		for(jj=0; jj<n; jj++)
			{
			pA[jj*bs] = A[jj*lda];
			}
		return;
		}

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	__m256d
		tmp;
#endif
	m0 = (bs-ai%bs)%bs;
	if(m0>m)
		m0 = m;
	m1 = m - m0;
	if(m0>0)
		{
		printf("\nblasfeo_pack_l_dmat: feature not implemented yet: ai!=0\n");
		exit(1);
		}
	jj = 0;
	for( ; jj<n-3; jj+=4)
		{
		B  =  A + jj + jj*lda;
		pB = pA + jj*bs + jj*sda;
		ii = jj;
		// col 0
		pB[0+bs*0] = B[0+lda*0];
		pB[1+bs*0] = B[1+lda*0];
		pB[2+bs*0] = B[2+lda*0];
		pB[3+bs*0] = B[3+lda*0];
		// col 1
//		pB[0+bs*1] = B[0+lda*1];
		pB[1+bs*1] = B[1+lda*1];
		pB[2+bs*1] = B[2+lda*1];
		pB[3+bs*1] = B[3+lda*1];
		// col 2
//		pB[0+bs*2] = B[0+lda*2];
//		pB[1+bs*2] = B[1+lda*2];
		pB[2+bs*2] = B[2+lda*2];
		pB[3+bs*2] = B[3+lda*2];
		// col 3
//		pB[0+bs*3] = B[0+lda*3];
//		pB[1+bs*3] = B[1+lda*3];
//		pB[2+bs*3] = B[2+lda*3];
		pB[3+bs*3] = B[3+lda*3];
		B  += 4;
		pB += bs*sda;
		ii += 4;
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for( ; ii<m-3; ii+=4)
			{
			tmp = _mm256_loadu_pd( &B[0+lda*0] );
			_mm256_store_pd( &pB[0+bs*0], tmp );
			tmp = _mm256_loadu_pd( &B[0+lda*1] );
			_mm256_store_pd( &pB[0+bs*1], tmp );
			tmp = _mm256_loadu_pd( &B[0+lda*2] );
			_mm256_store_pd( &pB[0+bs*2], tmp );
			tmp = _mm256_loadu_pd( &B[0+lda*3] );
			_mm256_store_pd( &pB[0+bs*3], tmp );
			B  += 4;
			pB += bs*sda;
			}
#else
		for( ; ii<m-3; ii+=4)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			pB[1+bs*0] = B[1+lda*0];
			pB[2+bs*0] = B[2+lda*0];
			pB[3+bs*0] = B[3+lda*0];
			// col 1
			pB[0+bs*1] = B[0+lda*1];
			pB[1+bs*1] = B[1+lda*1];
			pB[2+bs*1] = B[2+lda*1];
			pB[3+bs*1] = B[3+lda*1];
			// col 2
			pB[0+bs*2] = B[0+lda*2];
			pB[1+bs*2] = B[1+lda*2];
			pB[2+bs*2] = B[2+lda*2];
			pB[3+bs*2] = B[3+lda*2];
			// col 3
			pB[0+bs*3] = B[0+lda*3];
			pB[1+bs*3] = B[1+lda*3];
			pB[2+bs*3] = B[2+lda*3];
			pB[3+bs*3] = B[3+lda*3];
			// update
			B  += 4;
			pB += bs*sda;
			}
#endif
		for( ; ii<m; ii++)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			// col 1
			pB[0+bs*1] = B[0+lda*1];
			// col 2
			pB[0+bs*2] = B[0+lda*2];
			// col 3
			pB[0+bs*3] = B[0+lda*3];
			// update
			B  += 1;
			pB += 1;
			}
		}
	if(jj<n)
		{
		B  =  A + jj + jj*lda;
		pB = pA + jj*bs + jj*sda;
		if(n-jj==1)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			}
		else if(n-jj==2)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			pB[1+bs*0] = B[1+lda*0];
			// col 1
//			pB[0+bs*1] = B[0+lda*1];
			pB[1+bs*1] = B[1+lda*1];
			}
		else //if(n-jj==3)
			{
			// col 0
			pB[0+bs*0] = B[0+lda*0];
			pB[1+bs*0] = B[1+lda*0];
			pB[2+bs*0] = B[2+lda*0];
			// col 1
//			pB[0+bs*1] = B[0+lda*1];
			pB[1+bs*1] = B[1+lda*1];
			pB[2+bs*1] = B[2+lda*1];
			// col 2
//			pB[0+bs*2] = B[0+lda*2];
//			pB[1+bs*2] = B[1+lda*2];
			pB[2+bs*2] = B[2+lda*2];
			}
		}
	return;
	}



// convert an upper triangular matrix into a matrix structure
// TODO vectorize triangle in the 4x4 diag blocks
blasfeo_pack_u_dmat(int m,int n,double * A,int lda,struct blasfeo_dmat * sA,int ai,int aj)1938 void blasfeo_pack_u_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
1939 	{
1940 	if(m<=0 || n<=0)
1941 		return;
1942 
1943 	// invalidate stored inverse diagonal
1944 	sA->use_dA = 0;
1945 
1946 	const int bs = 4;
1947 	int sda = sA->cn;
1948 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1949 	int i, ii, j, jj, m0, m1, m2;
1950 	double 	*B, *pB;
1951 	sA->use_dA = 0;
1952 
1953 	// row vector in sA
1954 	if(m==1)
1955 		{
1956 		for(jj=0; jj<n; jj++)
1957 			{
1958 			pA[jj*bs] = A[jj*lda];
1959 			}
1960 		return;
1961 		}
1962 
1963 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1964 	__m256d
1965 		tmp;
1966 #endif
1967 	m0 = (bs-ai%bs)%bs;
1968 	if(m0>m)
1969 		m0 = m;
1970 	m1 = m - m0;
1971 	jj = 0;
1972 	for( ; jj<n-3; jj+=4)
1973 		{
1974 		B  =  A + jj*lda;
1975 		pB = pA + jj*bs;
1976 		ii = 0;
1977 		if(m0>0)
1978 			{
1979 			for( ; ii<m0; ii++)
1980 				{
1981 				pB[ii+bs*0] = B[ii+lda*0];
1982 				pB[ii+bs*1] = B[ii+lda*1];
1983 				pB[ii+bs*2] = B[ii+lda*2];
1984 				pB[ii+bs*3] = B[ii+lda*3];
1985 				}
1986 			B  += m0;
1987 			pB += m0 + bs*(sda-1);
1988 			}
1989 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1990 		for( ; ii<jj-3; ii+=4)
1991 			{
1992 			tmp = _mm256_loadu_pd( &B[0+lda*0] );
1993 			_mm256_store_pd( &pB[0+bs*0], tmp );
1994 			tmp = _mm256_loadu_pd( &B[0+lda*1] );
1995 			_mm256_store_pd( &pB[0+bs*1], tmp );
1996 			tmp = _mm256_loadu_pd( &B[0+lda*2] );
1997 			_mm256_store_pd( &pB[0+bs*2], tmp );
1998 			tmp = _mm256_loadu_pd( &B[0+lda*3] );
1999 			_mm256_store_pd( &pB[0+bs*3], tmp );
2000 			B  += 4;
2001 			pB += bs*sda;
2002 			}
2003 #else
2004 		for( ; ii<jj-3; ii+=4)
2005 			{
2006 			// col 0
2007 			pB[0+bs*0] = B[0+lda*0];
2008 			pB[1+bs*0] = B[1+lda*0];
2009 			pB[2+bs*0] = B[2+lda*0];
2010 			pB[3+bs*0] = B[3+lda*0];
2011 			// col 1
2012 			pB[0+bs*1] = B[0+lda*1];
2013 			pB[1+bs*1] = B[1+lda*1];
2014 			pB[2+bs*1] = B[2+lda*1];
2015 			pB[3+bs*1] = B[3+lda*1];
2016 			// col 2
2017 			pB[0+bs*2] = B[0+lda*2];
2018 			pB[1+bs*2] = B[1+lda*2];
2019 			pB[2+bs*2] = B[2+lda*2];
2020 			pB[3+bs*2] = B[3+lda*2];
2021 			// col 3
2022 			pB[0+bs*3] = B[0+lda*3];
2023 			pB[1+bs*3] = B[1+lda*3];
2024 			pB[2+bs*3] = B[2+lda*3];
2025 			pB[3+bs*3] = B[3+lda*3];
2026 			// update
2027 			B  += 4;
2028 			pB += bs*sda;
2029 			}
2030 #endif
2031 		// col 0
2032 		pB[0+bs*0] = B[0+lda*0];
2033 //		pB[1+bs*0] = B[1+lda*0];
2034 //		pB[2+bs*0] = B[2+lda*0];
2035 //		pB[3+bs*0] = B[3+lda*0];
2036 		// col 1
2037 		pB[0+bs*1] = B[0+lda*1];
2038 		pB[1+bs*1] = B[1+lda*1];
2039 //		pB[2+bs*1] = B[2+lda*1];
2040 //		pB[3+bs*1] = B[3+lda*1];
2041 		// col 2
2042 		pB[0+bs*2] = B[0+lda*2];
2043 		pB[1+bs*2] = B[1+lda*2];
2044 		pB[2+bs*2] = B[2+lda*2];
2045 //		pB[3+bs*2] = B[3+lda*2];
2046 		// col 3
2047 		pB[0+bs*3] = B[0+lda*3];
2048 		pB[1+bs*3] = B[1+lda*3];
2049 		pB[2+bs*3] = B[2+lda*3];
2050 		pB[3+bs*3] = B[3+lda*3];
2051 		}
2052 	for( ; jj<n; jj++)
2053 		{
2054 
2055 		B  =  A + jj*lda;
2056 		pB = pA + jj*bs;
2057 
2058 		ii = 0;
2059 		if(m0>0)
2060 			{
2061 			for( ; ii<m0; ii++)
2062 				{
2063 				pB[ii+bs*0] = B[ii+lda*0];
2064 				}
2065 			B  += m0;
2066 			pB += m0 + bs*(sda-1);
2067 			}
2068 		for( ; ii<jj-3; ii+=4)
2069 			{
2070 			// col 0
2071 			pB[0+bs*0] = B[0+lda*0];
2072 			pB[1+bs*0] = B[1+lda*0];
2073 			pB[2+bs*0] = B[2+lda*0];
2074 			pB[3+bs*0] = B[3+lda*0];
2075 			// update
2076 			B  += 4;
2077 			pB += bs*sda;
2078 			}
2079 		for( ; ii<=jj; ii++)
2080 			{
2081 			// col 0
2082 			pB[0+bs*0] = B[0+lda*0];
2083 			// update
2084 			B  += 1;
2085 			pB += 1;
2086 			}
2087 		}
2088 	return;
2089 	}
2090 
2091 
2092 
// convert and transpose a matrix into a matrix structure
void blasfeo_pack_tran_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
	int i, ii, j, m0, m1, m2;
	double *B, *pB;

	// row vector in sA
	if(n==1)
		{
		for(ii=0; ii<m; ii++)
			{
			pA[ii*bs] = A[ii];
			}
		return;
		}

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	__m256d
		v0, v1, v2, v3,
		v4, v5, v6, v7;
#endif
	m0 = (bs-ai%bs)%bs;
	if(m0>n)
		m0 = n;
	m1 = n - m0;
	ii = 0;
	if(m0>0)
		{
		for(j=0; j<m; j++)
			{
			for(i=0; i<m0; i++)
				{
				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
				}
			}
		A  += m0*lda;
		pA += m0 + bs*(sda-1);
		}
	ii = 0;
	for(; ii<m1-3; ii+=bs)
		{
		j=0;
		B  = A + ii*lda;
		pB = pA + ii*sda;
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; j<m-3; j+=4)
			{
			v0 = _mm256_loadu_pd( &B[0+0*lda] ); // 00 10 20 30
			v1 = _mm256_loadu_pd( &B[0+1*lda] ); // 01 11 21 31
			v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
			v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
			v2 = _mm256_loadu_pd( &B[0+2*lda] ); // 02 12 22 32
			v3 = _mm256_loadu_pd( &B[0+3*lda] ); // 03 13 23 33
			v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
			v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

			B += 4;

			v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
			_mm256_store_pd( &pB[0+bs*0], v0 );
			v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
			_mm256_store_pd( &pB[0+bs*2], v2 );
			v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
			_mm256_store_pd( &pB[0+bs*1], v1 );
			v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
			_mm256_store_pd( &pB[0+bs*3], v3 );

			pB += 4*bs;
			}
#else
		for(; j<m-3; j+=4)
			{
			// unroll 0
			pB[0+0*bs] = B[0+0*lda];
			pB[1+0*bs] = B[0+1*lda];
			pB[2+0*bs] = B[0+2*lda];
			pB[3+0*bs] = B[0+3*lda];
			// unroll 1
			pB[0+1*bs] = B[1+0*lda];
			pB[1+1*bs] = B[1+1*lda];
			pB[2+1*bs] = B[1+2*lda];
			pB[3+1*bs] = B[1+3*lda];
			// unroll 2
			pB[0+2*bs] = B[2+0*lda];
			pB[1+2*bs] = B[2+1*lda];
			pB[2+2*bs] = B[2+2*lda];
			pB[3+2*bs] = B[2+3*lda];
			// unroll 3
			pB[0+3*bs] = B[3+0*lda];
			pB[1+3*bs] = B[3+1*lda];
			pB[2+3*bs] = B[3+2*lda];
			pB[3+3*bs] = B[3+3*lda];
			B  += 4;
			pB += 4*bs;
			}
#endif
		for(; j<m; j++)
			{
			// unroll 0
			pB[0+0*bs] = B[0+0*lda];
			pB[1+0*bs] = B[0+1*lda];
			pB[2+0*bs] = B[0+2*lda];
			pB[3+0*bs] = B[0+3*lda];
			B  += 1;
			pB += 1*bs;
			}
		}
	if(ii<m1)
		{
		m2 = m1-ii;
		if(bs<m2) m2 = bs;
		for(j=0; j<m; j++)
			{
			for(i=0; i<m2; i++)
				{
				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
				}
			}
		}
	return;
	}
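
// Example (illustrative sketch, not part of the library): pack the transpose
// of a 5x3 column-major matrix into a blasfeo_dmat. Assumes the dynamic-memory
// helpers blasfeo_allocate_dmat/blasfeo_free_dmat from blasfeo_d_aux_ext_dep.h;
// the function name example_pack_tran is hypothetical.
#if 0
void example_pack_tran()
	{
	double A[5*3] = {0}; // column-major, lda=5
	struct blasfeo_dmat sB;
	blasfeo_allocate_dmat(3, 5, &sB); // holds A^T, so the dimensions swap
	blasfeo_pack_tran_dmat(5, 3, A, 5, &sB, 0, 0); // sB(j,i) = A(i,j)
	blasfeo_free_dmat(&sB);
	}
#endif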



// convert a vector into a vector structure
void blasfeo_pack_dvec(int m, double *a, struct blasfeo_dvec *sa, int ai)
	{
	double *pa = sa->pa + ai;
	int ii;
	for(ii=0; ii<m; ii++)
		pa[ii] = a[ii];
	return;
	}



// convert a matrix structure into a matrix
void blasfeo_unpack_dmat(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, double *A, int lda)
	{
	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
	int i, ii, jj;
	int m0 = (bs-ai%bs)%bs;
	if(m0>m)
		m0 = m;
	double *ptr_pA;

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	__m256d
		tmp;
#endif

	jj=0;
	for(; jj<n-3; jj+=4)
		{
		ptr_pA = pA + jj*bs;
		ii = 0;
		if(m0>0)
			{
			for(; ii<m0; ii++)
				{
				// unroll 0
				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
				// unroll 1
				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
				// unroll 2
				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
				// unroll 3
				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
				ptr_pA++;
				}
			ptr_pA += (sda-1)*bs;
			}
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; ii<m-bs+1; ii+=bs)
			{
			tmp = _mm256_load_pd( &ptr_pA[0+bs*0] );
			_mm256_storeu_pd( &A[ii+lda*(jj+0)], tmp );
			tmp = _mm256_load_pd( &ptr_pA[0+bs*1] );
			_mm256_storeu_pd( &A[ii+lda*(jj+1)], tmp );
			tmp = _mm256_load_pd( &ptr_pA[0+bs*2] );
			_mm256_storeu_pd( &A[ii+lda*(jj+2)], tmp );
			tmp = _mm256_load_pd( &ptr_pA[0+bs*3] );
			_mm256_storeu_pd( &A[ii+lda*(jj+3)], tmp );
			ptr_pA += sda*bs;
			}
#else
		for(; ii<m-bs+1; ii+=bs)
			{
			// unroll 0
			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
			// unroll 1
			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
			// unroll 2
			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
			// unroll 3
			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
			ptr_pA += sda*bs;
			}
#endif
		for(; ii<m; ii++)
			{
			// unroll 0
			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
			// unroll 1
			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
			// unroll 2
			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
			// unroll 3
			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
			ptr_pA++;
			}
		}
	for(; jj<n; jj++)
		{
		ptr_pA = pA + jj*bs;
		ii = 0;
		if(m0>0)
			{
			for(; ii<m0; ii++)
				{
				A[ii+lda*jj] = ptr_pA[0];
				ptr_pA++;
				}
			ptr_pA += (sda-1)*bs;
			}
		for(; ii<m-bs+1; ii+=bs)
			{
			A[0+ii+lda*jj] = ptr_pA[0];
			A[1+ii+lda*jj] = ptr_pA[1];
			A[2+ii+lda*jj] = ptr_pA[2];
			A[3+ii+lda*jj] = ptr_pA[3];
			ptr_pA += sda*bs;
			}
		for(; ii<m; ii++)
			{
			A[ii+lda*jj] = ptr_pA[0];
			ptr_pA++;
			}
		}
	return;
	}
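
// Example (illustrative sketch): a pack/unpack round trip. blasfeo_pack_dmat
// is the non-transposed packing routine defined earlier in this file;
// blasfeo_allocate_dmat/blasfeo_free_dmat are assumed from the ext_dep API
// and the function name is hypothetical. B ends up bit-identical to A.
#if 0
void example_pack_roundtrip()
	{
	double A[4*4], B[4*4];
	struct blasfeo_dmat sA;
	int ii;
	for(ii=0; ii<16; ii++) A[ii] = ii;
	blasfeo_allocate_dmat(4, 4, &sA);
	blasfeo_pack_dmat(4, 4, A, 4, &sA, 0, 0);   // column-major -> panel-major
	blasfeo_unpack_dmat(4, 4, &sA, 0, 0, B, 4); // panel-major -> column-major
	blasfeo_free_dmat(&sA);
	}
#endif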



// convert and transpose a matrix structure into a matrix
void blasfeo_unpack_tran_dmat(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, double *A, int lda)
	{
	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
	int i, ii, jj;
	int m0 = (bs-ai%bs)%bs;
	if(m0>m)
		m0 = m;
	double *ptr_pA;

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	__m256d
		v0, v1, v2, v3,
		v4, v5, v6, v7;
#endif

	jj=0;
	for(; jj<n-3; jj+=4)
		{
		ptr_pA = pA + jj*bs;
		ii = 0;
		if(m0>0)
			{
			for(; ii<m0; ii++)
				{
				// unroll 0
				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
				// unroll 1
				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
				// unroll 2
				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
				// unroll 3
				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
				ptr_pA++;
				}
			ptr_pA += (sda-1)*bs;
			}
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; ii<m-bs+1; ii+=bs)
			{
			v0 = _mm256_load_pd( &ptr_pA[0+bs*0] ); // 00 10 20 30
			v1 = _mm256_load_pd( &ptr_pA[0+bs*1] ); // 01 11 21 31
			v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
			v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
			v2 = _mm256_load_pd( &ptr_pA[0+bs*2] ); // 02 12 22 32
			v3 = _mm256_load_pd( &ptr_pA[0+bs*3] ); // 03 13 23 33
			v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
			v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

			v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
			_mm256_storeu_pd( &A[jj+lda*(ii+0)], v0 );
			v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
			_mm256_storeu_pd( &A[jj+lda*(ii+2)], v2 );
			v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
			_mm256_storeu_pd( &A[jj+lda*(ii+1)], v1 );
			v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
			_mm256_storeu_pd( &A[jj+lda*(ii+3)], v3 );

			ptr_pA += sda*bs;
			}
#else
		for(; ii<m-bs+1; ii+=bs)
			{
			// unroll 0
			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
			// unroll 1
			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
			// unroll 2
			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
			// unroll 3
			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
			ptr_pA += sda*bs;
			}
#endif
		for(; ii<m; ii++)
			{
			// unroll 0
			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
			// unroll 1
			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
			// unroll 2
			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
			// unroll 3
			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
			ptr_pA++;
			}
		}
	for(; jj<n; jj++)
		{
		ptr_pA = pA + jj*bs;
		ii = 0;
		if(m0>0)
			{
			for(; ii<m0; ii++)
				{
				A[jj+lda*ii] = ptr_pA[0];
				ptr_pA++;
				}
			ptr_pA += (sda-1)*bs;
			}
		for(; ii<m-bs+1; ii+=bs)
			{
			i=0;
			for(; i<bs; i++)
				{
				A[jj+lda*(i+ii)] = ptr_pA[0];
				ptr_pA++;
				}
			ptr_pA += (sda-1)*bs;
			}
		for(; ii<m; ii++)
			{
			A[jj+lda*ii] = ptr_pA[0];
			ptr_pA++;
			}
		}
	return;
	}



// convert a vector structure into a vector
void blasfeo_unpack_dvec(int m, struct blasfeo_dvec *sa, int ai, double *a)
	{
	double *pa = sa->pa + ai;
	int ii;
	for(ii=0; ii<m; ii++)
		a[ii] = pa[ii];
	return;
	}



// cast a matrix into a matrix structure
void d_cast_mat2strmat(double *A, struct blasfeo_dmat *sA)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	sA->pA = A;
	return;
	}



// cast a matrix into the diagonal of a matrix structure
void d_cast_diag_mat2strmat(double *dA, struct blasfeo_dmat *sA)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	sA->dA = dA;
	return;
	}



// cast a vector into a vector structure
void d_cast_vec2vecmat(double *a, struct blasfeo_dvec *sa)
	{
	sa->pa = a;
	return;
	}
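
// Note on the three casts above: they are shallow. Only the pA/dA/pa pointer
// is redirected, no data is copied and the struct does not take ownership of
// the memory. In particular d_cast_mat2strmat expects memory that is already
// in panel-major layout (including panel padding), not a column-major matrix.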



// insert element into strmat
void blasfeo_dgein1(double a, struct blasfeo_dmat *sA, int ai, int aj)
	{

	if (ai==aj)
		{
		// invalidate stored inverse diagonal
		sA->use_dA = 0;
		}

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	pA[0] = a;
	return;
	}



// extract element from strmat
double blasfeo_dgeex1(struct blasfeo_dmat *sA, int ai, int aj)
	{
	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	return pA[0];
	}
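
// Note on the indexing used throughout this file: with panel height bs=4,
// element (ai,aj) lives in panel ai/bs; skipping the whole panels above it
// costs ai/bs*bs*sda doubles, the column inside the panel starts at aj*bs,
// and ai%bs selects the row within the panel. E.g. with sda=8, (ai,aj)=(6,2)
// maps to offset 6/4*4*8 + 2*4 + 6%4 = 32 + 8 + 2 = 42.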



// insert element into strvec
void blasfeo_dvecin1(double a, struct blasfeo_dvec *sx, int xi)
	{
	double *x = sx->pa + xi;
	x[0] = a;
	return;
	}



// extract element from strvec
double blasfeo_dvecex1(struct blasfeo_dvec *sx, int xi)
	{
	double *x = sx->pa + xi;
	return x[0];
	}



// set all elements of a strmat to a value
void blasfeo_dgese(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
	int ii, jj;
	if(m0>0)
		{
		for(ii=0; ii<m0; ii++)
			{
			for(jj=0; jj<n; jj++)
				{
				pA[jj*bs] = alpha;
				}
			pA += 1;
			}
		pA += bs*(sda-1);
		m -= m0;
		}
	for(ii=0; ii<m-3; ii+=4)
		{
		for(jj=0; jj<n; jj++)
			{
			pA[0+jj*bs] = alpha;
			pA[1+jj*bs] = alpha;
			pA[2+jj*bs] = alpha;
			pA[3+jj*bs] = alpha;
			}
		pA += bs*sda;
		}
	for( ; ii<m; ii++)
		{
		for(jj=0; jj<n; jj++)
			{
			pA[jj*bs] = alpha;
			}
		pA += 1;
		}
	return;
	}
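
// Example (sketch): blasfeo_dgese(3, 2, 0.0, &sA, 1, 2) zeroes the 3x2 block
// of sA starting at row 1, column 2; the m0 loop above handles the rows of
// the partially covered first panel (ai%bs != 0) one at a time.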



// set all elements of a strvec to a value
void blasfeo_dvecse(int m, double alpha, struct blasfeo_dvec *sx, int xi)
	{
	double *x = sx->pa + xi;
	int ii;
	for(ii=0; ii<m; ii++)
		x[ii] = alpha;
	return;
	}



// insert a vector into the diagonal
void blasfeo_ddiain(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	int offsetA = ai%bs;

	int kna = (bs-offsetA%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pA[ll+bs*ll] = alpha*x[ll];
			}
		pA += kna + bs*(sda-1) + kna*bs;
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
		pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
		pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
		pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
		}
	return;
	}



// add scalar to diagonal
void blasfeo_ddiare(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int offsetA = ai%bs;

	int kna = (bs-offsetA%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pA[ll+bs*ll] += alpha;
			}
		pA += kna + bs*(sda-1) + kna*bs;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pA[jj*sda+(jj+0)*bs+0] += alpha;
		pA[jj*sda+(jj+1)*bs+1] += alpha;
		pA[jj*sda+(jj+2)*bs+2] += alpha;
		pA[jj*sda+(jj+3)*bs+3] += alpha;
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pA[jj*sda+(jj+ll)*bs+ll] += alpha;
		}
	return;
	}
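
// Example (sketch): a typical use in embedded optimization is Tikhonov-style
// regularization of an n x n Hessian block, H <- H + 1e-6*I:
//   blasfeo_ddiare(n, 1e-6, &sH, 0, 0);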



// swap two rows of two matrix structs
void blasfeo_drowsw(int kmax, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;
	sC->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdc = sC->cn;
	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
	kernel_drowsw_lib4(kmax, pA, pC);
	return;
	}



// permute the rows of a matrix struct
void blasfeo_drowpe(int kmax, int *ipiv, struct blasfeo_dmat *sA)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		if(ipiv[ii]!=ii)
			blasfeo_drowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
		}
	return;
	}



// inverse permute the rows of a matrix struct
void blasfeo_drowpei(int kmax, int *ipiv, struct blasfeo_dmat *sA)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	int ii;
	for(ii=kmax-1; ii>=0; ii--)
		{
		if(ipiv[ii]!=ii)
			blasfeo_drowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
		}
	return;
	}
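
// Example (illustrative sketch): applying LU row pivots to a right-hand side
// and undoing them afterwards. The factorization routine name is an
// assumption here (blasfeo_dgetrf_rp in recent BLASFEO releases), and the
// example function is hypothetical; ipiv must hold at least m entries.
#if 0
void example_row_pivots(struct blasfeo_dmat *sLU, struct blasfeo_dmat *sB, int *ipiv, int m)
	{
	// factorize in place with row pivoting: P * LU = A
	blasfeo_dgetrf_rp(m, m, sLU, 0, 0, sLU, 0, 0, ipiv);
	blasfeo_drowpe(m, ipiv, sB);  // apply the same row permutation to the rhs
	// ... triangular solves with sLU ...
	blasfeo_drowpei(m, ipiv, sB); // inverse permutation restores the order
	}
#endif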



// extract a row into a vector
void blasfeo_drowex(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi)
	{
	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	drowex_lib(kmax, alpha, pA, x);
	return;
	}



// insert a vector into a row
void blasfeo_drowin(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	drowin_lib(kmax, alpha, x, pA);
	return;
	}



// add a vector to a row
void blasfeo_drowad(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	drowad_lib(kmax, alpha, x, pA);
	return;
	}



// extract vector from column
void blasfeo_dcolex(int kmax, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	dcolex_lib(kmax, ai%bs, pA, sda, x);
	return;
	}



// insert a vector as a column
void blasfeo_dcolin(int kmax, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	dcolin_lib(kmax, x, ai%bs, pA, sda);
	return;
	}



// add scaled vector to column
void blasfeo_dcolad(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	dcolad_lib(kmax, alpha, x, ai%bs, pA, sda);
	return;
	}



// scale a column
void blasfeo_dcolsc(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;

	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;

	int kna = (bs-ai%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pA[ll] *= alpha;
			}
		pA += kna + bs*(sda-1);
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pA[jj*sda+0] *= alpha;
		pA[jj*sda+1] *= alpha;
		pA[jj*sda+2] *= alpha;
		pA[jj*sda+3] *= alpha;
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pA[jj*sda+ll] *= alpha;
		}

	return;
	}



// swap two cols of two matrix structs
void blasfeo_dcolsw(int kmax, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;
	sC->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdc = sC->cn;
	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
	dcolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
	return;
	}



// permute the cols of a matrix struct
void blasfeo_dcolpe(int kmax, int *ipiv, struct blasfeo_dmat *sA)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		if(ipiv[ii]!=ii)
			blasfeo_dcolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
		}
	return;
	}



// inverse permute the cols of a matrix struct
void blasfeo_dcolpei(int kmax, int *ipiv, struct blasfeo_dmat *sA)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	int ii;
	for(ii=kmax-1; ii>=0; ii--)
		{
		if(ipiv[ii]!=ii)
			blasfeo_dcolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
		}
	return;
	}



// copy a generic strmat into a generic strmat
void blasfeo_dgecp(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
	{

	// invalidate stored inverse diagonal
	sB->use_dA = 0;

	const int bs = 4;
	const double alpha = 1.0;

	// get submatrices
	int sda = sA->cn;
	double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdb = sB->cn;
	double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;

	if(m<=0 || n<=0)
		return;

	int mna, ii;

	// compute offset from closest panels
	int offA = ai%bs;
	int offB = bi%bs;

	// A at the beginning of the block
	A -= offA;
	// B at the beginning of the block
	B -= offB;

	// same alignment
	if(offA==offB)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecp_8_0_lib4(0, n, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecp_4_0_lib4(0, n, A, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A, B);
			}
		}
	// skip one element of A
	else if(offA==(offB+1)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
				//A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+2);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for( ; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_1_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for( ; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_1_lib4(0, n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+1, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A+1, B);
			}
		}
	// skip 2 elements of A
	else if(offA==(offB+2)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+1);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B+3);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B+2);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_2_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_2_lib4(0, n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+2, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+2, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B);
			}
		}
	// skip 3 elements of A
	else // if(offA==(offB+3)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_3_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_3_lib4(0, n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+3, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B);
			}
		}

	return;
	}
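
// Note: the four branches above specialize on the relative misalignment
// (offA-offB)%bs between source and destination rows inside their 4-row
// panels. When the offsets match, the _x_0 kernels copy panel-aligned data;
// otherwise the _x_1/_x_2/_x_3 kernel variants read across two consecutive
// source panels (hence the extra sda argument) to realign rows on the fly.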



// copy a lower triangular strmat into a lower triangular strmat
void blasfeo_dtrcp_l(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
	{

	// invalidate stored inverse diagonal
	sB->use_dA = 0;

	const int bs = 4;
	const double alpha = 1.0;

	// get submatrices
	int sda = sA->cn;
	double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdb = sB->cn;
	double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;

	if(m<=0)
		return;

	int mna, ii;

	int offA = ai%bs;
	int offB = bi%bs;

	// A at the beginning of the block
	A -= offA;

	// B at the beginning of the block
	B -= offB;

	// same alignment
	if(offA==offB)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecp_8_0_lib4(1, ii, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecp_4_0_lib4(1, ii, A, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A, B);
			}
		}
	// skip one element of A
	else if(offA==(offB+1)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
				//A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+2);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for( ; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for( ; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_1_lib4(1, ii, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+1, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+1, B);
			}
		}
	// skip 2 elements of A
	else if(offA==(offB+2)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+1);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B+3);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B+2);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_2_lib4(1, ii, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+2, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+2, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B);
			}
		}
	// skip 3 elements of A
	else // if(offA==(offB+3)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_3_lib4(1, ii, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+3, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B);
			}
		}

	return;
	}



// copy and scale a generic strmat into a generic strmat
void blasfeo_dgecpsc(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
	{

	// invalidate stored inverse diagonal
	sB->use_dA = 0;

	const int bs = 4;

	// extract dimension
	int sda = sA->cn;
	int sdb = sB->cn;

	// extract submatrix
	double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;

	if(m<=0 || n<=0)
		return;

	int mna, ii;

	int offA = ai%bs;
	int offB = bi%bs;

	// A at the beginning of the block
	A -= offA;

	// B at the beginning of the block
	B -= offB;

	// same alignment
	if(offA==offB)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_0_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_0_lib4(0, n, alpha, A, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A, B);
			}
		}
	// skip one element of A
	else if(offA==(offB+1)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
				//A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+2);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for( ; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_1_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for( ; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_1_lib4(0, n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+1, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A+1, B);
			}
		}
	// skip 2 elements of A
	else if(offA==(offB+2)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+1);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B+3);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B+2);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_2_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_2_lib4(0, n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+2, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+2, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B);
			}
		}
	// skip 3 elements of A
	else // if(offA==(offB+3)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_3_lib4(0, n, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_3_lib4(0, n, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(0, n, alpha, A+3, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B);
			}
		}
	return;
	}



// copy and scale a lower triangular strmat into a lower triangular strmat
void blasfeo_dtrcpsc_l(int m, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
	{

	// invalidate stored inverse diagonal
	sB->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdb = sB->cn;
	double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;

	if(m<=0)
		return;

	int mna, ii;

	int offA = ai%bs;
	int offB = bi%bs;

	// A at the beginning of the block
	A -= offA;

	// B at the beginning of the block
	B -= offB;

	// same alignment
	if(offA==offB)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_0_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_0_lib4(1, ii, alpha, A, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A, B);
			}
		}
	// skip one element of A
	else if(offA==(offB+1)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else //if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
				//A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+2);
				A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for( ; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for( ; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_1_lib4(1, ii, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+1, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+1, B);
			}
		}
	// skip 2 elements of A
	else if(offA==(offB+2)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+1);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B+3);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B+2);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B+1);
				A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_2_lib4(1, ii, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+2, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+2, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B);
			}
		}
	// skip 3 elements of A
	else // if(offA==(offB+3)%bs)
		{
		ii = 0;
		// clean up at the beginning
		mna = (4-offB)%bs;
		if(mna>0)
			{
			if(m<mna)
				{
				if(m==1)
					{
					kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				else // if(m==2 && mna==3)
					{
					kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
					return;
					}
				}
			if(mna==1)
				{
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 1;
				}
			else if(mna==2)
				{
				kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 2;
				}
			else // if(mna==3)
				{
				kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
				// A += 4*sda;
				B += 4*sdb;
				ii += 3;
				}
			}
		// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		for(; ii<m-7; ii+=8)
			{
			kernel_dgecpsc_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
			A += 8*sda;
			B += 8*sdb;
			}
#endif
		for(; ii<m-3; ii+=4)
			{
			kernel_dgecpsc_4_3_lib4(1, ii, alpha, A, sda, B);
			A += 4*sda;
			B += 4*sdb;
			}
		// clean up at the end
		if(ii<m)
			{
			if(m-ii==1)
				kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+3, B);
			else if(m-ii==2)
				kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B);
			else // if(m-ii==3)
				kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B);
			}
		}

	return;
	}



// scale a generic strmat
void blasfeo_dgesc(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
	{
	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	blasfeo_dgecpsc(m, n, alpha, sA, ai, aj, sA, ai, aj);
	}



// scale a triangular strmat
void blasfeo_dtrsc_l(int m, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
	{
	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	blasfeo_dtrcpsc_l(m, alpha, sA, ai, aj, sA, ai, aj);
	}



// copy a strvec into a strvec
void blasfeo_dveccp(int m, struct blasfeo_dvec *sa, int ai, struct blasfeo_dvec *sc, int ci)
	{
	double *pa = sa->pa + ai;
	double *pc = sc->pa + ci;
	int ii;
	ii = 0;
	for(; ii<m-3; ii+=4)
		{
		pc[ii+0] = pa[ii+0];
		pc[ii+1] = pa[ii+1];
		pc[ii+2] = pa[ii+2];
		pc[ii+3] = pa[ii+3];
		}
	for(; ii<m; ii++)
		{
		pc[ii+0] = pa[ii+0];
		}
	return;
	}



// scale a strvec
void blasfeo_dvecsc(int m, double alpha, struct blasfeo_dvec *sa, int ai)
	{
	double *pa = sa->pa + ai;
	int ii;
	ii = 0;
	for(; ii<m-3; ii+=4)
		{
		pa[ii+0] *= alpha;
		pa[ii+1] *= alpha;
		pa[ii+2] *= alpha;
		pa[ii+3] *= alpha;
		}
	for(; ii<m; ii++)
		{
		pa[ii+0] *= alpha;
		}
	return;
	}



// copy and scale a strvec into a strvec
void blasfeo_dveccpsc(int m, double alpha, struct blasfeo_dvec *sa, int ai, struct blasfeo_dvec *sc, int ci)
	{
	double *pa = sa->pa + ai;
	double *pc = sc->pa + ci;
	int ii;
	ii = 0;
	for(; ii<m-3; ii+=4)
		{
		pc[ii+0] = alpha*pa[ii+0];
		pc[ii+1] = alpha*pa[ii+1];
		pc[ii+2] = alpha*pa[ii+2];
		pc[ii+3] = alpha*pa[ii+3];
		}
	for(; ii<m; ii++)
		{
		pc[ii+0] = alpha*pa[ii+0];
		}
	return;
	}



// scale and add a generic strmat into a generic strmat
void blasfeo_dgead(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
	{
	// invalidate stored inverse diagonal
	sC->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdc = sC->cn;
	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
	dgead_lib(m, n, alpha, ai%bs, pA, sda, ci%bs, pC, sdc);
	return;
	}
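
// Example (sketch): accumulate a scaled submatrix,
// C[1:1+m, 0:n] += 0.5 * A[0:m, 0:n]:
//   blasfeo_dgead(m, n, 0.5, &sA, 0, 0, &sC, 1, 0);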



// copy and transpose a generic strmat into a generic strmat
void blasfeo_dgetr(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
	{
	// invalidate stored inverse diagonal
	sC->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdc = sC->cn;
	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
	dgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
	return;
	}
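
/*
 * Usage sketch for blasfeo_dgetr (illustrative; matrix sizes are made up,
 * matrices assumed created with blasfeo_allocate_dmat): the m x n block of
 * sA at (ai,aj) is written transposed as an n x m block of sC at (ci,cj):
 *
 *	struct blasfeo_dmat sA, sC;
 *	blasfeo_allocate_dmat(5, 3, &sA);
 *	blasfeo_allocate_dmat(3, 5, &sC);
 *	// sC(0:2,0:4) = sA(0:4,0:2)^T
 *	blasfeo_dgetr(5, 3, &sA, 0, 0, &sC, 0, 0);
 */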



// copy and transpose a lower triangular strmat into an upper triangular strmat
void blasfeo_dtrtr_l(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
	{
	// invalidate stored inverse diagonal
	sC->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdc = sC->cn;
	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
	dtrtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
	return;
	}



// copy and transpose an upper triangular strmat into a lower triangular strmat
void blasfeo_dtrtr_u(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
	{
	// invalidate stored inverse diagonal
	sC->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	int sdc = sC->cn;
	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
	dtrtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
	return;
	}



// insert a strvec to diagonal of strmat, sparse formulation
void blasfeo_ddiain_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
	{
	// invalidate stored inverse diagonal
	sD->use_dA = 0;

	const int bs = 4;
	double *x = sx->pa + xi;
	int sdd = sD->cn;
	double *pD = sD->pA;
	int ii, jj;
	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
		}
	return;
	}
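
/*
 * Usage sketch for blasfeo_ddiain_sp (illustrative; the index values are
 * made up): entry jj of x is written to diagonal element idx[jj] of the
 * submatrix starting at (di,dj):
 *
 *	int idx[3] = {0, 2, 5};
 *	// writes 2.0*x[0], 2.0*x[1], 2.0*x[2] to sD(0,0), sD(2,2), sD(5,5)
 *	blasfeo_ddiain_sp(3, 2.0, &sx, 0, idx, &sD, 0, 0);
 */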



// extract a vector from diagonal
void blasfeo_ddiaex(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi)
	{
	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	int offsetA = ai%bs;

	int kna = (bs-offsetA%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			x[ll] = alpha*pA[ll+bs*ll];
			}
		pA += kna + bs*(sda-1) + kna*bs;
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		x[jj+0] = alpha*pA[jj*sda+(jj+0)*bs+0];
		x[jj+1] = alpha*pA[jj*sda+(jj+1)*bs+1];
		x[jj+2] = alpha*pA[jj*sda+(jj+2)*bs+2];
		x[jj+3] = alpha*pA[jj*sda+(jj+3)*bs+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		x[jj+ll] = alpha*pA[jj*sda+(jj+ll)*bs+ll];
		}
	return;
	}
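
/*
 * Note on the loop structure above: if the diagonal starts in the middle
 * of a panel (ai%bs != 0), the first kna = (bs-ai%bs)%bs elements are
 * peeled off until the next panel boundary; the pointer update
 *
 *	pA += kna + bs*(sda-1) + kna*bs;
 *
 * advances kna rows to the panel boundary, skips the rest of the panel
 * (bs*(sda-1)) to land at the start of the next one, and moves kna columns
 * right, so the main loop always starts at a panel-aligned diagonal element.
 */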



// extract the diagonal of a strmat to a strvec, sparse formulation
void blasfeo_ddiaex_sp(int kmax, double alpha, int *idx, struct blasfeo_dmat *sD, int di, int dj, struct blasfeo_dvec *sx, int xi)
	{
	const int bs = 4;
	double *x = sx->pa + xi;
	int sdd = sD->cn;
	double *pD = sD->pA;
	int ii, jj;
	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
		}
	return;
	}



// add a vector to diagonal
void blasfeo_ddiaad(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
	{

	// invalidate stored inverse diagonal
	sA->use_dA = 0;

	const int bs = 4;
	int sda = sA->cn;
	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
	double *x = sx->pa + xi;
	int offsetA = ai%bs;

	int kna = (bs-offsetA%bs)%bs;
	kna = kmax<kna ? kmax : kna;

	int jj, ll;

	if(kna>0)
		{
		for(ll=0; ll<kna; ll++)
			{
			pA[ll+bs*ll] += alpha*x[ll];
			}
		pA += kna + bs*(sda-1) + kna*bs;
		x  += kna;
		kmax -= kna;
		}
	for(jj=0; jj<kmax-3; jj+=4)
		{
		pA[jj*sda+(jj+0)*bs+0] += alpha*x[jj+0];
		pA[jj*sda+(jj+1)*bs+1] += alpha*x[jj+1];
		pA[jj*sda+(jj+2)*bs+2] += alpha*x[jj+2];
		pA[jj*sda+(jj+3)*bs+3] += alpha*x[jj+3];
		}
	for(ll=0; ll<kmax-jj; ll++)
		{
		pA[jj*sda+(jj+ll)*bs+ll] += alpha*x[jj+ll];
		}
	return;
	}
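
/*
 * Usage sketch for blasfeo_ddiaad (illustrative; assumes blasfeo_dvecse
 * from the BLASFEO API to set the vector): add a multiple of a vector to
 * the diagonal of a submatrix, e.g. a regularization A(1:5,1:5) += 1e-6*I:
 *
 *	blasfeo_dvecse(5, 1.0, &sx, 0);          // fill x with ones
 *	blasfeo_ddiaad(5, 1e-6, &sx, 0, &sA, 1, 1);
 */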



// add scaled strvec to diagonal of strmat, sparse formulation
void blasfeo_ddiaad_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
	{

	// invalidate stored inverse diagonal
	sD->use_dA = 0;

	const int bs = 4;
	double *x = sx->pa + xi;
	int sdd = sD->cn;
	double *pD = sD->pA;
	int ii, jj;
	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
		}
	return;
	}



// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
void blasfeo_ddiaadin_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sy, int yi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
	{

	// invalidate stored inverse diagonal
	sD->use_dA = 0;

	const int bs = 4;
	double *x = sx->pa + xi;
	double *y = sy->pa + yi;
	int sdd = sD->cn;
	double *pD = sD->pA;
	int ii, jj;
	for(jj=0; jj<kmax; jj++)
		{
		ii = idx[jj];
		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
		}
	return;
	}



// add scaled strvec to row of strmat, sparse formulation
void blasfeo_drowad_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
	{

	// invalidate stored inverse diagonal
	sD->use_dA = 0;

	const int bs = 4;
	double *x = sx->pa + xi;
	int sdd = sD->cn;
	double *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
	drowad_libsp(kmax, idx, alpha, x, pD);
	return;
	}



// add scaled strvec to strvec, sparse formulation
void blasfeo_dvecad_sp(int m, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dvec *sz, int zi)
	{
	double *x = sx->pa + xi;
	double *z = sz->pa + zi;
	int ii;
	for(ii=0; ii<m; ii++)
		z[idx[ii]] += alpha * x[ii];
	return;
	}



// insert scaled strvec to strvec, sparse formulation
void blasfeo_dvecin_sp(int m, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dvec *sz, int zi)
	{
	double *x = sx->pa + xi;
	double *z = sz->pa + zi;
	int ii;
	for(ii=0; ii<m; ii++)
		z[idx[ii]] = alpha * x[ii];
	return;
	}



// extract scaled strvec to strvec, sparse formulation
void blasfeo_dvecex_sp(int m, double alpha, int *idx, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
	{
	double *x = sx->pa + xi;
	double *z = sz->pa + zi;
	int ii;
	for(ii=0; ii<m; ii++)
		z[ii] = alpha * x[idx[ii]];
	return;
	}
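
/*
 * Semantics of the three sparse vector routines above (illustrative
 * values; idx is a 0-based index array): the first two scatter through
 * idx, the third gathers through idx:
 *
 *	int idx[2] = {1, 4};
 *	blasfeo_dvecad_sp(2, alpha, &sx, 0, idx, &sz, 0); // z[1] += alpha*x[0]; z[4] += alpha*x[1]
 *	blasfeo_dvecin_sp(2, alpha, &sx, 0, idx, &sz, 0); // z[1]  = alpha*x[0]; z[4]  = alpha*x[1]
 *	blasfeo_dvecex_sp(2, alpha, idx, &sx, 0, &sz, 0); // z[0]  = alpha*x[1]; z[1]  = alpha*x[4]
 */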



// clip strvec between two strvec
void blasfeo_dveccl(int m, struct blasfeo_dvec *sxm, int xim, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sxp, int xip, struct blasfeo_dvec *sz, int zi)
	{

	double *xm = sxm->pa + xim;
	double *x  = sx->pa + xi;
	double *xp = sxp->pa + xip;
	double *z  = sz->pa + zi;

	int ii;

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	double d0;

	__m256d
		xm0, x0, xp0, z0, tmp0, tmp1, mask1, mask2;

	mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );

	for(ii=0; ii<m-3; ii+=4)
		{
		x0  = _mm256_loadu_pd( &x[ii] );
		xp0 = _mm256_loadu_pd( &xp[ii] );
		xm0 = _mm256_loadu_pd( &xm[ii] );
		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 ); // 0x2 = _CMP_LE_OQ: xp<=x
		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 ); // x<=xm
		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
		_mm256_storeu_pd( &z[ii], z0 );
		}
	if(ii<m)
		{
		// lane ll of mask2 is negative iff ll<m-ii, selecting the lanes to store
		d0 = (double) m-ii;
		mask2 = _mm256_broadcast_sd( &d0 );
		mask2 = _mm256_sub_pd( mask1, mask2 );
		x0  = _mm256_loadu_pd( &x[ii] );
		xp0 = _mm256_loadu_pd( &xp[ii] );
		xm0 = _mm256_loadu_pd( &xm[ii] );
		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
		_mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
		}
#else
	for(ii=0; ii<m; ii++)
		{
		if(x[ii]>=xp[ii])
			{
			z[ii] = xp[ii];
			}
		else if(x[ii]<=xm[ii])
			{
			z[ii] = xm[ii];
			}
		else
			{
			z[ii] = x[ii];
			}
		}
#endif

	return;

	}
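
/*
 * blasfeo_dveccl computes z[ii] = max(xm[ii], min(x[ii], xp[ii]))
 * elementwise, assuming xm<=xp, e.g. (illustrative values):
 *
 *	x  = { 5.0, -3.0, 0.5 }, xm = { -1.0, -1.0, -1.0 }, xp = { 1.0, 1.0, 1.0 }
 *	z  = { 1.0, -1.0, 0.5 }
 */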



// clip strvec between two strvec, with mask
void blasfeo_dveccl_mask(int m, struct blasfeo_dvec *sxm, int xim, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sxp, int xip, struct blasfeo_dvec *sz, int zi, struct blasfeo_dvec *sm, int mi)
	{

	double *xm = sxm->pa + xim;
	double *x  = sx->pa + xi;
	double *xp = sxp->pa + xip;
	double *z  = sz->pa + zi;
	double *mask  = sm->pa + mi;

	int ii;

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	double d0;

	__m256d
		xm0, x0, xp0, z0, mask0, tmp0, tmp1, ones, mones, mask1, mask2;

	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
	mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
	mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );

	for(ii=0; ii<m-3; ii+=4)
		{
		mask0 = _mm256_setzero_pd();
		x0  = _mm256_loadu_pd( &x[ii] );
		xp0 = _mm256_loadu_pd( &xp[ii] );
		xm0 = _mm256_loadu_pd( &xm[ii] );
		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 ); // 0x2 = _CMP_LE_OQ: xp<=x
		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 ); // x<=xm
		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
		// mask records the active bound: 1.0 upper, -1.0 lower, 0.0 none
		mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
		mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
		_mm256_storeu_pd( &z[ii], z0 );
		_mm256_storeu_pd( &mask[ii], mask0 );
		}
	if(ii<m)
		{
		// lane ll of mask2 is negative iff ll<m-ii, selecting the lanes to store
		d0 = (double) m-ii;
		mask2 = _mm256_broadcast_sd( &d0 );
		mask2 = _mm256_sub_pd( mask1, mask2 );
		mask0 = _mm256_setzero_pd();
		x0  = _mm256_loadu_pd( &x[ii] );
		xp0 = _mm256_loadu_pd( &xp[ii] );
		xm0 = _mm256_loadu_pd( &xm[ii] );
		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
		mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
		mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
		_mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
		_mm256_maskstore_pd( &mask[ii], _mm256_castpd_si256( mask2 ), mask0 );
		}
#else
	for(ii=0; ii<m; ii++)
		{
		if(x[ii]>=xp[ii])
			{
			z[ii] = xp[ii];
			mask[ii] = 1.0;
			}
		else if(x[ii]<=xm[ii])
			{
			z[ii] = xm[ii];
			mask[ii] = -1.0;
			}
		else
			{
			z[ii] = x[ii];
			mask[ii] = 0.0;
			}
		}
#endif

	return;

	}
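
/*
 * In addition to the clipped vector, blasfeo_dveccl_mask records which
 * bound was active: mask[ii] = 1.0 where x hit the upper bound, -1.0 where
 * it hit the lower bound, 0.0 where x passed through unchanged. With the
 * illustrative values shown above for blasfeo_dveccl, the mask would be
 * { 1.0, -1.0, 0.0 }.
 */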



// zero out strvec to strvec with mask
void blasfeo_dvecze(int m, struct blasfeo_dvec *sm, int mi, struct blasfeo_dvec *sv, int vi, struct blasfeo_dvec *se, int ei)
	{
	double *mask = sm->pa + mi;
	double *v = sv->pa + vi;
	double *e = se->pa + ei;

	int ii;

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	double d0;

	__m256d
		mask0, mask1, mask2, mask3, fives, zeros, e0, v0;

	fives = _mm256_set_pd( 0.5, 0.5, 0.5, 0.5 );
	zeros = _mm256_setzero_pd();
	mask3 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );

	for(ii=0; ii<m-3; ii+=4)
		{
		v0 = _mm256_loadu_pd( &v[ii] );
		mask0 = _mm256_loadu_pd( &mask[ii] );
		mask1 = mask0;
		// (mask-0.5) xor (mask+0.5) has the sign bit set iff mask==0
		// (for the values -1.0, 0.0, 1.0 produced by blasfeo_dveccl_mask)
		mask0 = _mm256_sub_pd( mask0, fives);
		mask1 = _mm256_add_pd( mask1, fives);
		mask0 = _mm256_xor_pd( mask0, mask1);
		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
		_mm256_storeu_pd( &e[ii], e0 );
		}
	if(ii<m)
		{
		// lane ll of mask2 is negative iff ll<m-ii, selecting the lanes to store
		d0 = (double) m-ii;
		mask2 = _mm256_broadcast_sd( &d0 );
		mask2 = _mm256_sub_pd( mask3, mask2 );
		v0 = _mm256_loadu_pd( &v[ii] );
		mask0 = _mm256_loadu_pd( &mask[ii] );
		mask1 = mask0;
		mask0 = _mm256_sub_pd( mask0, fives);
		mask1 = _mm256_add_pd( mask1, fives);
		mask0 = _mm256_xor_pd( mask0, mask1);
		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
		_mm256_maskstore_pd( &e[ii], _mm256_castpd_si256( mask2 ), e0 );
		}
#else
	for(ii=0; ii<m; ii++)
		{
		if(mask[ii]==0)
			{
			e[ii] = v[ii];
			}
		else
			{
			e[ii] = 0;
			}
		}
#endif

	return;
	}
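
/*
 * blasfeo_dvecze is the companion of blasfeo_dveccl_mask: it copies
 * e[ii] = v[ii] where mask[ii]==0 (no active bound) and writes 0 where the
 * mask is nonzero, e.g. to zero the components of a step direction that
 * belong to clipped variables (illustrative use; names are made up):
 *
 *	blasfeo_dveccl_mask(m, &sxm, 0, &sx, 0, &sxp, 0, &sx_clip, 0, &smask, 0);
 *	blasfeo_dvecze(m, &smask, 0, &sstep, 0, &sstep_proj, 0);
 */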



// compute inf norm of strvec
void blasfeo_dvecnrm_inf(int m, struct blasfeo_dvec *sx, int xi, double *ptr_norm)
	{
	int ii;
	double *x = sx->pa + xi;
	double norm = 0.0;
#ifndef USE_C99_MATH
	double tmp;
#endif
	for(ii=0; ii<m; ii++)
		{
#ifdef USE_C99_MATH
		norm = fmax(norm, fabs(x[ii]));
#else
		tmp = fabs(x[ii]);
		norm = tmp>norm ? tmp : norm;
#endif
		}
	*ptr_norm = norm;
	return;
	}
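
/*
 * Usage sketch (illustrative): the norm is returned through the output
 * pointer, e.g. as a termination criterion on a residual vector:
 *
 *	double res_norm;
 *	blasfeo_dvecnrm_inf(m, &sres, 0, &res_norm);
 *	if(res_norm < 1e-8)
 *		break;
 */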



// permute elements of a vector struct
void blasfeo_dvecpe(int kmax, int *ipiv, struct blasfeo_dvec *sx, int xi)
	{
	int ii;
	double tmp;
	double *x = sx->pa + xi;
	for(ii=0; ii<kmax; ii++)
		{
		if(ipiv[ii]!=ii)
			{
			tmp = x[ipiv[ii]];
			x[ipiv[ii]] = x[ii];
			x[ii] = tmp;
			}
		}
	return;
	}



// inverse permute elements of a vector struct
void blasfeo_dvecpei(int kmax, int *ipiv, struct blasfeo_dvec *sx, int xi)
	{
	int ii;
	double tmp;
	double *x = sx->pa + xi;
	for(ii=kmax-1; ii>=0; ii--)
		{
		if(ipiv[ii]!=ii)
			{
			tmp = x[ipiv[ii]];
			x[ipiv[ii]] = x[ii];
			x[ii] = tmp;
			}
		}
	return;
	}
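
/*
 * blasfeo_dvecpe applies the row swaps encoded in ipiv (e.g. as produced
 * by an LU factorization with partial pivoting) in forward order, and
 * blasfeo_dvecpei applies them in reverse order, so the two are exact
 * inverses (illustrative check):
 *
 *	int ipiv[3] = {2, 2, 2};
 *	blasfeo_dvecpe(3, ipiv, &sx, 0);   // {x0, x1, x2} becomes {x2, x0, x1}
 *	blasfeo_dvecpei(3, ipiv, &sx, 0);  // restores the original x
 */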



#else

#error : wrong LA choice

#endif