1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35 /*
36 * auxiliary functions for LA:HIGH_PERFORMANCE PS_4 (panel major)
37 *
38 * auxiliary/d_aux_lib*.c
39 *
40 */
41
42 #include <stdlib.h>
43 #include <stdio.h>
44 #include <math.h>
45
46 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
47 #include <mmintrin.h>
48 #include <xmmintrin.h> // SSE
49 #include <emmintrin.h> // SSE2
50 #include <pmmintrin.h> // SSE3
51 #include <smmintrin.h> // SSE4
52 #include <immintrin.h> // AVX
53 #endif
54
55 #include "../include/blasfeo_common.h"
56 #include "../include/blasfeo_block_size.h"
57 #include "../include/blasfeo_d_aux.h"
58 #include "../include/blasfeo_d_kernel.h"
59
60
61 /*
62 * Auxiliary functions for PS:4 (panel-major)
63 *
64 * auxiliary/d_aux_lib4.c
65 * TOFIX: move inside str* interface
66 *
67 */
68
69
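/*
 * Layout reminder (illustrative summary, derived from the address computations
 * used throughout this file): matrices are stored in horizontal panels of
 * bs=4 rows, each panel stored column by column, with a stride of 4*sda
 * doubles between the start of one panel and the next (sda is the panel
 * leading dimension, typically the padded number of columns cn). Element
 * (i,j) of a panel-major matrix with base pointer pA is therefore found at
 *
 *     pA[(i/4)*4*sda + i%4 + j*4]
 *
 * e.g. with sda=8, element (5,2) sits at pA[4*8 + 1 + 2*4] = pA[41].
 */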
70
71 // scales and adds a packed matrix into a packed matrix: B = B + alpha*A
72 void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
73 {
74
75 if(m<=0 || n<=0)
76 return;
77
78 const int bs = 4;
79
80 int mna, ii;
81
82 int offA = offsetA%bs;
83 int offB = offsetB%bs;
84
85 // A at the beginning of the block
86 A -= offA;
87
88 // B at the beginning of the block
89 B -= offB;
90
91 // same alignment
92 if(offA==offB)
93 {
94 ii = 0;
95 // clean up at the beginning
96 mna = (4-offB)%bs;
97 if(mna>0)
98 {
99 if(m<mna) // mna<=3 ==> m = { 1, 2 }
100 {
101 if(m==1)
102 {
103 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
104 return;
105 }
106 else //if(m==2 && mna==3)
107 {
108 kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
109 return;
110 }
111 }
112 if(mna==1)
113 {
114 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
115 A += 4*sda;
116 B += 4*sdb;
117 ii += 1;
118 }
119 else if(mna==2)
120 {
121 kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
122 A += 4*sda;
123 B += 4*sdb;
124 ii += 2;
125 }
126 else // if(mna==3)
127 {
128 kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
129 A += 4*sda;
130 B += 4*sdb;
131 ii += 3;
132 }
133 }
134 // main loop
135 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
136 for(; ii<m-7; ii+=8)
137 {
138 kernel_dgead_8_0_lib4(n, alpha, A, sda, B, sdb);
139 A += 8*sda;
140 B += 8*sdb;
141 }
142 #endif
143 for(; ii<m-3; ii+=4)
144 {
145 kernel_dgead_4_0_lib4(n, alpha, A, B);
146 A += 4*sda;
147 B += 4*sdb;
148 }
149 // clean up at the end
150 if(ii<m)
151 {
152 if(m-ii==1)
153 kernel_dgead_1_0_lib4(n, alpha, A, B);
154 else if(m-ii==2)
155 kernel_dgead_2_0_lib4(n, alpha, A, B);
156 else // if(m-ii==3)
157 kernel_dgead_3_0_lib4(n, alpha, A, B);
158 }
159 }
160 // skip one element of A
161 else if(offA==(offB+1)%bs)
162 {
163 ii = 0;
164 // clean up at the beginning
165 mna = (4-offB)%bs;
166 if(mna>0)
167 {
168 if(m<mna) // mna<=3 ==> m = { 1, 2 }
169 {
170 if(m==1)
171 {
172 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
173 return;
174 }
175 else //if(m==2 && mna==3)
176 {
177 kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
178 return;
179 }
180 }
181 if(mna==1)
182 {
183 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
184 //A += 4*sda;
185 B += 4*sdb;
186 ii += 1;
187 }
188 else if(mna==2)
189 {
190 kernel_dgead_2_3_lib4(n, alpha, A, sda, B+2);
191 A += 4*sda;
192 B += 4*sdb;
193 ii += 2;
194 }
195 else // if(mna==3)
196 {
197 kernel_dgead_3_2_lib4(n, alpha, A, sda, B+1);
198 A += 4*sda;
199 B += 4*sdb;
200 ii += 3;
201 }
202 }
203 // main loop
204 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
205 for( ; ii<m-7; ii+=8)
206 {
207 kernel_dgead_8_1_lib4(n, alpha, A, sda, B, sdb);
208 A += 8*sda;
209 B += 8*sdb;
210 }
211 #endif
212 for( ; ii<m-3; ii+=4)
213 {
214 kernel_dgead_4_1_lib4(n, alpha, A, sda, B);
215 A += 4*sda;
216 B += 4*sdb;
217 }
218 // clean up at the end
219 if(ii<m)
220 {
221 if(m-ii==1)
222 kernel_dgead_1_0_lib4(n, alpha, A+1, B);
223 else if(m-ii==2)
224 kernel_dgead_2_0_lib4(n, alpha, A+1, B);
225 else // if(m-ii==3)
226 kernel_dgead_3_0_lib4(n, alpha, A+1, B);
227 }
228 }
229 // skip 2 elements of A
230 else if(offA==(offB+2)%bs)
231 {
232 ii = 0;
233 // clean up at the beginning
234 mna = (4-offB)%bs;
235 if(mna>0)
236 {
237 if(m<mna)
238 {
239 if(m==1)
240 {
241 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
242 return;
243 }
244 else // if(m==2 && mna==3)
245 {
246 kernel_dgead_2_3_lib4(n, alpha, A, sda, B+1);
247 return;
248 }
249 }
250 if(mna==1)
251 {
252 kernel_dgead_1_0_lib4(n, alpha, A+1, B+3);
253 // A += 4*sda;
254 B += 4*sdb;
255 ii += 1;
256 }
257 else if(mna==2)
258 {
259 kernel_dgead_2_0_lib4(n, alpha, A, B+2);
260 // A += 4*sda;
261 B += 4*sdb;
262 ii += 2;
263 }
264 else // if(mna==3)
265 {
266 kernel_dgead_3_3_lib4(n, alpha, A, sda, B+1);
267 A += 4*sda;
268 B += 4*sdb;
269 ii += 3;
270 }
271 }
272 // main loop
273 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
274 for(; ii<m-7; ii+=8)
275 {
276 kernel_dgead_8_2_lib4(n, alpha, A, sda, B, sdb);
277 A += 8*sda;
278 B += 8*sdb;
279 }
280 #endif
281 for(; ii<m-3; ii+=4)
282 {
283 kernel_dgead_4_2_lib4(n, alpha, A, sda, B);
284 A += 4*sda;
285 B += 4*sdb;
286 }
287 // clean up at the end
288 if(ii<m)
289 {
290 if(m-ii==1)
291 kernel_dgead_1_0_lib4(n, alpha, A+2, B);
292 else if(m-ii==2)
293 kernel_dgead_2_0_lib4(n, alpha, A+2, B);
294 else // if(m-ii==3)
295 kernel_dgead_3_2_lib4(n, alpha, A, sda, B);
296 }
297 }
298 // skip 3 elements of A
299 else // if(offA==(offB+3)%bs)
300 {
301 ii = 0;
302 // clean up at the beginning
303 mna = (4-offB)%bs;
304 if(mna>0)
305 {
306 if(m<mna)
307 {
308 if(m==1)
309 {
310 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
311 return;
312 }
313 else // if(m==2 && mna==3)
314 {
315 kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
316 return;
317 }
318 }
319 if(mna==1)
320 {
321 kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
322 // A += 4*sda;
323 B += 4*sdb;
324 ii += 1;
325 }
326 else if(mna==2)
327 {
328 kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
329 // A += 4*sda;
330 B += 4*sdb;
331 ii += 2;
332 }
333 else // if(mna==3)
334 {
335 kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
336 // A += 4*sda;
337 B += 4*sdb;
338 ii += 3;
339 }
340 }
341 // main loop
342 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
343 for(; ii<m-7; ii+=8)
344 {
345 kernel_dgead_8_3_lib4(n, alpha, A, sda, B, sdb);
346 A += 8*sda;
347 B += 8*sdb;
348 }
349 #endif
350 for(; ii<m-3; ii+=4)
351 {
352 kernel_dgead_4_3_lib4(n, alpha, A, sda, B);
353 A += 4*sda;
354 B += 4*sdb;
355 }
356 // clean up at the end
357 if(ii<m)
358 {
359 if(m-ii==1)
360 kernel_dgead_1_0_lib4(n, alpha, A+3, B);
361 else if(m-ii==2)
362 kernel_dgead_2_3_lib4(n, alpha, A, sda, B);
363 else // if(m-ii==3)
364 kernel_dgead_3_3_lib4(n, alpha, A, sda, B);
365 }
366 }
367
368 }
369
370
371
372 // scales and adds a strvec into a strvec
373 void blasfeo_dvecad(int m, double alpha, struct blasfeo_dvec *sa, int ai, struct blasfeo_dvec *sc, int ci)
374 {
375 double *pa = sa->pa + ai;
376 double *pc = sc->pa + ci;
377 int ii;
378 ii = 0;
379 for(; ii<m-3; ii+=4)
380 {
381 pc[ii+0] += alpha*pa[ii+0];
382 pc[ii+1] += alpha*pa[ii+1];
383 pc[ii+2] += alpha*pa[ii+2];
384 pc[ii+3] += alpha*pa[ii+3];
385 }
386 for(; ii<m; ii++)
387 {
388 pc[ii+0] += alpha*pa[ii+0];
389 }
390 return;
391 }
392
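/*
 * Example (illustrative only): given two previously created blasfeo_dvec
 * vectors sx and sy of length at least m,
 *
 *     blasfeo_dvecad(m, 0.5, &sx, 0, &sy, 0);
 *
 * performs sy[0:m] += 0.5*sx[0:m].
 */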
393
394 // --------- Transpose
395
396 // transpose general matrix; m and n refer to the original matrix
397 void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
398 {
399
400 /*
401
402 m = 5
403 n = 3
404 offsetA = 1
405 offsetC = 2
406
407 A =
408 x x x
409 -
410 x x x
411 x x x
412 x x x
413 x x x
414
415 C =
416 x x x x x
417 x x x x x
418 -
419 x x x x x
420
421 */
422
423
424 if(m<=0 || n<=0)
425 return;
426
427 const int bs = 4;
428
429 int mna = (bs-offsetA%bs)%bs;
430 mna = m<mna ? m : mna;
431 int nna = (bs-offsetC%bs)%bs;
432 nna = n<nna ? n : nna;
433
434 int ii;
435
436 ii = 0;
437
438 if(mna>0)
439 {
440 if(mna==1)
441 kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
442 else if(mna==2)
443 kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
444 else //if(mna==3)
445 kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
446 ii += mna;
447 pA += mna + bs*(sda-1);
448 pC += mna*bs;
449 }
450 #if defined(TARGET_X64_INTEL_HASWELL)
451 for( ; ii<m-7; ii+=8)
452 {
453 kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
454 pA += 2*bs*sda;
455 pC += 2*bs*bs;
456 }
457 #endif
458 for( ; ii<m-3; ii+=4)
459 // for( ; ii<m; ii+=4)
460 {
461 kernel_dgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
462 pA += bs*sda;
463 pC += bs*bs;
464 }
465
466 // clean-up at the end using smaller kernels
467 if(ii==m)
468 return;
469
470 if(m-ii==1)
471 kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
472 else if(m-ii==2)
473 kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
474 else if(m-ii==3)
475 kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
476
477 return;
478
479 }
480
481
482
483 // transpose lower triangular matrix
484 void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
485 {
486
487 /*
488
489 A =
490 x
491 x x
492 x x x
493 x x x x
494
495 x x x x x
496 x x x x x x
497 x x x x x x x
498 x x x x x x x x
499
500 C =
501 x x x x x x x x
502
503 x x x x x x x
504 x x x x x x
505 x x x x x
506 x x x x
507
508 x x x
509 x x
510 x
511
512 */
513
514 int n = m;
515
516 if(m<=0 || n<=0)
517 return;
518
519 const int bs = 4;
520
521 int mna = (bs-offsetA%bs)%bs;
522 mna = m<mna ? m : mna;
523 int nna = (bs-offsetC%bs)%bs;
524 nna = n<nna ? n : nna;
525
526 int ii;
527
528 ii = 0;
529
530 if(mna>0)
531 {
532 if(mna==1)
533 {
534 pC[0] = alpha * pA[0];
535 }
536 else if(mna==2)
537 {
538 if(nna==1)
539 {
540 pC[0+bs*0] = alpha * pA[0+bs*0];
541 pC[0+bs*1] = alpha * pA[1+bs*0];
542 pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
543 }
544 else
545 {
546 pC[0+bs*0] = alpha * pA[0+bs*0];
547 pC[0+bs*1] = alpha * pA[1+bs*0];
548 pC[1+bs*1] = alpha * pA[1+bs*1];
549 }
550 }
551 else //if(mna==3)
552 {
553 if(nna==1)
554 {
555 pC[0+bs*0] = alpha * pA[0+bs*0];
556 pC[0+bs*1] = alpha * pA[1+bs*0];
557 pC[0+bs*2] = alpha * pA[2+bs*0];
558 pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
559 pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
560 pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
561 }
562 else if(nna==2)
563 {
564 pC[0+bs*0] = alpha * pA[0+bs*0];
565 pC[0+bs*1] = alpha * pA[1+bs*0];
566 pC[0+bs*2] = alpha * pA[2+bs*0];
567 pC[1+bs*1] = alpha * pA[1+bs*1];
568 pC[1+bs*2] = alpha * pA[2+bs*1];
569 pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
570 }
571 else
572 {
573 pC[0+bs*0] = alpha * pA[0+bs*0];
574 pC[0+bs*1] = alpha * pA[1+bs*0];
575 pC[0+bs*2] = alpha * pA[2+bs*0];
576 pC[1+bs*1] = alpha * pA[1+bs*1];
577 pC[1+bs*2] = alpha * pA[2+bs*1];
578 pC[2+bs*2] = alpha * pA[2+bs*2];
579 }
580 }
581 ii += mna;
582 pA += mna + bs*(sda-1);
583 pC += mna*bs;
584 }
585 #if 0 //defined(TARGET_X64_INTEL_HASWELL)
586 for( ; ii<m-7; ii+=8)
587 {
588 kernel_dgetr_8_lib4(1, n, nna, alpha, pA, sda, pC, sdc);
589 pA += 2*bs*sda;
590 pC += 2*bs*bs;
591 }
592 #endif
593 for( ; ii<m-3; ii+=4)
594 {
595 kernel_dgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
596 pA += bs*sda;
597 pC += bs*bs;
598 }
599
600 // clean-up at the end using smaller kernels
601 if(ii==m)
602 return;
603
604 if(m-ii==1)
605 kernel_dgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
606 else if(m-ii==2)
607 kernel_dgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
608 else if(m-ii==3)
609 kernel_dgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
610
611 return;
612
613 }
614
615
616
617 // transpose an aligned upper triangular matrix into an aligned lower triangular matrix
618 void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
619 {
620
621 /*
622
623 A =
624 x x x x x x x x
625 x x x x x x x
626
627 x x x x x x
628 x x x x x
629 x x x x
630 x x x
631 x x
632 x
633
634 C =
635 x
636
637 x x
638 x x x
639 x x x x
640 x x x x x
641 x x x x x x
642 x x x x x x x
643 x x x x x x x x
644
645 */
646
647 int n = m;
648
649 if(m<=0 || n<=0)
650 return;
651
652 const int bs = 4;
653
654 int mna = (bs-offsetA%bs)%bs;
655 mna = m<mna ? m : mna;
656 int nna = (bs-offsetC%bs)%bs;
657 nna = n<nna ? n : nna;
658 int tna = nna;
659
660 int ii;
661
662 ii = 0;
663
664 if(mna>0)
665 {
666 if(mna==1)
667 {
668 kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
669 if(nna!=1)
670 {
671 // pC[0+bs*0] = alpha * pA[0+bs*0];
672 pA += 1*bs;
673 pC += 1;
674 tna = (bs-(offsetC+1)%bs)%bs;
675 }
676 else //if(nna==1)
677 {
678 // pC[0+bs*0] = alpha * pA[0+bs*0];
679 pA += 1*bs;
680 pC += 1 + (sdc-1)*bs;
681 tna = 0; //(bs-(offsetC+1)%bs)%bs;
682 }
683 // kernel_dgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
684 }
685 else if(mna==2)
686 {
687 if(nna==0 || nna==3)
688 {
689 pC[0+bs*0] = alpha * pA[0+bs*0];
690 pC[1+bs*0] = alpha * pA[0+bs*1];
691 pC[1+bs*1] = alpha * pA[1+bs*1];
692 pA += 2*bs;
693 pC += 2;
694 tna = (bs-(offsetC+2)%bs)%bs;
695 kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
696 }
697 else if(nna==1)
698 {
699 pC[0+bs*0] = alpha * pA[0+bs*0];
700 pA += 1*bs;
701 pC += 1 + (sdc-1)*bs;
702 // pC[0+bs*0] = alpha * pA[0+bs*0];
703 // pC[0+bs*1] = alpha * pA[1+bs*0];
704 kernel_dgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
705 pA += 1*bs;
706 pC += 1;
707 tna = 3; //(bs-(offsetC+2)%bs)%bs;
708 // kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
709 }
710 else if(nna==2)
711 {
712 pC[0+bs*0] = alpha * pA[0+bs*0];
713 pC[1+bs*0] = alpha * pA[0+bs*1];
714 pC[1+bs*1] = alpha * pA[1+bs*1];
715 pA += 2*bs;
716 pC += 2 + (sdc-1)*bs;
717 tna = 0; //(bs-(offsetC+2)%bs)%bs;
718 kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
719 }
720 }
721 else //if(mna==3)
722 {
723 if(nna==0)
724 {
725 pC[0+bs*0] = alpha * pA[0+bs*0];
726 pC[1+bs*0] = alpha * pA[0+bs*1];
727 pC[1+bs*1] = alpha * pA[1+bs*1];
728 pC[2+bs*0] = alpha * pA[0+bs*2];
729 pC[2+bs*1] = alpha * pA[1+bs*2];
730 pC[2+bs*2] = alpha * pA[2+bs*2];
731 pA += 3*bs;
732 pC += 3;
733 tna = 1;
734 kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
735 }
736 else if(nna==1)
737 {
738 pC[0+bs*0] = alpha * pA[0+bs*0];
739 pA += bs;
740 pC += 1 + (sdc-1)*bs;
741 pC[0+bs*0] = alpha * pA[0+bs*0];
742 pC[0+bs*1] = alpha * pA[1+bs*0];
743 pC[1+bs*0] = alpha * pA[0+bs*1];
744 pC[1+bs*1] = alpha * pA[1+bs*1];
745 pC[1+bs*2] = alpha * pA[2+bs*1];
746 pA += 2*bs;
747 pC += 2;
748 tna = 2;
749 kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
750 }
751 else if(nna==2)
752 {
753 pC[0+bs*0] = alpha * pA[0+bs*0];
754 pC[1+bs*0] = alpha * pA[0+bs*1];
755 pC[1+bs*1] = alpha * pA[1+bs*1];
756 pA += 2*bs;
757 pC += 2 + (sdc-1)*bs;
758 // pC[0+bs*0] = alpha * pA[0+bs*0];
759 // pC[0+bs*1] = alpha * pA[1+bs*0];
760 // pC[0+bs*2] = alpha * pA[2+bs*0];
761 kernel_dgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
762 pA += 1*bs;
763 pC += 1;
764 tna = 3;
765 // kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
766 }
767 else //if(nna==3)
768 {
769 pC[0+bs*0] = alpha * pA[0+bs*0];
770 pC[1+bs*0] = alpha * pA[0+bs*1];
771 pC[1+bs*1] = alpha * pA[1+bs*1];
772 pC[2+bs*0] = alpha * pA[0+bs*2];
773 pC[2+bs*1] = alpha * pA[1+bs*2];
774 pC[2+bs*2] = alpha * pA[2+bs*2];
775 pA += 3*bs;
776 pC += 3 + (sdc-1)*bs;
777 tna = 0;
778 kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
779 }
780 }
781 ii += mna;
782 pA += mna + bs*(sda-1);
783 pC += mna*bs;
784 }
785 #if 0 //defined(TARGET_X64_AVX2)
786 for( ; ii<m-7; ii+=8)
787 {
788 kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
789 pA += 2*bs*sda;
790 pC += 2*bs*bs;
791 }
792 #endif
793 for( ; ii<m-3; ii+=4)
794 {
795 if(tna==0)
796 {
797 pC[0+bs*0] = alpha * pA[0+bs*0];
798 pC[1+bs*0] = alpha * pA[0+bs*1];
799 pC[1+bs*1] = alpha * pA[1+bs*1];
800 pC[2+bs*0] = alpha * pA[0+bs*2];
801 pC[2+bs*1] = alpha * pA[1+bs*2];
802 pC[2+bs*2] = alpha * pA[2+bs*2];
803 pC[3+bs*0] = alpha * pA[0+bs*3];
804 pC[3+bs*1] = alpha * pA[1+bs*3];
805 pC[3+bs*2] = alpha * pA[2+bs*3];
806 pC[3+bs*3] = alpha * pA[3+bs*3];
807 pA += 4*bs;
808 pC += sdc*bs;
809 kernel_dgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
810 }
811 else if(tna==1)
812 {
813 pC[0+bs*0] = alpha * pA[0+bs*0];
814 pA += bs;
815 pC += 1 + (sdc-1)*bs;
816 pC[0+bs*0] = alpha * pA[0+bs*0];
817 pC[0+bs*1] = alpha * pA[1+bs*0];
818 pC[1+bs*0] = alpha * pA[0+bs*1];
819 pC[1+bs*1] = alpha * pA[1+bs*1];
820 pC[1+bs*2] = alpha * pA[2+bs*1];
821 pC[2+bs*0] = alpha * pA[0+bs*2];
822 pC[2+bs*1] = alpha * pA[1+bs*2];
823 pC[2+bs*2] = alpha * pA[2+bs*2];
824 pC[2+bs*3] = alpha * pA[3+bs*2];
825 pA += 3*bs;
826 pC += 3;
827 kernel_dgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
828 }
829 else if(tna==2)
830 {
831 pC[0+bs*0] = alpha * pA[0+bs*0];
832 pC[1+bs*0] = alpha * pA[0+bs*1];
833 pC[1+bs*1] = alpha * pA[1+bs*1];
834 pA += 2*bs;
835 pC += 2 + (sdc-1)*bs;
836 pC[0+bs*0] = alpha * pA[0+bs*0];
837 pC[0+bs*1] = alpha * pA[1+bs*0];
838 pC[0+bs*2] = alpha * pA[2+bs*0];
839 pC[1+bs*0] = alpha * pA[0+bs*1];
840 pC[1+bs*1] = alpha * pA[1+bs*1];
841 pC[1+bs*2] = alpha * pA[2+bs*1];
842 pC[1+bs*3] = alpha * pA[3+bs*1];
843 pA += 2*bs;
844 pC += 2;
845 kernel_dgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
846 }
847 else //if(tna==3)
848 {
849 pC[0+bs*0] = alpha * pA[0+bs*0];
850 pC[1+bs*0] = alpha * pA[0+bs*1];
851 pC[1+bs*1] = alpha * pA[1+bs*1];
852 pC[2+bs*0] = alpha * pA[0+bs*2];
853 pC[2+bs*1] = alpha * pA[1+bs*2];
854 pC[2+bs*2] = alpha * pA[2+bs*2];
855 pA += 3*bs;
856 pC += 3 + (sdc-1)*bs;
857 kernel_dgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
858 // pC[0+bs*0] = alpha * pA[0+bs*0];
859 // pC[0+bs*1] = alpha * pA[1+bs*0];
860 // pC[0+bs*2] = alpha * pA[2+bs*0];
861 // pC[0+bs*3] = alpha * pA[3+bs*0];
862 pA += bs;
863 pC += 1;
864 // kernel_dgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
865 }
866 pA += bs*sda;
867 pC += bs*bs;
868 }
869
870 // clean-up at the end
871 if(ii==m)
872 return;
873
874 if(m-ii==1)
875 {
876 pC[0+bs*0] = alpha * pA[0+bs*0];
877 }
878 else if(m-ii==2)
879 {
880 if(tna!=1)
881 {
882 pC[0+bs*0] = alpha * pA[0+bs*0];
883 pC[1+bs*0] = alpha * pA[0+bs*1];
884 pC[1+bs*1] = alpha * pA[1+bs*1];
885 }
886 else //if(tna==1)
887 {
888 pC[0+bs*0] = alpha * pA[0+bs*0];
889 pA += bs;
890 pC += 1 + (sdc-1)*bs;
891 pC[0+bs*0] = alpha * pA[0+bs*0];
892 pC[0+bs*1] = alpha * pA[1+bs*0];
893 }
894 }
895 else if(m-ii==3)
896 {
897 if(tna==0 || tna==3)
898 {
899 pC[0+bs*0] = alpha * pA[0+bs*0];
900 pC[1+bs*0] = alpha * pA[0+bs*1];
901 pC[1+bs*1] = alpha * pA[1+bs*1];
902 pC[2+bs*0] = alpha * pA[0+bs*2];
903 pC[2+bs*1] = alpha * pA[1+bs*2];
904 pC[2+bs*2] = alpha * pA[2+bs*2];
905 }
906 else if(tna==1)
907 {
908 pC[0+bs*0] = alpha * pA[0+bs*0];
909 pA += bs;
910 pC += 1 + (sdc-1)*bs;
911 pC[0+bs*0] = alpha * pA[0+bs*0];
912 pC[0+bs*1] = alpha * pA[1+bs*0];
913 pC[1+bs*0] = alpha * pA[0+bs*1];
914 pC[1+bs*1] = alpha * pA[1+bs*1];
915 pC[1+bs*2] = alpha * pA[2+bs*1];
916 }
917 else //if(tna==2)
918 {
919 pC[0+bs*0] = alpha * pA[0+bs*0];
920 pC[1+bs*0] = alpha * pA[0+bs*1];
921 pC[1+bs*1] = alpha * pA[1+bs*1];
922 pA += 2*bs;
923 pC += 2 + (sdc-1)*bs;
924 pC[0+bs*0] = alpha * pA[0+bs*0];
925 pC[0+bs*1] = alpha * pA[1+bs*0];
926 pC[0+bs*2] = alpha * pA[2+bs*0];
927 }
928 }
929
930 return;
931
932 }
933
934
935
936 // regularize diagonal
937 void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd)
938 {
939
940 const int bs = 4;
941
942 int kna = (bs-offset%bs)%bs;
943 kna = kmax<kna ? kmax : kna;
944
945 int jj, ll;
946
947 if(kna>0)
948 {
949 for(ll=0; ll<kna; ll++)
950 {
951 pD[ll+bs*ll] += reg;
952 }
953 pD += kna + bs*(sdd-1) + kna*bs;
954 kmax -= kna;
955 }
956 for(jj=0; jj<kmax-3; jj+=4)
957 {
958 pD[jj*sdd+(jj+0)*bs+0] += reg;
959 pD[jj*sdd+(jj+1)*bs+1] += reg;
960 pD[jj*sdd+(jj+2)*bs+2] += reg;
961 pD[jj*sdd+(jj+3)*bs+3] += reg;
962 }
963 for(ll=0; ll<kmax-jj; ll++)
964 {
965 pD[jj*sdd+(jj+ll)*bs+ll] += reg;
966 }
967
968 }
969
970
971
972 // insert sqrt of vector to diagonal
973 void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd)
974 {
975
976 const int bs = 4;
977
978 int kna = (bs-offset%bs)%bs;
979 kna = kmax<kna ? kmax : kna;
980
981 int jj, ll;
982
983 if(kna>0)
984 {
985 for(ll=0; ll<kna; ll++)
986 {
987 pD[ll+bs*ll] = sqrt(x[ll]);
988 }
989 pD += kna + bs*(sdd-1) + kna*bs;
990 x += kna;
991 kmax -= kna;
992 }
993 for(jj=0; jj<kmax-3; jj+=4)
994 {
995 pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
996 pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
997 pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
998 pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
999 }
1000 for(ll=0; ll<kmax-jj; ll++)
1001 {
1002 pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
1003 }
1004
1005 }
1006
1007
1008
1009 // extract diagonal to vector
1010 void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x)
1011 {
1012
1013 const int bs = 4;
1014
1015 int kna = (bs-offset%bs)%bs;
1016 kna = kmax<kna ? kmax : kna;
1017
1018 int jj, ll;
1019
1020 if(kna>0)
1021 {
1022 for(ll=0; ll<kna; ll++)
1023 {
1024 x[ll] = alpha * pD[ll+bs*ll];
1025 }
1026 pD += kna + bs*(sdd-1) + kna*bs;
1027 x += kna;
1028 kmax -= kna;
1029 }
1030 for(jj=0; jj<kmax-3; jj+=4)
1031 {
1032 x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
1033 x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
1034 x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
1035 x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
1036 }
1037 for(ll=0; ll<kmax-jj; ll++)
1038 {
1039 x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
1040 }
1041
1042 }
1043
1044
1045
1046 // add scaled vector to diagonal
1047 void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
1048 {
1049
1050 const int bs = 4;
1051
1052 int kna = (bs-offset%bs)%bs;
1053 kna = kmax<kna ? kmax : kna;
1054
1055 int jj, ll;
1056
1057 if(kna>0)
1058 {
1059 for(ll=0; ll<kna; ll++)
1060 {
1061 pD[ll+bs*ll] += alpha * x[ll];
1062 }
1063 pD += kna + bs*(sdd-1) + kna*bs;
1064 x += kna;
1065 kmax -= kna;
1066 }
1067 for(jj=0; jj<kmax-3; jj+=4)
1068 {
1069 pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
1070 pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
1071 pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
1072 pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
1073 }
1074 for(ll=0; ll<kmax-jj; ll++)
1075 {
1076 pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
1077 }
1078
1079 }
1080
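/*
 * Note on the diagonal helpers above (illustrative, derived from the indexing
 * in the unrolled loops): once pD points to a diagonal entry aligned to a
 * panel boundary (offset%4==0), diagonal element k is addressed as
 *
 *     pD[(k/4)*4*sdd + k%4 + k*4]
 *
 * which is what the loops compute as pD[jj*sdd+(jj+ll)*bs+ll] with k=jj+ll.
 */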
1081
1082
1083 // insert vector to diagonal, sparse formulation
1084 void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
1085 {
1086
1087 const int bs = 4;
1088
1089 int ii, jj;
1090
1091 for(jj=0; jj<kmax; jj++)
1092 {
1093 ii = idx[jj];
1094 pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
1095 }
1096
1097 }
1098
1099
1100
1101 // extract diagonal to vector, sparse formulation
1102 void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x)
1103 {
1104
1105 const int bs = 4;
1106
1107 int ii, jj;
1108
1109 for(jj=0; jj<kmax; jj++)
1110 {
1111 ii = idx[jj];
1112 x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
1113 }
1114
1115 }
1116
1117
1118
1119 // add scaled vector to diagonal, sparse formulation
1120 void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
1121 {
1122
1123 const int bs = 4;
1124
1125 int ii, jj;
1126
1127 for(jj=0; jj<kmax; jj++)
1128 {
1129 ii = idx[jj];
1130 pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
1131 }
1132
1133 }
1134
1135
1136
1137 // add scaled vector to another vector and insert to diagonal, sparse formulation
1138 void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd)
1139 {
1140
1141 const int bs = 4;
1142
1143 int ii, jj;
1144
1145 for(jj=0; jj<kmax; jj++)
1146 {
1147 ii = idx[jj];
1148 pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
1149 }
1150
1151 }
1152
1153
1154
1155 // insert vector to row
1156 void drowin_lib(int kmax, double alpha, double *x, double *pD)
1157 {
1158
1159 const int bs = 4;
1160
1161 int jj, ll;
1162
1163 for(jj=0; jj<kmax-3; jj+=4)
1164 {
1165 pD[(jj+0)*bs] = alpha*x[jj+0];
1166 pD[(jj+1)*bs] = alpha*x[jj+1];
1167 pD[(jj+2)*bs] = alpha*x[jj+2];
1168 pD[(jj+3)*bs] = alpha*x[jj+3];
1169 }
1170 for(; jj<kmax; jj++)
1171 {
1172 pD[(jj)*bs] = alpha*x[jj];
1173 }
1174
1175 }
1176
1177
1178
1179 // extract row to vector
1180 void drowex_lib(int kmax, double alpha, double *pD, double *x)
1181 {
1182
1183 const int bs = 4;
1184
1185 int jj, ll;
1186
1187 for(jj=0; jj<kmax-3; jj+=4)
1188 {
1189 x[jj+0] = alpha*pD[(jj+0)*bs];
1190 x[jj+1] = alpha*pD[(jj+1)*bs];
1191 x[jj+2] = alpha*pD[(jj+2)*bs];
1192 x[jj+3] = alpha*pD[(jj+3)*bs];
1193 }
1194 for(; jj<kmax; jj++)
1195 {
1196 x[jj] = alpha*pD[(jj)*bs];
1197 }
1198
1199 }
1200
1201
1202
1203 // add scaled vector to row
1204 void drowad_lib(int kmax, double alpha, double *x, double *pD)
1205 {
1206
1207 const int bs = 4;
1208
1209 int jj, ll;
1210
1211 for(jj=0; jj<kmax-3; jj+=4)
1212 {
1213 pD[(jj+0)*bs] += alpha * x[jj+0];
1214 pD[(jj+1)*bs] += alpha * x[jj+1];
1215 pD[(jj+2)*bs] += alpha * x[jj+2];
1216 pD[(jj+3)*bs] += alpha * x[jj+3];
1217 }
1218 for(; jj<kmax; jj++)
1219 {
1220 pD[(jj)*bs] += alpha * x[jj];
1221 }
1222
1223 }
1224
1225
1226
1227 // insert vector to row, sparse formulation
1228 void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD)
1229 {
1230
1231 const int bs = 4;
1232
1233 int ii, jj;
1234
1235 for(jj=0; jj<kmax; jj++)
1236 {
1237 ii = idx[jj];
1238 pD[ii*bs] = alpha*x[jj];
1239 }
1240
1241 }
1242
1243
1244
1245 // add scaled vector to row, sparse formulation
1246 void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD)
1247 {
1248
1249 const int bs = 4;
1250
1251 int ii, jj;
1252
1253 for(jj=0; jj<kmax; jj++)
1254 {
1255 ii = idx[jj];
1256 pD[ii*bs] += alpha * x[jj];
1257 }
1258
1259 }
1260
1261
1262
1263 // add scaled vector to another vector and insert to row, sparse formulation
1264 void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD)
1265 {
1266
1267 const int bs = 4;
1268
1269 int ii, jj;
1270
1271 for(jj=0; jj<kmax; jj++)
1272 {
1273 ii = idx[jj];
1274 pD[ii*bs] = y[jj] + alpha * x[jj];
1275 }
1276
1277 }
1278
1279
1280
1281 // extract vector from column
1282 void dcolex_lib(int kmax, int offset, double *pD, int sdd, double *x)
1283 {
1284
1285 const int bs = 4;
1286
1287 int kna = (bs-offset%bs)%bs;
1288 kna = kmax<kna ? kmax : kna;
1289
1290 int jj, ll;
1291
1292 if(kna>0)
1293 {
1294 for(ll=0; ll<kna; ll++)
1295 {
1296 x[ll] = pD[ll];
1297 }
1298
1299 pD += kna + bs*(sdd-1);
1300 x += kna;
1301 kmax -= kna;
1302 }
1303
1304 for(jj=0; jj<kmax-3; jj+=4)
1305 {
1306 x[jj+0] = pD[jj*sdd+0];
1307 x[jj+1] = pD[jj*sdd+1];
1308 x[jj+2] = pD[jj*sdd+2];
1309 x[jj+3] = pD[jj*sdd+3];
1310 }
1311 for(ll=0; ll<kmax-jj; ll++)
1312 {
1313 x[jj+ll] = pD[jj*sdd+ll];
1314 }
1315
1316 }
1317
1318
1319
1320 // insert vector to column
1321 void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd)
1322 {
1323
1324 const int bs = 4;
1325
1326 int kna = (bs-offset%bs)%bs;
1327 kna = kmax<kna ? kmax : kna;
1328
1329 int jj, ll;
1330
1331 if(kna>0)
1332 {
1333 for(ll=0; ll<kna; ll++)
1334 {
1335 pD[ll] = x[ll];
1336 }
1337 pD += kna + bs*(sdd-1);
1338 x += kna;
1339 kmax -= kna;
1340 }
1341 for(jj=0; jj<kmax-3; jj+=4)
1342 {
1343 pD[jj*sdd+0] = x[jj+0];
1344 pD[jj*sdd+1] = x[jj+1];
1345 pD[jj*sdd+2] = x[jj+2];
1346 pD[jj*sdd+3] = x[jj+3];
1347 }
1348 for(ll=0; ll<kmax-jj; ll++)
1349 {
1350 pD[jj*sdd+ll] = x[jj+ll];
1351 }
1352
1353 }
1354
1355
1356
1357 // add scaled vector to column
1358 void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
1359 {
1360
1361 const int bs = 4;
1362
1363 int kna = (bs-offset%bs)%bs;
1364 kna = kmax<kna ? kmax : kna;
1365
1366 int jj, ll;
1367
1368 if(kna>0)
1369 {
1370 for(ll=0; ll<kna; ll++)
1371 {
1372 pD[ll] += alpha * x[ll];
1373 }
1374 pD += kna + bs*(sdd-1);
1375 x += kna;
1376 kmax -= kna;
1377 }
1378 for(jj=0; jj<kmax-3; jj+=4)
1379 {
1380 pD[jj*sdd+0] += alpha * x[jj+0];
1381 pD[jj*sdd+1] += alpha * x[jj+1];
1382 pD[jj*sdd+2] += alpha * x[jj+2];
1383 pD[jj*sdd+3] += alpha * x[jj+3];
1384 }
1385 for(ll=0; ll<kmax-jj; ll++)
1386 {
1387 pD[jj*sdd+ll] += alpha * x[jj+ll];
1388 }
1389
1390 }
1391
1392
1393
1394 // insert vector to column, sparse formulation
1395 void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd)
1396 {
1397
1398 const int bs = 4;
1399
1400 int ii, jj;
1401
1402 for(jj=0; jj<kmax; jj++)
1403 {
1404 ii = idx[jj];
1405 pD[ii/bs*bs*sdd+ii%bs] = x[jj];
1406 }
1407
1408 }
1409
1410
1411
1412 // add scaled vector to column, sparse formulation
1413 void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd)
1414 {
1415
1416 const int bs = 4;
1417
1418 int ii, jj;
1419
1420 for(jj=0; jj<kmax; jj++)
1421 {
1422 ii = idx[jj];
1423 pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
1424 }
1425
1426 }
1427
1428
1429
1430 // swaps two cols
1431 void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
1432 {
1433
1434 const int bs = 4;
1435
1436 int ii;
1437
1438 double tmp;
1439
1440 if(offsetA==offsetC)
1441 {
1442 if(offsetA>0)
1443 {
1444 ii = 0;
1445 for(; ii<bs-offsetA; ii++)
1446 {
1447 tmp = pA[0+bs*0];
1448 pA[0+bs*0] = pC[0+bs*0];
1449 pC[0+bs*0] = tmp;
1450 pA += 1;
1451 pC += 1;
1452 }
1453 pA += bs*(sda-1);
1454 pC += bs*(sdc-1);
1455 kmax -= bs-offsetA;
1456 }
1457 ii = 0;
1458 for(; ii<kmax-3; ii+=4)
1459 {
1460 tmp = pA[0+bs*0];
1461 pA[0+bs*0] = pC[0+bs*0];
1462 pC[0+bs*0] = tmp;
1463 tmp = pA[1+bs*0];
1464 pA[1+bs*0] = pC[1+bs*0];
1465 pC[1+bs*0] = tmp;
1466 tmp = pA[2+bs*0];
1467 pA[2+bs*0] = pC[2+bs*0];
1468 pC[2+bs*0] = tmp;
1469 tmp = pA[3+bs*0];
1470 pA[3+bs*0] = pC[3+bs*0];
1471 pC[3+bs*0] = tmp;
1472 pA += bs*sda;
1473 pC += bs*sdc;
1474 }
1475 for(; ii<kmax; ii++)
1476 {
1477 tmp = pA[0+bs*0];
1478 pA[0+bs*0] = pC[0+bs*0];
1479 pC[0+bs*0] = tmp;
1480 pA += 1;
1481 pC += 1;
1482 }
1483 }
1484 else
1485 {
1486 printf("\ndcolsw: feature not implemented yet: offsetA!=offsetC\n\n");
1487 exit(1);
1488 }
1489
1490 return;
1491
1492 }
1493
1494
1495
1496 // insert vector to vector, sparse formulation
1497 void dvecin_libsp(int kmax, int *idx, double *x, double *y)
1498 {
1499
1500 int jj;
1501
1502 for(jj=0; jj<kmax; jj++)
1503 {
1504 y[idx[jj]] = x[jj];
1505 }
1506
1507 }
1508
1509
1510
1511 // adds vector to vector, sparse formulation
1512 void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y)
1513 {
1514
1515 int jj;
1516
1517 for(jj=0; jj<kmax; jj++)
1518 {
1519 y[idx[jj]] += alpha * x[jj];
1520 }
1521
1522 }
1523
1524
1525
1526 /*
1527 * STRMAT interface (General to specific interface)
1528 *
1529 */
1530
1531 #if defined(LA_HIGH_PERFORMANCE)
1532
1533
1534
1535 // return the memory size (in bytes) needed for a strmat
1536 int blasfeo_memsize_dmat(int m, int n)
1537 {
1538 const int bs = 4;
1539 int nc = D_NC;
1540 int al = bs*nc;
1541 int pm = (m+bs-1)/bs*bs;
1542 int cn = (n+nc-1)/nc*nc;
1543 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1544 int memsize = (pm*cn+tmp)*sizeof(double);
1545 return memsize;
1546 }
1547
1548
1549
1550 // return the memory size (in bytes) needed for the diagonal of a strmat
1551 int blasfeo_memsize_diag_dmat(int m, int n)
1552 {
1553 const int bs = 4;
1554 int nc = D_NC;
1555 int al = bs*nc;
1556 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1557 int memsize = tmp*sizeof(double);
1558 return memsize;
1559 }
1560
1561
1562
1563 // create a matrix structure for a matrix of size m*n by using memory passed by a pointer
1564 void blasfeo_create_dmat(int m, int n, struct blasfeo_dmat *sA, void *memory)
1565 {
1566 // invalidate stored inverse diagonal
1567 sA->use_dA = 0;
1568
1569 const int bs = 4;
1570 int nc = D_NC;
1571 int al = bs*nc;
1572 sA->m = m;
1573 sA->n = n;
1574 int pm = (m+bs-1)/bs*bs;
1575 int cn = (n+nc-1)/nc*nc;
1576 sA->pm = pm;
1577 sA->cn = cn;
1578 double *ptr = (double *) memory;
1579 sA->pA = ptr;
1580 ptr += pm*cn;
1581 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1582 sA->dA = ptr;
1583 ptr += tmp;
1584 sA->memsize = (pm*cn+tmp)*sizeof(double);
1585 return;
1586 }
1587
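/*
 * Usage sketch (illustrative only, not part of the library): wrap caller-
 * provided memory in a blasfeo_dmat; the memory is assumed to be suitably
 * aligned (e.g. to a cache-line boundary).
 *
 *     int size = blasfeo_memsize_dmat(m, n);
 *     void *mem = malloc(size);              // aligned allocation assumed
 *     struct blasfeo_dmat sA;
 *     blasfeo_create_dmat(m, n, &sA, mem);   // sA now views mem, no copy
 */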
1588
1589
1590 // return memory size (in bytes) needed for a strvec
1591 int blasfeo_memsize_dvec(int m)
1592 {
1593 const int bs = 4;
1594 // int nc = D_NC;
1595 // int al = bs*nc;
1596 int pm = (m+bs-1)/bs*bs;
1597 int memsize = pm*sizeof(double);
1598 return memsize;
1599 }
1600
1601
1602
1603 // create a vector structure for a vector of size m by using memory passed by a pointer
1604 void blasfeo_create_dvec(int m, struct blasfeo_dvec *sa, void *memory)
1605 {
1606 const int bs = 4;
1607 // int nc = D_NC;
1608 // int al = bs*nc;
1609 sa->m = m;
1610 int pm = (m+bs-1)/bs*bs;
1611 sa->pm = pm;
1612 double *ptr = (double *) memory;
1613 sa->pa = ptr;
1614 // ptr += pm;
1615 sa->memsize = pm*sizeof(double);
1616 return;
1617 }
1618
1619
1620
1621 // convert a matrix into a matrix structure
1622 void blasfeo_pack_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
1623 {
1624 if(m<=0 || n<=0)
1625 return;
1626
1627 // invalidate stored inverse diagonal
1628 sA->use_dA = 0;
1629
1630 const int bs = 4;
1631 int sda = sA->cn;
1632 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1633 int i, ii, j, jj, m0, m1, m2;
1634 double *B, *pB;
1635 sA->use_dA = 0;
1636
1637 // row vector in sA
1638 if(m==1)
1639 {
1640 for(jj=0; jj<n; jj++)
1641 {
1642 pA[jj*bs] = A[jj*lda];
1643 }
1644 return;
1645 }
1646
1647 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1648 __m256d
1649 tmp;
1650 #endif
1651 m0 = (bs-ai%bs)%bs;
1652 if(m0>m)
1653 m0 = m;
1654 m1 = m - m0;
1655 jj = 0;
1656 for( ; jj<n-3; jj+=4)
1657 {
1658 B = A + jj*lda;
1659 pB = pA + jj*bs;
1660 ii = 0;
1661 if(m0>0)
1662 {
1663 for( ; ii<m0; ii++)
1664 {
1665 pB[ii+bs*0] = B[ii+lda*0];
1666 pB[ii+bs*1] = B[ii+lda*1];
1667 pB[ii+bs*2] = B[ii+lda*2];
1668 pB[ii+bs*3] = B[ii+lda*3];
1669 }
1670 B += m0;
1671 pB += m0 + bs*(sda-1);
1672 }
1673 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1674 for( ; ii<m-3; ii+=4)
1675 {
1676 tmp = _mm256_loadu_pd( &B[0+lda*0] );
1677 _mm256_store_pd( &pB[0+bs*0], tmp );
1678 tmp = _mm256_loadu_pd( &B[0+lda*1] );
1679 _mm256_store_pd( &pB[0+bs*1], tmp );
1680 tmp = _mm256_loadu_pd( &B[0+lda*2] );
1681 _mm256_store_pd( &pB[0+bs*2], tmp );
1682 tmp = _mm256_loadu_pd( &B[0+lda*3] );
1683 _mm256_store_pd( &pB[0+bs*3], tmp );
1684 B += 4;
1685 pB += bs*sda;
1686 }
1687 #else
1688 for( ; ii<m-3; ii+=4)
1689 {
1690 // col 0
1691 pB[0+bs*0] = B[0+lda*0];
1692 pB[1+bs*0] = B[1+lda*0];
1693 pB[2+bs*0] = B[2+lda*0];
1694 pB[3+bs*0] = B[3+lda*0];
1695 // col 1
1696 pB[0+bs*1] = B[0+lda*1];
1697 pB[1+bs*1] = B[1+lda*1];
1698 pB[2+bs*1] = B[2+lda*1];
1699 pB[3+bs*1] = B[3+lda*1];
1700 // col 2
1701 pB[0+bs*2] = B[0+lda*2];
1702 pB[1+bs*2] = B[1+lda*2];
1703 pB[2+bs*2] = B[2+lda*2];
1704 pB[3+bs*2] = B[3+lda*2];
1705 // col 3
1706 pB[0+bs*3] = B[0+lda*3];
1707 pB[1+bs*3] = B[1+lda*3];
1708 pB[2+bs*3] = B[2+lda*3];
1709 pB[3+bs*3] = B[3+lda*3];
1710 // update
1711 B += 4;
1712 pB += bs*sda;
1713 }
1714 #endif
1715 for( ; ii<m; ii++)
1716 {
1717 // col 0
1718 pB[0+bs*0] = B[0+lda*0];
1719 // col 1
1720 pB[0+bs*1] = B[0+lda*1];
1721 // col 2
1722 pB[0+bs*2] = B[0+lda*2];
1723 // col 3
1724 pB[0+bs*3] = B[0+lda*3];
1725 // update
1726 B += 1;
1727 pB += 1;
1728 }
1729 }
1730 for( ; jj<n; jj++)
1731 {
1732
1733 B = A + jj*lda;
1734 pB = pA + jj*bs;
1735
1736 ii = 0;
1737 if(m0>0)
1738 {
1739 for( ; ii<m0; ii++)
1740 {
1741 pB[ii+bs*0] = B[ii+lda*0];
1742 }
1743 B += m0;
1744 pB += m0 + bs*(sda-1);
1745 }
1746 for( ; ii<m-3; ii+=4)
1747 {
1748 // col 0
1749 pB[0+bs*0] = B[0+lda*0];
1750 pB[1+bs*0] = B[1+lda*0];
1751 pB[2+bs*0] = B[2+lda*0];
1752 pB[3+bs*0] = B[3+lda*0];
1753 // update
1754 B += 4;
1755 pB += bs*sda;
1756 }
1757 for( ; ii<m; ii++)
1758 {
1759 // col 0
1760 pB[0+bs*0] = B[0+lda*0];
1761 // update
1762 B += 1;
1763 pB += 1;
1764 }
1765 }
1766 return;
1767 }
1768
1769
1770
1771 // convert a lower triangular matrix into a matrix structure
1772 // TODO vectorize triangle in the 4x4 diag blocks
1773 void blasfeo_pack_l_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
1774 {
1775 if(m<=0 || n<=0)
1776 return;
1777
1778 // invalidate stored inverse diagonal
1779 sA->use_dA = 0;
1780
1781 const int bs = 4;
1782 int sda = sA->cn;
1783 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1784 int i, ii, j, jj, m0, m1, m2;
1785 double *B, *pB;
1786 sA->use_dA = 0;
1787
1788 // row vector in sA
1789 if(m==1)
1790 {
1791 for(jj=0; jj<n; jj++)
1792 {
1793 pA[jj*bs] = A[jj*lda];
1794 }
1795 return;
1796 }
1797
1798 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1799 __m256d
1800 tmp;
1801 #endif
1802 m0 = (bs-ai%bs)%bs;
1803 if(m0>m)
1804 m0 = m;
1805 m1 = m - m0;
1806 if(m0>0)
1807 {
1808 printf("\nblasfeo_pack_l_dmat: feature not implemented yet: ai!=0\n");
1809 exit(1);
1810 }
1811 jj = 0;
1812 for( ; jj<n-3; jj+=4)
1813 {
1814 B = A + jj + jj*lda;
1815 pB = pA + jj*bs + jj*sda;
1816 ii = jj;
1817 // col 0
1818 pB[0+bs*0] = B[0+lda*0];
1819 pB[1+bs*0] = B[1+lda*0];
1820 pB[2+bs*0] = B[2+lda*0];
1821 pB[3+bs*0] = B[3+lda*0];
1822 // col 1
1823 // pB[0+bs*1] = B[0+lda*1];
1824 pB[1+bs*1] = B[1+lda*1];
1825 pB[2+bs*1] = B[2+lda*1];
1826 pB[3+bs*1] = B[3+lda*1];
1827 // col 2
1828 // pB[0+bs*2] = B[0+lda*2];
1829 // pB[1+bs*2] = B[1+lda*2];
1830 pB[2+bs*2] = B[2+lda*2];
1831 pB[3+bs*2] = B[3+lda*2];
1832 // col 3
1833 // pB[0+bs*3] = B[0+lda*3];
1834 // pB[1+bs*3] = B[1+lda*3];
1835 // pB[2+bs*3] = B[2+lda*3];
1836 pB[3+bs*3] = B[3+lda*3];
1837 B += 4;
1838 pB += bs*sda;
1839 ii += 4;
1840 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1841 for( ; ii<m-3; ii+=4)
1842 {
1843 tmp = _mm256_loadu_pd( &B[0+lda*0] );
1844 _mm256_store_pd( &pB[0+bs*0], tmp );
1845 tmp = _mm256_loadu_pd( &B[0+lda*1] );
1846 _mm256_store_pd( &pB[0+bs*1], tmp );
1847 tmp = _mm256_loadu_pd( &B[0+lda*2] );
1848 _mm256_store_pd( &pB[0+bs*2], tmp );
1849 tmp = _mm256_loadu_pd( &B[0+lda*3] );
1850 _mm256_store_pd( &pB[0+bs*3], tmp );
1851 B += 4;
1852 pB += bs*sda;
1853 }
1854 #else
1855 for( ; ii<m-3; ii+=4)
1856 {
1857 // col 0
1858 pB[0+bs*0] = B[0+lda*0];
1859 pB[1+bs*0] = B[1+lda*0];
1860 pB[2+bs*0] = B[2+lda*0];
1861 pB[3+bs*0] = B[3+lda*0];
1862 // col 1
1863 pB[0+bs*1] = B[0+lda*1];
1864 pB[1+bs*1] = B[1+lda*1];
1865 pB[2+bs*1] = B[2+lda*1];
1866 pB[3+bs*1] = B[3+lda*1];
1867 // col 2
1868 pB[0+bs*2] = B[0+lda*2];
1869 pB[1+bs*2] = B[1+lda*2];
1870 pB[2+bs*2] = B[2+lda*2];
1871 pB[3+bs*2] = B[3+lda*2];
1872 // col 3
1873 pB[0+bs*3] = B[0+lda*3];
1874 pB[1+bs*3] = B[1+lda*3];
1875 pB[2+bs*3] = B[2+lda*3];
1876 pB[3+bs*3] = B[3+lda*3];
1877 // update
1878 B += 4;
1879 pB += bs*sda;
1880 }
1881 #endif
1882 for( ; ii<m; ii++)
1883 {
1884 // col 0
1885 pB[0+bs*0] = B[0+lda*0];
1886 // col 1
1887 pB[0+bs*1] = B[0+lda*1];
1888 // col 2
1889 pB[0+bs*2] = B[0+lda*2];
1890 // col 3
1891 pB[0+bs*3] = B[0+lda*3];
1892 // update
1893 B += 1;
1894 pB += 1;
1895 }
1896 }
1897 if(jj<n)
1898 {
1899 B = A + jj + jj*lda;
1900 pB = pA + jj*bs + jj*sda;
1901 if(n-jj==1)
1902 {
1903 // col 0
1904 pB[0+bs*0] = B[0+lda*0];
1905 }
1906 else if(n-jj==2)
1907 {
1908 // col 0
1909 pB[0+bs*0] = B[0+lda*0];
1910 pB[1+bs*0] = B[1+lda*0];
1911 // col 1
1912 // pB[0+bs*1] = B[0+lda*1];
1913 pB[1+bs*1] = B[1+lda*1];
1914 }
1915 else //if(n-jj==3)
1916 {
1917 // col 0
1918 pB[0+bs*0] = B[0+lda*0];
1919 pB[1+bs*0] = B[1+lda*0];
1920 pB[2+bs*0] = B[2+lda*0];
1921 // col 1
1922 // pB[0+bs*1] = B[0+lda*1];
1923 pB[1+bs*1] = B[1+lda*1];
1924 pB[2+bs*1] = B[2+lda*1];
1925 // col 2
1926 // pB[0+bs*2] = B[0+lda*2];
1927 // pB[1+bs*2] = B[1+lda*2];
1928 pB[2+bs*2] = B[2+lda*2];
1929 }
1930 }
1931 return;
1932 }
1933
1934
1935
1936 // convert an upper triangular matrix into a matrix structure
1937 // TODO vectorize triangle in the 4x4 diag blocks
1938 void blasfeo_pack_u_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
1939 {
1940 if(m<=0 || n<=0)
1941 return;
1942
1943 // invalidate stored inverse diagonal
1944 sA->use_dA = 0;
1945
1946 const int bs = 4;
1947 int sda = sA->cn;
1948 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1949 int i, ii, j, jj, m0, m1, m2;
1950 double *B, *pB;
1951 sA->use_dA = 0;
1952
1953 // row vector in sA
1954 if(m==1)
1955 {
1956 for(jj=0; jj<n; jj++)
1957 {
1958 pA[jj*bs] = A[jj*lda];
1959 }
1960 return;
1961 }
1962
1963 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1964 __m256d
1965 tmp;
1966 #endif
1967 m0 = (bs-ai%bs)%bs;
1968 if(m0>m)
1969 m0 = m;
1970 m1 = m - m0;
1971 jj = 0;
1972 for( ; jj<n-3; jj+=4)
1973 {
1974 B = A + jj*lda;
1975 pB = pA + jj*bs;
1976 ii = 0;
1977 if(m0>0)
1978 {
1979 for( ; ii<m0; ii++)
1980 {
1981 pB[ii+bs*0] = B[ii+lda*0];
1982 pB[ii+bs*1] = B[ii+lda*1];
1983 pB[ii+bs*2] = B[ii+lda*2];
1984 pB[ii+bs*3] = B[ii+lda*3];
1985 }
1986 B += m0;
1987 pB += m0 + bs*(sda-1);
1988 }
1989 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1990 for( ; ii<jj-3; ii+=4)
1991 {
1992 tmp = _mm256_loadu_pd( &B[0+lda*0] );
1993 _mm256_store_pd( &pB[0+bs*0], tmp );
1994 tmp = _mm256_loadu_pd( &B[0+lda*1] );
1995 _mm256_store_pd( &pB[0+bs*1], tmp );
1996 tmp = _mm256_loadu_pd( &B[0+lda*2] );
1997 _mm256_store_pd( &pB[0+bs*2], tmp );
1998 tmp = _mm256_loadu_pd( &B[0+lda*3] );
1999 _mm256_store_pd( &pB[0+bs*3], tmp );
2000 B += 4;
2001 pB += bs*sda;
2002 }
2003 #else
2004 for( ; ii<jj-3; ii+=4)
2005 {
2006 // col 0
2007 pB[0+bs*0] = B[0+lda*0];
2008 pB[1+bs*0] = B[1+lda*0];
2009 pB[2+bs*0] = B[2+lda*0];
2010 pB[3+bs*0] = B[3+lda*0];
2011 // col 1
2012 pB[0+bs*1] = B[0+lda*1];
2013 pB[1+bs*1] = B[1+lda*1];
2014 pB[2+bs*1] = B[2+lda*1];
2015 pB[3+bs*1] = B[3+lda*1];
2016 // col 2
2017 pB[0+bs*2] = B[0+lda*2];
2018 pB[1+bs*2] = B[1+lda*2];
2019 pB[2+bs*2] = B[2+lda*2];
2020 pB[3+bs*2] = B[3+lda*2];
2021 // col 3
2022 pB[0+bs*3] = B[0+lda*3];
2023 pB[1+bs*3] = B[1+lda*3];
2024 pB[2+bs*3] = B[2+lda*3];
2025 pB[3+bs*3] = B[3+lda*3];
2026 // update
2027 B += 4;
2028 pB += bs*sda;
2029 }
2030 #endif
2031 // col 0
2032 pB[0+bs*0] = B[0+lda*0];
2033 // pB[1+bs*0] = B[1+lda*0];
2034 // pB[2+bs*0] = B[2+lda*0];
2035 // pB[3+bs*0] = B[3+lda*0];
2036 // col 1
2037 pB[0+bs*1] = B[0+lda*1];
2038 pB[1+bs*1] = B[1+lda*1];
2039 // pB[2+bs*1] = B[2+lda*1];
2040 // pB[3+bs*1] = B[3+lda*1];
2041 // col 2
2042 pB[0+bs*2] = B[0+lda*2];
2043 pB[1+bs*2] = B[1+lda*2];
2044 pB[2+bs*2] = B[2+lda*2];
2045 // pB[3+bs*2] = B[3+lda*2];
2046 // col 3
2047 pB[0+bs*3] = B[0+lda*3];
2048 pB[1+bs*3] = B[1+lda*3];
2049 pB[2+bs*3] = B[2+lda*3];
2050 pB[3+bs*3] = B[3+lda*3];
2051 }
2052 for( ; jj<n; jj++)
2053 {
2054
2055 B = A + jj*lda;
2056 pB = pA + jj*bs;
2057
2058 ii = 0;
2059 if(m0>0)
2060 {
2061 for( ; ii<m0; ii++)
2062 {
2063 pB[ii+bs*0] = B[ii+lda*0];
2064 }
2065 B += m0;
2066 pB += m0 + bs*(sda-1);
2067 }
2068 for( ; ii<jj-3; ii+=4)
2069 {
2070 // col 0
2071 pB[0+bs*0] = B[0+lda*0];
2072 pB[1+bs*0] = B[1+lda*0];
2073 pB[2+bs*0] = B[2+lda*0];
2074 pB[3+bs*0] = B[3+lda*0];
2075 // update
2076 B += 4;
2077 pB += bs*sda;
2078 }
2079 for( ; ii<=jj; ii++)
2080 {
2081 // col 0
2082 pB[0+bs*0] = B[0+lda*0];
2083 // update
2084 B += 1;
2085 pB += 1;
2086 }
2087 }
2088 return;
2089 }
2090
2091
2092
2093 // convert and transpose a matrix into a matrix structure
2094 void blasfeo_pack_tran_dmat(int m, int n, double *A, int lda, struct blasfeo_dmat *sA, int ai, int aj)
2095 {
2096
2097 // invalidate stored inverse diagonal
2098 sA->use_dA = 0;
2099
2100 const int bs = 4;
2101 int sda = sA->cn;
2102 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
2103 int i, ii, j, m0, m1, m2;
2104 double *B, *pB;
2105 sA->use_dA = 0;
2106
2107 // row vector in sA
2108 if(n==1)
2109 {
2110 for(ii=0; ii<m; ii++)
2111 {
2112 pA[ii*bs] = A[ii];
2113 }
2114 return;
2115 }
2116
2117 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2118 __m256d
2119 v0, v1, v2, v3,
2120 v4, v5, v6, v7;
2121 #endif
2122 m0 = (bs-ai%bs)%bs;
2123 if(m0>n)
2124 m0 = n;
2125 m1 = n - m0;
2126 ii = 0;
2127 if(m0>0)
2128 {
2129 for(j=0; j<m; j++)
2130 {
2131 for(i=0; i<m0; i++)
2132 {
2133 pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
2134 }
2135 }
2136 A += m0*lda;
2137 pA += m0 + bs*(sda-1);
2138 }
2139 ii = 0;
2140 for(; ii<m1-3; ii+=bs)
2141 {
2142 j=0;
2143 B = A + ii*lda;
2144 pB = pA + ii*sda;
2145 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2146 for(; j<m-3; j+=4)
2147 {
2148 v0 = _mm256_loadu_pd( &B[0+0*lda] ); // 00 10 20 30
2149 v1 = _mm256_loadu_pd( &B[0+1*lda] ); // 01 11 21 31
2150 v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
2151 v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
2152 v2 = _mm256_loadu_pd( &B[0+2*lda] ); // 02 12 22 32
2153 v3 = _mm256_loadu_pd( &B[0+3*lda] ); // 03 13 23 33
2154 v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
2155 v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
2156
2157 B += 4;
2158
2159 v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
2160 _mm256_store_pd( &pB[0+bs*0], v0 );
2161 v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
2162 _mm256_store_pd( &pB[0+bs*2], v2 );
2163 v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
2164 _mm256_store_pd( &pB[0+bs*1], v1 );
2165 v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
2166 _mm256_store_pd( &pB[0+bs*3], v3 );
2167
2168 pB += 4*bs;
2169 }
2170 #else
2171 for(; j<m-3; j+=4)
2172 {
2173 // unroll 0
2174 pB[0+0*bs] = B[0+0*lda];
2175 pB[1+0*bs] = B[0+1*lda];
2176 pB[2+0*bs] = B[0+2*lda];
2177 pB[3+0*bs] = B[0+3*lda];
2178 // unroll 1
2179 pB[0+1*bs] = B[1+0*lda];
2180 pB[1+1*bs] = B[1+1*lda];
2181 pB[2+1*bs] = B[1+2*lda];
2182 pB[3+1*bs] = B[1+3*lda];
2183 // unroll 2
2184 pB[0+2*bs] = B[2+0*lda];
2185 pB[1+2*bs] = B[2+1*lda];
2186 pB[2+2*bs] = B[2+2*lda];
2187 pB[3+2*bs] = B[2+3*lda];
2188 // unroll 3
2189 pB[0+3*bs] = B[3+0*lda];
2190 pB[1+3*bs] = B[3+1*lda];
2191 pB[2+3*bs] = B[3+2*lda];
2192 pB[3+3*bs] = B[3+3*lda];
2193 B += 4;
2194 pB += 4*bs;
2195 }
2196 #endif
2197 for(; j<m; j++)
2198 {
2199 // unroll 0
2200 pB[0+0*bs] = B[0+0*lda];
2201 pB[1+0*bs] = B[0+1*lda];
2202 pB[2+0*bs] = B[0+2*lda];
2203 pB[3+0*bs] = B[0+3*lda];
2204 B += 1;
2205 pB += 1*bs;
2206 }
2207 }
2208 if(ii<m1)
2209 {
2210 m2 = m1-ii;
2211 if(bs<m2) m2 = bs;
2212 for(j=0; j<m; j++)
2213 {
2214 for(i=0; i<m2; i++)
2215 {
2216 pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
2217 }
2218 }
2219 }
2220 return;
2221 }
2222
2223
2224
2225 // convert a vector into a vector structure
2226 void blasfeo_pack_dvec(int m, double *a, struct blasfeo_dvec *sa, int ai)
2227 {
2228 double *pa = sa->pa + ai;
2229 int ii;
2230 for(ii=0; ii<m; ii++)
2231 pa[ii] = a[ii];
2232 return;
2233 }
2234
2235
2236
2237 // convert a matrix structure into a matrix
2238 void blasfeo_unpack_dmat(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, double *A, int lda)
2239 {
2240 const int bs = 4;
2241 int sda = sA->cn;
2242 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
2243 int i, ii, jj;
2244 int m0 = (bs-ai%bs)%bs;
2245 if(m0>m)
2246 m0 = m;
2247 double *ptr_pA;
2248
2249 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2250 __m256d
2251 tmp;
2252 #endif
2253
2254 jj=0;
2255 for(; jj<n-3; jj+=4)
2256 {
2257 ptr_pA = pA + jj*bs;
2258 ii = 0;
2259 if(m0>0)
2260 {
2261 for(; ii<m0; ii++)
2262 {
2263 // unroll 0
2264 A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
2265 // unroll 1
2266 A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
2267 // unroll 2
2268 A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
2269 // unroll 3
2270 A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
2271 ptr_pA++;
2272 }
2273 ptr_pA += (sda-1)*bs;
2274 }
2275 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2276 for(; ii<m-bs+1; ii+=bs)
2277 {
2278 tmp = _mm256_load_pd( &ptr_pA[0+bs*0] );
2279 _mm256_storeu_pd( &A[ii+lda*(jj+0)], tmp );
2280 tmp = _mm256_load_pd( &ptr_pA[0+bs*1] );
2281 _mm256_storeu_pd( &A[ii+lda*(jj+1)], tmp );
2282 tmp = _mm256_load_pd( &ptr_pA[0+bs*2] );
2283 _mm256_storeu_pd( &A[ii+lda*(jj+2)], tmp );
2284 tmp = _mm256_load_pd( &ptr_pA[0+bs*3] );
2285 _mm256_storeu_pd( &A[ii+lda*(jj+3)], tmp );
2286 ptr_pA += sda*bs;
2287 }
2288 #else
2289 for(; ii<m-bs+1; ii+=bs)
2290 {
2291 // unroll 0
2292 A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
2293 A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
2294 A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
2295 A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
2296 // unroll 1
2297 A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
2298 A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
2299 A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
2300 A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
2301 // unroll 2
2302 A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
2303 A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
2304 A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
2305 A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
2306 // unroll 3
2307 A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
2308 A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
2309 A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
2310 A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
2311 ptr_pA += sda*bs;
2312 }
2313 #endif
2314 for(; ii<m; ii++)
2315 {
2316 // unroll 0
2317 A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
2318 // unroll 1
2319 A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
2320 // unroll 2
2321 A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
2322 // unroll 3
2323 A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
2324 ptr_pA++;
2325 }
2326 }
2327 for(; jj<n; jj++)
2328 {
2329 ptr_pA = pA + jj*bs;
2330 ii = 0;
2331 if(m0>0)
2332 {
2333 for(; ii<m0; ii++)
2334 {
2335 A[ii+lda*jj] = ptr_pA[0];
2336 ptr_pA++;
2337 }
2338 ptr_pA += (sda-1)*bs;
2339 }
2340 for(; ii<m-bs+1; ii+=bs)
2341 {
2342 A[0+ii+lda*jj] = ptr_pA[0];
2343 A[1+ii+lda*jj] = ptr_pA[1];
2344 A[2+ii+lda*jj] = ptr_pA[2];
2345 A[3+ii+lda*jj] = ptr_pA[3];
2346 ptr_pA += sda*bs;
2347 }
2348 for(; ii<m; ii++)
2349 {
2350 A[ii+lda*jj] = ptr_pA[0];
2351 ptr_pA++;
2352 }
2353 }
2354 return;
2355 }
2356
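/*
 * Round-trip sketch (illustrative only): copy a column-major matrix A into a
 * blasfeo_dmat created as above and back out again; m, n, lda>=m and ldb>=m
 * are assumed to be provided by the caller.
 *
 *     blasfeo_pack_dmat(m, n, A, lda, &sA, 0, 0);     // column-major -> panel-major
 *     blasfeo_unpack_dmat(m, n, &sA, 0, 0, B, ldb);   // panel-major -> column-major
 */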
2357
2358
2359 // convert and transpose a matrix structure into a matrix
2360 void blasfeo_unpack_tran_dmat(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, double *A, int lda)
2361 {
2362 const int bs = 4;
2363 int sda = sA->cn;
2364 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
2365 int i, ii, jj;
2366 int m0 = (bs-ai%bs)%bs;
2367 if(m0>m)
2368 m0 = m;
2369 double *ptr_pA;
2370
2371 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2372 __m256d
2373 v0, v1, v2, v3,
2374 v4, v5, v6, v7;
2375 #endif
2376
2377 jj=0;
2378 for(; jj<n-3; jj+=4)
2379 {
2380 ptr_pA = pA + jj*bs;
2381 ii = 0;
2382 if(m0>0)
2383 {
2384 for(; ii<m0; ii++)
2385 {
2386 // unroll 0
2387 A[jj+0+lda*ii] = ptr_pA[0+bs*0];
2388 // unroll 1
2389 A[jj+1+lda*ii] = ptr_pA[0+bs*1];
2390 // unroll 2
2391 A[jj+2+lda*ii] = ptr_pA[0+bs*2];
2392 // unroll 3
2393 A[jj+3+lda*ii] = ptr_pA[0+bs*3];
2394 ptr_pA++;
2395 }
2396 ptr_pA += (sda-1)*bs;
2397 }
2398 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2399 for(; ii<m-bs+1; ii+=bs)
2400 {
2401 v0 = _mm256_load_pd( &ptr_pA[0+bs*0] ); // 00 10 20 30
2402 v1 = _mm256_load_pd( &ptr_pA[0+bs*1] ); // 01 11 21 31
2403 v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
2404 v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
2405 v2 = _mm256_load_pd( &ptr_pA[0+bs*2] ); // 02 12 22 32
2406 v3 = _mm256_load_pd( &ptr_pA[0+bs*3] ); // 03 13 23 33
2407 v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
2408 v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
2409
2410 v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
2411 _mm256_storeu_pd( &A[jj+lda*(ii+0)], v0 );
2412 v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
2413 _mm256_storeu_pd( &A[jj+lda*(ii+2)], v2 );
2414 v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
2415 _mm256_storeu_pd( &A[jj+lda*(ii+1)], v1 );
2416 v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
2417 _mm256_storeu_pd( &A[jj+lda*(ii+3)], v3 );
2418
2419 ptr_pA += sda*bs;
2420 }
2421 #else
2422 for(; ii<m-bs+1; ii+=bs)
2423 {
2424 // unroll 0
2425 A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
2426 A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
2427 A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
2428 A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
2429 // unroll 1
2430 A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
2431 A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
2432 A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
2433 A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
2434 // unroll 2
2435 A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
2436 A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
2437 A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
2438 A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
2439 // unroll 3
2440 A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
2441 A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
2442 A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
2443 A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
2444 ptr_pA += sda*bs;
2445 }
2446 #endif
2447 for(; ii<m; ii++)
2448 {
2449 // unroll 0
2450 A[jj+0+lda*ii] = ptr_pA[0+bs*0];
2451 // unroll 1
2452 A[jj+1+lda*ii] = ptr_pA[0+bs*1];
2453 // unroll 2
2454 A[jj+2+lda*ii] = ptr_pA[0+bs*2];
2455 // unroll 3
2456 A[jj+3+lda*ii] = ptr_pA[0+bs*3];
2457 ptr_pA++;
2458 }
2459 }
2460 for(; jj<n; jj++)
2461 {
2462 ptr_pA = pA + jj*bs;
2463 ii = 0;
2464 if(m0>0)
2465 {
2466 for(; ii<m0; ii++)
2467 {
2468 A[jj+lda*ii] = ptr_pA[0];
2469 ptr_pA++;
2470 }
2471 ptr_pA += (sda-1)*bs;
2472 }
2473 for(; ii<m-bs+1; ii+=bs)
2474 {
2475 i=0;
2476 for(; i<bs; i++)
2477 {
2478 A[jj+lda*(i+ii)] = ptr_pA[0];
2479 ptr_pA++;
2480 }
2481 ptr_pA += (sda-1)*bs;
2482 }
2483 for(; ii<m; ii++)
2484 {
2485 A[jj+lda*ii] = ptr_pA[0];
2486 ptr_pA++;
2487 }
2488 }
2489 return;
2490 }
2491
2492
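// Usage sketch (illustrative only, not part of the library): unpack the transpose of the
// m x n submatrix of sA starting at (ai, aj) into a plain column-major buffer; the result
// is n x m, so the buffer and its leading dimension n below are assumptions of the example.
//
//     double *C = malloc(n*m*sizeof(double));
//     blasfeo_unpack_tran_dmat(m, n, &sA, ai, aj, C, n);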
2493
2494 // convert a vector structure into a vector
2495 void blasfeo_unpack_dvec(int m, struct blasfeo_dvec *sa, int ai, double *a)
2496 {
2497 double *pa = sa->pa + ai;
2498 int ii;
2499 for(ii=0; ii<m; ii++)
2500 a[ii] = pa[ii];
2501 return;
2502 }
2503
2504
2505
2506 // cast a matrix into a matrix structure
2507 void d_cast_mat2strmat(double *A, struct blasfeo_dmat *sA)
2508 {
2509
2510 // invalidate stored inverse diagonal
2511 sA->use_dA = 0;
2512
2513 sA->pA = A;
2514 return;
2515 }
2516
2517
2518
2519 // cast a matrix into the diagonal of a matrix structure
2520 void d_cast_diag_mat2strmat(double *dA, struct blasfeo_dmat *sA)
2521 {
2522
2523 // invalidate stored inverse diagonal
2524 sA->use_dA = 0;
2525
2526 sA->dA = dA;
2527 return;
2528 }
2529
2530
2531
2532 // cast a vector into a vector structure
2533 void d_cast_vec2vecmat(double *a, struct blasfeo_dvec *sa)
2534 {
2535 sa->pa = a;
2536 return;
2537 }
2538
2539
2540
2541 // insert element into strmat
2542 void blasfeo_dgein1(double a, struct blasfeo_dmat *sA, int ai, int aj)
2543 {
2544
2545 if (ai==aj)
2546 {
2547 // invalidate stored inverse diagonal
2548 sA->use_dA = 0;
2549 }
2550
2551 const int bs = 4;
2552 int sda = sA->cn;
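	// panel-major addressing: ai/bs*bs*sda skips the full 4-row panels above the element,
	// ai%bs is the row inside the panel, aj*bs is the column offset within the panel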
2553 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2554 pA[0] = a;
2555 return;
2556 }
2557
2558
2559
2560 // extract element from strmat
2561 double blasfeo_dgeex1(struct blasfeo_dmat *sA, int ai, int aj)
2562 {
2563 const int bs = 4;
2564 int sda = sA->cn;
2565 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2566 return pA[0];
2567 }
2568
2569
2570
2571 // insert element into strvec
2572 void blasfeo_dvecin1(double a, struct blasfeo_dvec *sx, int xi)
2573 {
2574 const int bs = 4;
2575 double *x = sx->pa + xi;
2576 x[0] = a;
2577 return;
2578 }
2579
2580
2581
2582 // extract element from strvec
2583 double blasfeo_dvecex1(struct blasfeo_dvec *sx, int xi)
2584 {
2585 const int bs = 4;
2586 double *x = sx->pa + xi;
2587 return x[0];
2588 }
2589
2590
2591
2592 // set all elements of a strmat to a value
2593 void blasfeo_dgese(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
2594 {
2595
2596 // invalidate stored inverse diagonal
2597 sA->use_dA = 0;
2598
2599 const int bs = 4;
2600 int sda = sA->cn;
2601 double *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
2602 int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
2603 int ii, jj;
2604 if(m0>0)
2605 {
2606 for(ii=0; ii<m0; ii++)
2607 {
2608 for(jj=0; jj<n; jj++)
2609 {
2610 pA[jj*bs] = alpha;
2611 }
2612 pA += 1;
2613 }
2614 pA += bs*(sda-1);
2615 m -= m0;
2616 }
2617 for(ii=0; ii<m-3; ii+=4)
2618 {
2619 for(jj=0; jj<n; jj++)
2620 {
2621 pA[0+jj*bs] = alpha;
2622 pA[1+jj*bs] = alpha;
2623 pA[2+jj*bs] = alpha;
2624 pA[3+jj*bs] = alpha;
2625 }
2626 pA += bs*sda;
2627 }
2628 for( ; ii<m; ii++)
2629 {
2630 for(jj=0; jj<n; jj++)
2631 {
2632 pA[jj*bs] = alpha;
2633 }
2634 pA += 1;
2635 }
2636 return;
2637 }
2638
2639
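// Usage sketch (illustrative only): zero out the m x n submatrix of sA starting at (ai, aj).
//
//     blasfeo_dgese(m, n, 0.0, &sA, ai, aj);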
2640
2641 // set all elements of a strvec to a value
2642 void blasfeo_dvecse(int m, double alpha, struct blasfeo_dvec *sx, int xi)
2643 {
2644 double *x = sx->pa + xi;
2645 int ii;
2646 for(ii=0; ii<m; ii++)
2647 x[ii] = alpha;
2648 return;
2649 }
2650
2651
2652
2653 // insert a vector into diagonal
2654 void blasfeo_ddiain(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
2655 {
2656
2657 // invalidate stored inverse diagonal
2658 sA->use_dA = 0;
2659
2660 const int bs = 4;
2661 int sda = sA->cn;
2662 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2663 double *x = sx->pa + xi;
2664 int offsetA = ai%bs;
2665
2666 int kna = (bs-offsetA%bs)%bs;
2667 kna = kmax<kna ? kmax : kna;
2668
2669 int jj, ll;
2670
2671 if(kna>0)
2672 {
2673 for(ll=0; ll<kna; ll++)
2674 {
2675 pA[ll+bs*ll] = alpha*x[ll];
2676 }
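		// jump to the next diagonal element: kna rows down (the row step crosses into the
		// next panel, hence the extra bs*(sda-1)) and kna columns to the right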
2677 pA += kna + bs*(sda-1) + kna*bs;
2678 x += kna;
2679 kmax -= kna;
2680 }
2681 for(jj=0; jj<kmax-3; jj+=4)
2682 {
2683 pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
2684 pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
2685 pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
2686 pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
2687 }
2688 for(ll=0; ll<kmax-jj; ll++)
2689 {
2690 pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
2691 }
2692 return;
2693 }
2694
2695
2696
2697 // add scalar to diagonal
2698 void blasfeo_ddiare(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
2699 {
2700
2701 // invalidate stored inverse diagonal
2702 sA->use_dA = 0;
2703
2704 const int bs = 4;
2705 int sda = sA->cn;
2706 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2707 int offsetA = ai%bs;
2708
2709 int kna = (bs-offsetA%bs)%bs;
2710 kna = kmax<kna ? kmax : kna;
2711
2712 int jj, ll;
2713
2714 if(kna>0)
2715 {
2716 for(ll=0; ll<kna; ll++)
2717 {
2718 pA[ll+bs*ll] += alpha;
2719 }
2720 pA += kna + bs*(sda-1) + kna*bs;
2721 kmax -= kna;
2722 }
2723 for(jj=0; jj<kmax-3; jj+=4)
2724 {
2725 pA[jj*sda+(jj+0)*bs+0] += alpha;
2726 pA[jj*sda+(jj+1)*bs+1] += alpha;
2727 pA[jj*sda+(jj+2)*bs+2] += alpha;
2728 pA[jj*sda+(jj+3)*bs+3] += alpha;
2729 }
2730 for(ll=0; ll<kmax-jj; ll++)
2731 {
2732 pA[jj*sda+(jj+ll)*bs+ll] += alpha;
2733 }
2734 return;
2735 }
2736
2737
2738
2739 // swap two rows of two matrix structs
2740 void blasfeo_drowsw(int kmax, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
2741 {
2742
2743 // invalidate stored inverse diagonal
2744 sA->use_dA = 0;
2745 sC->use_dA = 0;
2746
2747 const int bs = 4;
2748 int sda = sA->cn;
2749 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2750 int sdc = sC->cn;
2751 double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
2752 kernel_drowsw_lib4(kmax, pA, pC);
2753 return;
2754 }
2755
2756
2757
2758 // permute the rows of a matrix struct
2759 void blasfeo_drowpe(int kmax, int *ipiv, struct blasfeo_dmat *sA)
2760 {
2761
2762 // invalidate stored inverse diagonal
2763 sA->use_dA = 0;
2764
2765 int ii;
2766 for(ii=0; ii<kmax; ii++)
2767 {
2768 if(ipiv[ii]!=ii)
2769 blasfeo_drowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
2770 }
2771 return;
2772 }
2773
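// Usage sketch (illustrative only): apply a 0-based pivot sequence ipiv[0..kmax-1], e.g. as
// produced by an LU factorization with row pivoting, to the rows of sA.
//
//     blasfeo_drowpe(kmax, ipiv, &sA);
//
// blasfeo_drowpei below applies the same swaps in reverse order, undoing the permutation.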
2774
2775 // inverse permute the rows of a matrix struct
2776 void blasfeo_drowpei(int kmax, int *ipiv, struct blasfeo_dmat *sA)
2777 {
2778
2779 // invalidate stored inverse diagonal
2780 sA->use_dA = 0;
2781
2782 int ii;
2783 for(ii=kmax-1; ii>=0; ii--)
2784 {
2785 if(ipiv[ii]!=ii)
2786 blasfeo_drowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
2787 }
2788 return;
2789 }
2790
2791
2792 // extract a row into a vector
2793 void blasfeo_drowex(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi)
2794 {
2795 const int bs = 4;
2796 int sda = sA->cn;
2797 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2798 double *x = sx->pa + xi;
2799 drowex_lib(kmax, alpha, pA, x);
2800 return;
2801 }
2802
2803
2804
2805 // insert a vector into a row
2806 void blasfeo_drowin(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
2807 {
2808
2809 // invalidate stored inverse diagonal
2810 sA->use_dA = 0;
2811
2812 const int bs = 4;
2813 int sda = sA->cn;
2814 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2815 double *x = sx->pa + xi;
2816 drowin_lib(kmax, alpha, x, pA);
2817 return;
2818 }
2819
2820
2821
2822 // add a vector to a row
2823 void blasfeo_drowad(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
2824 {
2825
2826 // invalidate stored inverse diagonal
2827 sA->use_dA = 0;
2828
2829 const int bs = 4;
2830 int sda = sA->cn;
2831 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2832 double *x = sx->pa + xi;
2833 drowad_lib(kmax, alpha, x, pA);
2834 return;
2835 }
2836
2837
2838
2839 // extract vector from column
2840 void blasfeo_dcolex(int kmax, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi)
2841 {
2842
2843 // invalidate stored inverse diagonal
2844 sA->use_dA = 0;
2845
2846 const int bs = 4;
2847 int sda = sA->cn;
2848 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2849 double *x = sx->pa + xi;
2850 dcolex_lib(kmax, ai%bs, pA, sda, x);
2851 return;
2852 }
2853
2854
2855
2856 // insert a vector as a column
2857 void blasfeo_dcolin(int kmax, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
2858 {
2859
2860 // invalidate stored inverse diagonal
2861 sA->use_dA = 0;
2862
2863 const int bs = 4;
2864 int sda = sA->cn;
2865 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2866 double *x = sx->pa + xi;
2867 dcolin_lib(kmax, x, ai%bs, pA, sda);
2868 return;
2869 }
2870
2871
2872
2873 // add scaled vector to column
2874 void blasfeo_dcolad(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
2875 {
2876
2877 // invalidate stored inverse diagonal
2878 sA->use_dA = 0;
2879
2880 const int bs = 4;
2881 int sda = sA->cn;
2882 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2883 double *x = sx->pa + xi;
2884 dcolad_lib(kmax, alpha, x, ai%bs, pA, sda);
2885 return;
2886 }
2887
2888
2889
2890 // scale a column
2891 void blasfeo_dcolsc(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
2892 {
2893
2894 // invalidate stored inverse diagonal
2895 sA->use_dA = 0;
2896
2897 const int bs = 4;
2898
2899 int sda = sA->cn;
2900 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2901
2902 int kna = (bs-ai%bs)%bs;
2903 kna = kmax<kna ? kmax : kna;
2904
2905 int jj, ll;
2906
2907 if(kna>0)
2908 {
2909 for(ll=0; ll<kna; ll++)
2910 {
2911 pA[ll] *= alpha;
2912 }
2913 pA += kna + bs*(sda-1);
2914 kmax -= kna;
2915 }
2916 for(jj=0; jj<kmax-3; jj+=4)
2917 {
2918 pA[jj*sda+0] *= alpha;
2919 pA[jj*sda+1] *= alpha;
2920 pA[jj*sda+2] *= alpha;
2921 pA[jj*sda+3] *= alpha;
2922 }
2923 for(ll=0; ll<kmax-jj; ll++)
2924 {
2925 pA[jj*sda+ll] *= alpha;
2926 }
2927
2928 return;
2929 }
2930
2931
2932
2933 // swap two cols of two matrix structs
2934 void blasfeo_dcolsw(int kmax, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
2935 {
2936
2937 // invalidate stored inverse diagonal
2938 sA->use_dA = 0;
2939 sC->use_dA = 0;
2940
2941 const int bs = 4;
2942 int sda = sA->cn;
2943 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2944 int sdc = sC->cn;
2945 double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
2946 dcolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
2947 return;
2948 }
2949
2950
2951
2952 // permute the cols of a matrix struct
2953 void blasfeo_dcolpe(int kmax, int *ipiv, struct blasfeo_dmat *sA)
2954 {
2955
2956 // invalidate stored inverse diagonal
2957 sA->use_dA = 0;
2958
2959 int ii;
2960 for(ii=0; ii<kmax; ii++)
2961 {
2962 if(ipiv[ii]!=ii)
2963 blasfeo_dcolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
2964 }
2965 return;
2966 }
2967
2968
2969
2970 // inverse permute the cols of a matrix struct
2971 void blasfeo_dcolpei(int kmax, int *ipiv, struct blasfeo_dmat *sA)
2972 {
2973
2974 // invalidate stored inverse diagonal
2975 sA->use_dA = 0;
2976
2977 int ii;
2978 for(ii=kmax-1; ii>=0; ii--)
2979 {
2980 if(ipiv[ii]!=ii)
2981 blasfeo_dcolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
2982 }
2983 return;
2984 }
2985
2986
2987
2988 // copy a generic strmat into a generic strmat
2989 void blasfeo_dgecp(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
2990 {
2991
2992 // invalidate stored inverse diagonal
2993 sB->use_dA = 0;
2994
2995 const int bs = 4;
2996 const double alpha = 1.0;
2997
2998 // get submatrices
2999 int sda = sA->cn;
3000 double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3001 int sdb = sB->cn;
3002 double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;
3003
3004 if(m<=0 || n<=0)
3005 return;
3006
3007 int mna, ii;
3008
3009 // compute offset from closest panels
3010 int offA = ai%bs;
3011 int offB = bi%bs;
3012
3013 // A at the beginning of the block
3014 A -= offA;
3015 	// B at the beginning of the block
3016 B -= offB;
3017
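	// the copy is dispatched on the relative panel offset of A and B: equal offsets use the
	// aligned _x_0 kernels, the other cases use the _x_1/_x_2/_x_3 kernels, which read rows
	// of A across two consecutive panels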
3018 // same alignment
3019 if(offA==offB)
3020 {
3021 ii = 0;
3022 // clean up at the beginning
3023 mna = (4-offB)%bs;
3024 if(mna>0)
3025 {
3026 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3027 {
3028 if(m==1)
3029 {
3030 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3031 return;
3032 }
3033 else //if(m==2 && mna==3)
3034 {
3035 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3036 return;
3037 }
3038 }
3039 if(mna==1)
3040 {
3041 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3042 A += 4*sda;
3043 B += 4*sdb;
3044 ii += 1;
3045 }
3046 else if(mna==2)
3047 {
3048 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3049 A += 4*sda;
3050 B += 4*sdb;
3051 ii += 2;
3052 }
3053 else // if(mna==3)
3054 {
3055 kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
3056 A += 4*sda;
3057 B += 4*sdb;
3058 ii += 3;
3059 }
3060 }
3061 // main loop
3062 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3063 for(; ii<m-7; ii+=8)
3064 {
3065 kernel_dgecp_8_0_lib4(0, n, A, sda, B, sdb);
3066 A += 8*sda;
3067 B += 8*sdb;
3068 }
3069 #endif
3070 for(; ii<m-3; ii+=4)
3071 {
3072 kernel_dgecp_4_0_lib4(0, n, A, B);
3073 A += 4*sda;
3074 B += 4*sdb;
3075 }
3076 // clean up at the end
3077 if(ii<m)
3078 {
3079 if(m-ii==1)
3080 kernel_dgecpsc_1_0_lib4(0, n, alpha, A, B);
3081 else if(m-ii==2)
3082 kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B);
3083 else // if(m-ii==3)
3084 kernel_dgecpsc_3_0_lib4(0, n, alpha, A, B);
3085 }
3086 }
3087 // skip one element of A
3088 else if(offA==(offB+1)%bs)
3089 {
3090 ii = 0;
3091 // clean up at the beginning
3092 mna = (4-offB)%bs;
3093 if(mna>0)
3094 {
3095 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3096 {
3097 if(m==1)
3098 {
3099 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3100 return;
3101 }
3102 else //if(m==2 && mna==3)
3103 {
3104 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3105 return;
3106 }
3107 }
3108 if(mna==1)
3109 {
3110 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3111 //A += 4*sda;
3112 B += 4*sdb;
3113 ii += 1;
3114 }
3115 else if(mna==2)
3116 {
3117 kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+2);
3118 A += 4*sda;
3119 B += 4*sdb;
3120 ii += 2;
3121 }
3122 else // if(mna==3)
3123 {
3124 kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B+1);
3125 A += 4*sda;
3126 B += 4*sdb;
3127 ii += 3;
3128 }
3129 }
3130 // main loop
3131 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3132 for( ; ii<m-7; ii+=8)
3133 {
3134 kernel_dgecpsc_8_1_lib4(0, n, alpha, A, sda, B, sdb);
3135 A += 8*sda;
3136 B += 8*sdb;
3137 }
3138 #endif
3139 for( ; ii<m-3; ii+=4)
3140 {
3141 kernel_dgecpsc_4_1_lib4(0, n, alpha, A, sda, B);
3142 A += 4*sda;
3143 B += 4*sdb;
3144 }
3145 // clean up at the end
3146 if(ii<m)
3147 {
3148 if(m-ii==1)
3149 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B);
3150 else if(m-ii==2)
3151 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+1, B);
3152 else // if(m-ii==3)
3153 kernel_dgecpsc_3_0_lib4(0, n, alpha, A+1, B);
3154 }
3155 }
3156 // skip 2 elements of A
3157 else if(offA==(offB+2)%bs)
3158 {
3159 ii = 0;
3160 // clean up at the beginning
3161 mna = (4-offB)%bs;
3162 if(mna>0)
3163 {
3164 if(m<mna)
3165 {
3166 if(m==1)
3167 {
3168 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3169 return;
3170 }
3171 else // if(m==2 && mna==3)
3172 {
3173 kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+1);
3174 return;
3175 }
3176 }
3177 if(mna==1)
3178 {
3179 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B+3);
3180 // A += 4*sda;
3181 B += 4*sdb;
3182 ii += 1;
3183 }
3184 else if(mna==2)
3185 {
3186 kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B+2);
3187 // A += 4*sda;
3188 B += 4*sdb;
3189 ii += 2;
3190 }
3191 else // if(mna==3)
3192 {
3193 kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B+1);
3194 A += 4*sda;
3195 B += 4*sdb;
3196 ii += 3;
3197 }
3198 }
3199 // main loop
3200 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3201 for(; ii<m-7; ii+=8)
3202 {
3203 kernel_dgecpsc_8_2_lib4(0, n, alpha, A, sda, B, sdb);
3204 A += 8*sda;
3205 B += 8*sdb;
3206 }
3207 #endif
3208 for(; ii<m-3; ii+=4)
3209 {
3210 kernel_dgecpsc_4_2_lib4(0, n, alpha, A, sda, B);
3211 A += 4*sda;
3212 B += 4*sdb;
3213 }
3214 // clean up at the end
3215 if(ii<m)
3216 {
3217 if(m-ii==1)
3218 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+2, B);
3219 else if(m-ii==2)
3220 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+2, B);
3221 else // if(m-ii==3)
3222 kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B);
3223 }
3224 }
3225 // skip 3 elements of A
3226 else // if(offA==(offB+3)%bs)
3227 {
3228 ii = 0;
3229 // clean up at the beginning
3230 mna = (4-offB)%bs;
3231 if(mna>0)
3232 {
3233 if(m<mna)
3234 {
3235 if(m==1)
3236 {
3237 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3238 return;
3239 }
3240 else // if(m==2 && mna==3)
3241 {
3242 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3243 return;
3244 }
3245 }
3246 if(mna==1)
3247 {
3248 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3249 // A += 4*sda;
3250 B += 4*sdb;
3251 ii += 1;
3252 }
3253 else if(mna==2)
3254 {
3255 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3256 // A += 4*sda;
3257 B += 4*sdb;
3258 ii += 2;
3259 }
3260 else // if(mna==3)
3261 {
3262 kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
3263 // A += 4*sda;
3264 B += 4*sdb;
3265 ii += 3;
3266 }
3267 }
3268 // main loop
3269 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3270 for(; ii<m-7; ii+=8)
3271 {
3272 kernel_dgecpsc_8_3_lib4(0, n, alpha, A, sda, B, sdb);
3273 A += 8*sda;
3274 B += 8*sdb;
3275 }
3276 #endif
3277 for(; ii<m-3; ii+=4)
3278 {
3279 kernel_dgecpsc_4_3_lib4(0, n, alpha, A, sda, B);
3280 A += 4*sda;
3281 B += 4*sdb;
3282 }
3283 // clean up at the end
3284 if(ii<m)
3285 {
3286 if(m-ii==1)
3287 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+3, B);
3288 else if(m-ii==2)
3289 kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B);
3290 else // if(m-ii==3)
3291 kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B);
3292 }
3293 }
3294
3295 return;
3296 }
3297
3298
3299
3300 // copy a lower triangular strmat into a lower triangular strmat
3301 void blasfeo_dtrcp_l(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
3302 {
3303
3304 // invalidate stored inverse diagonal
3305 sB->use_dA = 0;
3306
3307 const int bs = 4;
3308 const double alpha = 1;
3309
3310 // get submatrices
3311 int sda = sA->cn;
3312 double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3313 int sdb = sB->cn;
3314 double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;
3315
3316 if(m<=0)
3317 return;
3318
3319 int n = m;
3320
3321 int mna, ii;
3322
3323 int offA = ai%bs;
3324 int offB = bi%bs;
3325
3326 // A at the beginning of the block
3327 A -= offA;
3328
3329 	// B at the beginning of the block
3330 B -= offB;
3331
3332 // same alignment
3333 if(offA==offB)
3334 {
3335 ii = 0;
3336 // clean up at the beginning
3337 mna = (4-offB)%bs;
3338 if(mna>0)
3339 {
3340 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3341 {
3342 if(m==1)
3343 {
3344 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3345 return;
3346 }
3347 else //if(m==2 && mna==3)
3348 {
3349 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3350 return;
3351 }
3352 }
3353 if(mna==1)
3354 {
3355 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3356 A += 4*sda;
3357 B += 4*sdb;
3358 ii += 1;
3359 }
3360 else if(mna==2)
3361 {
3362 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3363 A += 4*sda;
3364 B += 4*sdb;
3365 ii += 2;
3366 }
3367 else // if(mna==3)
3368 {
3369 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
3370 A += 4*sda;
3371 B += 4*sdb;
3372 ii += 3;
3373 }
3374 }
3375 // main loop
3376 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3377 for(; ii<m-7; ii+=8)
3378 {
3379 kernel_dgecp_8_0_lib4(1, ii, A, sda, B, sdb);
3380 A += 8*sda;
3381 B += 8*sdb;
3382 }
3383 #endif
3384 for(; ii<m-3; ii+=4)
3385 {
3386 kernel_dgecp_4_0_lib4(1, ii, A, B);
3387 A += 4*sda;
3388 B += 4*sdb;
3389 }
3390 // clean up at the end
3391 if(ii<m)
3392 {
3393 if(m-ii==1)
3394 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A, B);
3395 else if(m-ii==2)
3396 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B);
3397 else // if(m-ii==3)
3398 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A, B);
3399 }
3400 }
3401 // skip one element of A
3402 else if(offA==(offB+1)%bs)
3403 {
3404 ii = 0;
3405 // clean up at the beginning
3406 mna = (4-offB)%bs;
3407 if(mna>0)
3408 {
3409 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3410 {
3411 if(m==1)
3412 {
3413 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3414 return;
3415 }
3416 else //if(m==2 && mna==3)
3417 {
3418 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3419 return;
3420 }
3421 }
3422 if(mna==1)
3423 {
3424 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3425 //A += 4*sda;
3426 B += 4*sdb;
3427 ii += 1;
3428 }
3429 else if(mna==2)
3430 {
3431 kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+2);
3432 A += 4*sda;
3433 B += 4*sdb;
3434 ii += 2;
3435 }
3436 else // if(mna==3)
3437 {
3438 kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B+1);
3439 A += 4*sda;
3440 B += 4*sdb;
3441 ii += 3;
3442 }
3443 }
3444 // main loop
3445 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3446 for( ; ii<m-7; ii+=8)
3447 {
3448 kernel_dgecpsc_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
3449 A += 8*sda;
3450 B += 8*sdb;
3451 }
3452 #endif
3453 for( ; ii<m-3; ii+=4)
3454 {
3455 kernel_dgecpsc_4_1_lib4(1, ii, alpha, A, sda, B);
3456 A += 4*sda;
3457 B += 4*sdb;
3458 }
3459 // clean up at the end
3460 if(ii<m)
3461 {
3462 if(m-ii==1)
3463 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B);
3464 else if(m-ii==2)
3465 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+1, B);
3466 else // if(m-ii==3)
3467 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+1, B);
3468 }
3469 }
3470 // skip 2 elements of A
3471 else if(offA==(offB+2)%bs)
3472 {
3473 ii = 0;
3474 // clean up at the beginning
3475 mna = (4-offB)%bs;
3476 if(mna>0)
3477 {
3478 if(m<mna)
3479 {
3480 if(m==1)
3481 {
3482 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3483 return;
3484 }
3485 else // if(m==2 && mna==3)
3486 {
3487 kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+1);
3488 return;
3489 }
3490 }
3491 if(mna==1)
3492 {
3493 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B+3);
3494 // A += 4*sda;
3495 B += 4*sdb;
3496 ii += 1;
3497 }
3498 else if(mna==2)
3499 {
3500 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B+2);
3501 // A += 4*sda;
3502 B += 4*sdb;
3503 ii += 2;
3504 }
3505 else // if(mna==3)
3506 {
3507 kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B+1);
3508 A += 4*sda;
3509 B += 4*sdb;
3510 ii += 3;
3511 }
3512 }
3513 // main loop
3514 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3515 for(; ii<m-7; ii+=8)
3516 {
3517 kernel_dgecpsc_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
3518 A += 8*sda;
3519 B += 8*sdb;
3520 }
3521 #endif
3522 for(; ii<m-3; ii+=4)
3523 {
3524 kernel_dgecpsc_4_2_lib4(1, ii, alpha, A, sda, B);
3525 A += 4*sda;
3526 B += 4*sdb;
3527 }
3528 // clean up at the end
3529 if(ii<m)
3530 {
3531 if(m-ii==1)
3532 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+2, B);
3533 else if(m-ii==2)
3534 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+2, B);
3535 else // if(m-ii==3)
3536 kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B);
3537 }
3538 }
3539 // skip 3 elements of A
3540 else // if(offA==(offB+3)%bs)
3541 {
3542 ii = 0;
3543 // clean up at the beginning
3544 mna = (4-offB)%bs;
3545 if(mna>0)
3546 {
3547 if(m<mna)
3548 {
3549 if(m==1)
3550 {
3551 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3552 return;
3553 }
3554 else // if(m==2 && mna==3)
3555 {
3556 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3557 return;
3558 }
3559 }
3560 if(mna==1)
3561 {
3562 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3563 // A += 4*sda;
3564 B += 4*sdb;
3565 ii += 1;
3566 }
3567 else if(mna==2)
3568 {
3569 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3570 // A += 4*sda;
3571 B += 4*sdb;
3572 ii += 2;
3573 }
3574 else // if(mna==3)
3575 {
3576 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
3577 // A += 4*sda;
3578 B += 4*sdb;
3579 ii += 3;
3580 }
3581 }
3582 // main loop
3583 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3584 for(; ii<m-7; ii+=8)
3585 {
3586 kernel_dgecpsc_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
3587 A += 8*sda;
3588 B += 8*sdb;
3589 }
3590 #endif
3591 for(; ii<m-3; ii+=4)
3592 {
3593 kernel_dgecpsc_4_3_lib4(1, ii, alpha, A, sda, B);
3594 A += 4*sda;
3595 B += 4*sdb;
3596 }
3597 // clean up at the end
3598 if(ii<m)
3599 {
3600 if(m-ii==1)
3601 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+3, B);
3602 else if(m-ii==2)
3603 kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B);
3604 else // if(m-ii==3)
3605 kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B);
3606 }
3607 }
3608
3609 /* } */
3610 return;
3611 }
3612
3613
3614
3615 // copy and scale a generic strmat into a generic strmat
3616 void blasfeo_dgecpsc(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
3617 {
3618
3619 // invalidate stored inverse diagonal
3620 sB->use_dA = 0;
3621
3622 const int bs = 4;
3623
3624 // extract dimension
3625 int sda = sA->cn;
3626 int sdb = sB->cn;
3627
3628 // extract submatrix
3629 double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3630 double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;
3631
3632 if(m<=0 || n<=0)
3633 return;
3634
3635 int mna, ii;
3636
3637 int offA = ai%bs;
3638 int offB = bi%bs;
3639
3640 // A at the beginning of the block
3641 A -= offA;
3642
3643 	// B at the beginning of the block
3644 B -= offB;
3645
3646 // same alignment
3647 if(offA==offB)
3648 {
3649 ii = 0;
3650 // clean up at the beginning
3651 mna = (4-offB)%bs;
3652 if(mna>0)
3653 {
3654 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3655 {
3656 if(m==1)
3657 {
3658 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3659 return;
3660 }
3661 else //if(m==2 && mna==3)
3662 {
3663 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3664 return;
3665 }
3666 }
3667 if(mna==1)
3668 {
3669 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3670 A += 4*sda;
3671 B += 4*sdb;
3672 ii += 1;
3673 }
3674 else if(mna==2)
3675 {
3676 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3677 A += 4*sda;
3678 B += 4*sdb;
3679 ii += 2;
3680 }
3681 else // if(mna==3)
3682 {
3683 kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
3684 A += 4*sda;
3685 B += 4*sdb;
3686 ii += 3;
3687 }
3688 }
3689 // main loop
3690 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3691 for(; ii<m-7; ii+=8)
3692 {
3693 kernel_dgecpsc_8_0_lib4(0, n, alpha, A, sda, B, sdb);
3694 A += 8*sda;
3695 B += 8*sdb;
3696 }
3697 #endif
3698 for(; ii<m-3; ii+=4)
3699 {
3700 kernel_dgecpsc_4_0_lib4(0, n, alpha, A, B);
3701 A += 4*sda;
3702 B += 4*sdb;
3703 }
3704 // clean up at the end
3705 if(ii<m)
3706 {
3707 if(m-ii==1)
3708 kernel_dgecpsc_1_0_lib4(0, n, alpha, A, B);
3709 else if(m-ii==2)
3710 kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B);
3711 else // if(m-ii==3)
3712 kernel_dgecpsc_3_0_lib4(0, n, alpha, A, B);
3713 }
3714 }
3715 // skip one element of A
3716 else if(offA==(offB+1)%bs)
3717 {
3718 ii = 0;
3719 // clean up at the beginning
3720 mna = (4-offB)%bs;
3721 if(mna>0)
3722 {
3723 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3724 {
3725 if(m==1)
3726 {
3727 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3728 return;
3729 }
3730 else //if(m==2 && mna==3)
3731 {
3732 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3733 return;
3734 }
3735 }
3736 if(mna==1)
3737 {
3738 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3739 //A += 4*sda;
3740 B += 4*sdb;
3741 ii += 1;
3742 }
3743 else if(mna==2)
3744 {
3745 kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+2);
3746 A += 4*sda;
3747 B += 4*sdb;
3748 ii += 2;
3749 }
3750 else // if(mna==3)
3751 {
3752 kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B+1);
3753 A += 4*sda;
3754 B += 4*sdb;
3755 ii += 3;
3756 }
3757 }
3758 // main loop
3759 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3760 for( ; ii<m-7; ii+=8)
3761 {
3762 kernel_dgecpsc_8_1_lib4(0, n, alpha, A, sda, B, sdb);
3763 A += 8*sda;
3764 B += 8*sdb;
3765 }
3766 #endif
3767 for( ; ii<m-3; ii+=4)
3768 {
3769 kernel_dgecpsc_4_1_lib4(0, n, alpha, A, sda, B);
3770 A += 4*sda;
3771 B += 4*sdb;
3772 }
3773 // clean up at the end
3774 if(ii<m)
3775 {
3776 if(m-ii==1)
3777 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B);
3778 else if(m-ii==2)
3779 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+1, B);
3780 else // if(m-ii==3)
3781 kernel_dgecpsc_3_0_lib4(0, n, alpha, A+1, B);
3782 }
3783 }
3784 // skip 2 elements of A
3785 else if(offA==(offB+2)%bs)
3786 {
3787 ii = 0;
3788 // clean up at the beginning
3789 mna = (4-offB)%bs;
3790 if(mna>0)
3791 {
3792 if(m<mna)
3793 {
3794 if(m==1)
3795 {
3796 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3797 return;
3798 }
3799 else // if(m==2 && mna==3)
3800 {
3801 kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B+1);
3802 return;
3803 }
3804 }
3805 if(mna==1)
3806 {
3807 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+1, B+3);
3808 // A += 4*sda;
3809 B += 4*sdb;
3810 ii += 1;
3811 }
3812 else if(mna==2)
3813 {
3814 kernel_dgecpsc_2_0_lib4(0, n, alpha, A, B+2);
3815 // A += 4*sda;
3816 B += 4*sdb;
3817 ii += 2;
3818 }
3819 else // if(mna==3)
3820 {
3821 kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B+1);
3822 A += 4*sda;
3823 B += 4*sdb;
3824 ii += 3;
3825 }
3826 }
3827 // main loop
3828 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3829 for(; ii<m-7; ii+=8)
3830 {
3831 kernel_dgecpsc_8_2_lib4(0, n, alpha, A, sda, B, sdb);
3832 A += 8*sda;
3833 B += 8*sdb;
3834 }
3835 #endif
3836 for(; ii<m-3; ii+=4)
3837 {
3838 kernel_dgecpsc_4_2_lib4(0, n, alpha, A, sda, B);
3839 A += 4*sda;
3840 B += 4*sdb;
3841 }
3842 // clean up at the end
3843 if(ii<m)
3844 {
3845 if(m-ii==1)
3846 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+2, B);
3847 else if(m-ii==2)
3848 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+2, B);
3849 else // if(m-ii==3)
3850 kernel_dgecpsc_3_2_lib4(0, n, alpha, A, sda, B);
3851 }
3852 }
3853 // skip 3 elements of A
3854 else // if(offA==(offB+3)%bs)
3855 {
3856 ii = 0;
3857 // clean up at the beginning
3858 mna = (4-offB)%bs;
3859 if(mna>0)
3860 {
3861 if(m<mna)
3862 {
3863 if(m==1)
3864 {
3865 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3866 return;
3867 }
3868 else // if(m==2 && mna==3)
3869 {
3870 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3871 return;
3872 }
3873 }
3874 if(mna==1)
3875 {
3876 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+offA, B+offB);
3877 // A += 4*sda;
3878 B += 4*sdb;
3879 ii += 1;
3880 }
3881 else if(mna==2)
3882 {
3883 kernel_dgecpsc_2_0_lib4(0, n, alpha, A+offA, B+offB);
3884 // A += 4*sda;
3885 B += 4*sdb;
3886 ii += 2;
3887 }
3888 else // if(mna==3)
3889 {
3890 kernel_dgecpsc_3_0_lib4(0, n, alpha, A+offA, B+offB);
3891 // A += 4*sda;
3892 B += 4*sdb;
3893 ii += 3;
3894 }
3895 }
3896 // main loop
3897 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3898 for(; ii<m-7; ii+=8)
3899 {
3900 kernel_dgecpsc_8_3_lib4(0, n, alpha, A, sda, B, sdb);
3901 A += 8*sda;
3902 B += 8*sdb;
3903 }
3904 #endif
3905 for(; ii<m-3; ii+=4)
3906 {
3907 kernel_dgecpsc_4_3_lib4(0, n, alpha, A, sda, B);
3908 A += 4*sda;
3909 B += 4*sdb;
3910 }
3911 // clean up at the end
3912 if(ii<m)
3913 {
3914 if(m-ii==1)
3915 kernel_dgecpsc_1_0_lib4(0, n, alpha, A+3, B);
3916 else if(m-ii==2)
3917 kernel_dgecpsc_2_3_lib4(0, n, alpha, A, sda, B);
3918 else // if(m-ii==3)
3919 kernel_dgecpsc_3_3_lib4(0, n, alpha, A, sda, B);
3920 }
3921 }
3922 return;
3923 }
3924
3925
3926
3927 // copy and scale a lower triangular strmat into a lower triangular strmat
3928 void blasfeo_dtrcpsc_l(int m, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj)
3929 {
3930
3931 // invalidate stored inverse diagonal
3932 sB->use_dA = 0;
3933
3934 const int bs = 4;
3935 int sda = sA->cn;
3936 double *A = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3937 int sdb = sB->cn;
3938 double *B = sB->pA + bi/bs*bs*sdb + bi%bs + bj*bs;
3939
3940 /* dtrcp_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); */
3941 /* // copies a lower triangular packed matrix into a lower triangular packed matrix */
3942 /* void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb) */
3943 /* { */
3944
3945 if(m<=0)
3946 return;
3947
3948 int n = m;
3949
3950 int mna, ii;
3951
3952 int offA = ai%bs;
3953 int offB = bi%bs;
3954
3955 // A at the beginning of the block
3956 A -= offA;
3957
3958 	// B at the beginning of the block
3959 B -= offB;
3960
3961 // same alignment
3962 if(offA==offB)
3963 {
3964 ii = 0;
3965 // clean up at the beginning
3966 mna = (4-offB)%bs;
3967 if(mna>0)
3968 {
3969 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3970 {
3971 if(m==1)
3972 {
3973 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3974 return;
3975 }
3976 else //if(m==2 && mna==3)
3977 {
3978 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3979 return;
3980 }
3981 }
3982 if(mna==1)
3983 {
3984 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
3985 A += 4*sda;
3986 B += 4*sdb;
3987 ii += 1;
3988 }
3989 else if(mna==2)
3990 {
3991 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
3992 A += 4*sda;
3993 B += 4*sdb;
3994 ii += 2;
3995 }
3996 else // if(mna==3)
3997 {
3998 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
3999 A += 4*sda;
4000 B += 4*sdb;
4001 ii += 3;
4002 }
4003 }
4004 // main loop
4005 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4006 for(; ii<m-7; ii+=8)
4007 {
4008 kernel_dgecpsc_8_0_lib4(1, ii, alpha, A, sda, B, sdb);
4009 A += 8*sda;
4010 B += 8*sdb;
4011 }
4012 #endif
4013 for(; ii<m-3; ii+=4)
4014 {
4015 kernel_dgecpsc_4_0_lib4(1, ii, alpha, A, B);
4016 A += 4*sda;
4017 B += 4*sdb;
4018 }
4019 // clean up at the end
4020 if(ii<m)
4021 {
4022 if(m-ii==1)
4023 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A, B);
4024 else if(m-ii==2)
4025 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B);
4026 else // if(m-ii==3)
4027 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A, B);
4028 }
4029 }
4030 // skip one element of A
4031 else if(offA==(offB+1)%bs)
4032 {
4033 ii = 0;
4034 // clean up at the beginning
4035 mna = (4-offB)%bs;
4036 if(mna>0)
4037 {
4038 if(m<mna) // mna<=3 ==> m = { 1, 2 }
4039 {
4040 if(m==1)
4041 {
4042 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
4043 return;
4044 }
4045 else //if(m==2 && mna==3)
4046 {
4047 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
4048 return;
4049 }
4050 }
4051 if(mna==1)
4052 {
4053 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
4054 //A += 4*sda;
4055 B += 4*sdb;
4056 ii += 1;
4057 }
4058 else if(mna==2)
4059 {
4060 kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+2);
4061 A += 4*sda;
4062 B += 4*sdb;
4063 ii += 2;
4064 }
4065 else // if(mna==3)
4066 {
4067 kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B+1);
4068 A += 4*sda;
4069 B += 4*sdb;
4070 ii += 3;
4071 }
4072 }
4073 // main loop
4074 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4075 for( ; ii<m-7; ii+=8)
4076 {
4077 kernel_dgecpsc_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
4078 A += 8*sda;
4079 B += 8*sdb;
4080 }
4081 #endif
4082 for( ; ii<m-3; ii+=4)
4083 {
4084 kernel_dgecpsc_4_1_lib4(1, ii, alpha, A, sda, B);
4085 A += 4*sda;
4086 B += 4*sdb;
4087 }
4088 // clean up at the end
4089 if(ii<m)
4090 {
4091 if(m-ii==1)
4092 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B);
4093 else if(m-ii==2)
4094 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+1, B);
4095 else // if(m-ii==3)
4096 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+1, B);
4097 }
4098 }
4099 // skip 2 elements of A
4100 else if(offA==(offB+2)%bs)
4101 {
4102 ii = 0;
4103 // clean up at the beginning
4104 mna = (4-offB)%bs;
4105 if(mna>0)
4106 {
4107 if(m<mna)
4108 {
4109 if(m==1)
4110 {
4111 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
4112 return;
4113 }
4114 else // if(m==2 && mna==3)
4115 {
4116 kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B+1);
4117 return;
4118 }
4119 }
4120 if(mna==1)
4121 {
4122 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+1, B+3);
4123 // A += 4*sda;
4124 B += 4*sdb;
4125 ii += 1;
4126 }
4127 else if(mna==2)
4128 {
4129 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A, B+2);
4130 // A += 4*sda;
4131 B += 4*sdb;
4132 ii += 2;
4133 }
4134 else // if(mna==3)
4135 {
4136 kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B+1);
4137 A += 4*sda;
4138 B += 4*sdb;
4139 ii += 3;
4140 }
4141 }
4142 // main loop
4143 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4144 for(; ii<m-7; ii+=8)
4145 {
4146 kernel_dgecpsc_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
4147 A += 8*sda;
4148 B += 8*sdb;
4149 }
4150 #endif
4151 for(; ii<m-3; ii+=4)
4152 {
4153 kernel_dgecpsc_4_2_lib4(1, ii, alpha, A, sda, B);
4154 A += 4*sda;
4155 B += 4*sdb;
4156 }
4157 // clean up at the end
4158 if(ii<m)
4159 {
4160 if(m-ii==1)
4161 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+2, B);
4162 else if(m-ii==2)
4163 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+2, B);
4164 else // if(m-ii==3)
4165 kernel_dgecpsc_3_2_lib4(1, ii, alpha, A, sda, B);
4166 }
4167 }
4168 // skip 3 elements of A
4169 else // if(offA==(offB+3)%bs)
4170 {
4171 ii = 0;
4172 // clean up at the beginning
4173 mna = (4-offB)%bs;
4174 if(mna>0)
4175 {
4176 if(m<mna)
4177 {
4178 if(m==1)
4179 {
4180 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
4181 return;
4182 }
4183 else // if(m==2 && mna==3)
4184 {
4185 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
4186 return;
4187 }
4188 }
4189 if(mna==1)
4190 {
4191 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+offA, B+offB);
4192 // A += 4*sda;
4193 B += 4*sdb;
4194 ii += 1;
4195 }
4196 else if(mna==2)
4197 {
4198 kernel_dgecpsc_2_0_lib4(1, ii, alpha, A+offA, B+offB);
4199 // A += 4*sda;
4200 B += 4*sdb;
4201 ii += 2;
4202 }
4203 else // if(mna==3)
4204 {
4205 kernel_dgecpsc_3_0_lib4(1, ii, alpha, A+offA, B+offB);
4206 // A += 4*sda;
4207 B += 4*sdb;
4208 ii += 3;
4209 }
4210 }
4211 // main loop
4212 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4213 for(; ii<m-7; ii+=8)
4214 {
4215 kernel_dgecpsc_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
4216 A += 8*sda;
4217 B += 8*sdb;
4218 }
4219 #endif
4220 for(; ii<m-3; ii+=4)
4221 {
4222 kernel_dgecpsc_4_3_lib4(1, ii, alpha, A, sda, B);
4223 A += 4*sda;
4224 B += 4*sdb;
4225 }
4226 // clean up at the end
4227 if(ii<m)
4228 {
4229 if(m-ii==1)
4230 kernel_dgecpsc_1_0_lib4(1, ii, alpha, A+3, B);
4231 else if(m-ii==2)
4232 kernel_dgecpsc_2_3_lib4(1, ii, alpha, A, sda, B);
4233 else // if(m-ii==3)
4234 kernel_dgecpsc_3_3_lib4(1, ii, alpha, A, sda, B);
4235 }
4236 }
4237
4238 /* } */
4239 return;
4240 }
4241
4242
4243
4244 // scale a generic strmat
4245 void blasfeo_dgesc(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
4246 {
4247 // invalidate stored inverse diagonal
4248 sA->use_dA = 0;
4249
4250 blasfeo_dgecpsc(m, n, alpha, sA, ai, aj, sA, ai, aj);
4251 }
4252
4253
4254
4255 // scale a triangular strmat
4256 void blasfeo_dtrsc_l(int m, double alpha, struct blasfeo_dmat *sA, int ai, int aj)
4257 {
4258 // invalidate stored inverse diagonal
4259 sA->use_dA = 0;
4260
4261 blasfeo_dtrcpsc_l(m, alpha, sA, ai, aj, sA, ai, aj);
4262 }
4263
4264
4265
4266 // copy a strvec into a strvec
4267 void blasfeo_dveccp(int m, struct blasfeo_dvec *sa, int ai, struct blasfeo_dvec *sc, int ci)
4268 {
4269 double *pa = sa->pa + ai;
4270 double *pc = sc->pa + ci;
4271 int ii;
4272 ii = 0;
4273 for(; ii<m-3; ii+=4)
4274 {
4275 pc[ii+0] = pa[ii+0];
4276 pc[ii+1] = pa[ii+1];
4277 pc[ii+2] = pa[ii+2];
4278 pc[ii+3] = pa[ii+3];
4279 }
4280 for(; ii<m; ii++)
4281 {
4282 pc[ii+0] = pa[ii+0];
4283 }
4284 return;
4285 }
4286
4287
4288
4289 // scale a strvec
4290 void blasfeo_dvecsc(int m, double alpha, struct blasfeo_dvec *sa, int ai)
4291 {
4292 double *pa = sa->pa + ai;
4293 int ii;
4294 ii = 0;
4295 for(; ii<m-3; ii+=4)
4296 {
4297 pa[ii+0] *= alpha;
4298 pa[ii+1] *= alpha;
4299 pa[ii+2] *= alpha;
4300 pa[ii+3] *= alpha;
4301 }
4302 for(; ii<m; ii++)
4303 {
4304 pa[ii+0] *= alpha;
4305 }
4306 return;
4307 }
4308
4309
4310
4311 // copy and scale a strvec into a strvec
4312 void blasfeo_dveccpsc(int m, double alpha, struct blasfeo_dvec *sa, int ai, struct blasfeo_dvec *sc, int ci)
4313 {
4314 double *pa = sa->pa + ai;
4315 double *pc = sc->pa + ci;
4316 int ii;
4317 ii = 0;
4318 for(; ii<m-3; ii+=4)
4319 {
4320 pc[ii+0] = alpha*pa[ii+0];
4321 pc[ii+1] = alpha*pa[ii+1];
4322 pc[ii+2] = alpha*pa[ii+2];
4323 pc[ii+3] = alpha*pa[ii+3];
4324 }
4325 for(; ii<m; ii++)
4326 {
4327 pc[ii+0] = alpha*pa[ii+0];
4328 }
4329 return;
4330 }
4331
4332
4333
4334 // scale and add a generic strmat into a generic strmat
4335 void blasfeo_dgead(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
4336 {
4337 // invalidate stored inverse diagonal
4338 sC->use_dA = 0;
4339
4340 const int bs = 4;
4341 int sda = sA->cn;
4342 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
4343 int sdc = sC->cn;
4344 double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
4345 dgead_lib(m, n, alpha, ai%bs, pA, sda, ci%bs, pC, sdc);
4346 return;
4347 }
4348
4349
4350
4351 // copy and transpose a generic strmat into a generic strmat
4352 void blasfeo_dgetr(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
4353 {
4354 // invalidate stored inverse diagonal
4355 sC->use_dA = 0;
4356
4357 const int bs = 4;
4358 int sda = sA->cn;
4359 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
4360 int sdc = sC->cn;
4361 double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
4362 dgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
4363 return;
4364 }
4365
4366
4367
4368 // copy and transpose a lower triangular strmat into an upper triangular strmat
4369 void blasfeo_dtrtr_l(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
4370 {
4371 // invalidate stored inverse diagonal
4372 sC->use_dA = 0;
4373
4374 const int bs = 4;
4375 int sda = sA->cn;
4376 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
4377 int sdc = sC->cn;
4378 double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
4379 dtrtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
4380 return;
4381 }
4382
4383
4384
4385 // copy and transpose an upper triangular strmat into a lower triangular strmat
4386 void blasfeo_dtrtr_u(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sC, int ci, int cj)
4387 {
4388 // invalidate stored inverse diagonal
4389 sC->use_dA = 0;
4390
4391 const int bs = 4;
4392 int sda = sA->cn;
4393 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
4394 int sdc = sC->cn;
4395 double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
4396 dtrtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
4397 return;
4398 }
4399
4400
4401
4402 // insert a strvec into the diagonal of a strmat, sparse formulation
4403 void blasfeo_ddiain_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
4404 {
4405 // invalidate stored inverse diagonal
4406 sD->use_dA = 0;
4407
4408 const int bs = 4;
4409 double *x = sx->pa + xi;
4410 int sdd = sD->cn;
4411 double *pD = sD->pA;
4412 int ii, jj;
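	// sparse update: idx[jj] selects the diagonal entry (relative to di, dj) that is set to alpha*x[jj]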
4413 for(jj=0; jj<kmax; jj++)
4414 {
4415 ii = idx[jj];
4416 pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
4417 }
4418 return;
4419 }
4420
4421
4422
4423 // extract a vector from diagonal
4424 void blasfeo_ddiaex(int kmax, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi)
4425 {
4426 const int bs = 4;
4427 int sda = sA->cn;
4428 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
4429 double *x = sx->pa + xi;
4430 int offsetA = ai%bs;
4431
4432 int kna = (bs-offsetA%bs)%bs;
4433 kna = kmax<kna ? kmax : kna;
4434
4435 int jj, ll;
4436
4437 if(kna>0)
4438 {
4439 for(ll=0; ll<kna; ll++)
4440 {
4441 x[ll] = alpha*pA[ll+bs*ll];
4442 }
4443 pA += kna + bs*(sda-1) + kna*bs;
4444 x += kna;
4445 kmax -= kna;
4446 }
4447 for(jj=0; jj<kmax-3; jj+=4)
4448 {
4449 x[jj+0] = alpha*pA[jj*sda+(jj+0)*bs+0];
4450 x[jj+1] = alpha*pA[jj*sda+(jj+1)*bs+1];
4451 x[jj+2] = alpha*pA[jj*sda+(jj+2)*bs+2];
4452 x[jj+3] = alpha*pA[jj*sda+(jj+3)*bs+3];
4453 }
4454 for(ll=0; ll<kmax-jj; ll++)
4455 {
4456 x[jj+ll] = alpha*pA[jj*sda+(jj+ll)*bs+ll];
4457 }
4458 return;
4459 }
4460
4461
4462
4463 // extract the diagonal of a strmat to a strvec, sparse formulation
4464 void blasfeo_ddiaex_sp(int kmax, double alpha, int *idx, struct blasfeo_dmat *sD, int di, int dj, struct blasfeo_dvec *sx, int xi)
4465 {
4466 const int bs = 4;
4467 double *x = sx->pa + xi;
4468 int sdd = sD->cn;
4469 double *pD = sD->pA;
4470 int ii, jj;
4471 for(jj=0; jj<kmax; jj++)
4472 {
4473 ii = idx[jj];
4474 x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
4475 }
4476 return;
4477 }
4478
4479
4480
4481 // add a vector to diagonal
4482 void blasfeo_ddiaad(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dmat *sA, int ai, int aj)
4483 {
4484
4485 // invalidate stored inverse diagonal
4486 sA->use_dA = 0;
4487
4488 const int bs = 4;
4489 int sda = sA->cn;
4490 double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
4491 double *x = sx->pa + xi;
4492 int offsetA = ai%bs;
4493
4494 int kna = (bs-offsetA%bs)%bs;
4495 kna = kmax<kna ? kmax : kna;
4496
4497 int jj, ll;
4498
4499 if(kna>0)
4500 {
4501 for(ll=0; ll<kna; ll++)
4502 {
4503 pA[ll+bs*ll] += alpha*x[ll];
4504 }
4505 pA += kna + bs*(sda-1) + kna*bs;
4506 x += kna;
4507 kmax -= kna;
4508 }
4509 for(jj=0; jj<kmax-3; jj+=4)
4510 {
4511 pA[jj*sda+(jj+0)*bs+0] += alpha*x[jj+0];
4512 pA[jj*sda+(jj+1)*bs+1] += alpha*x[jj+1];
4513 pA[jj*sda+(jj+2)*bs+2] += alpha*x[jj+2];
4514 pA[jj*sda+(jj+3)*bs+3] += alpha*x[jj+3];
4515 }
4516 for(ll=0; ll<kmax-jj; ll++)
4517 {
4518 pA[jj*sda+(jj+ll)*bs+ll] += alpha*x[jj+ll];
4519 }
4520 return;
4521 }
4522
4523
4524
4525 // add scaled strvec to diagonal of strmat, sparse formulation
4526 void blasfeo_ddiaad_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
4527 {
4528
4529 // invalidate stored inverse diagonal
4530 sD->use_dA = 0;
4531
4532 const int bs = 4;
4533 double *x = sx->pa + xi;
4534 int sdd = sD->cn;
4535 double *pD = sD->pA;
4536 int ii, jj;
4537 for(jj=0; jj<kmax; jj++)
4538 {
4539 ii = idx[jj];
4540 pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
4541 }
4542 return;
4543 }
4544
4545
4546
4547 // add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
4548 void blasfeo_ddiaadin_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sy, int yi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
4549 {
4550
4551 // invalidate stored inverse diagonal
4552 sD->use_dA = 0;
4553
4554 const int bs = 4;
4555 double *x = sx->pa + xi;
4556 double *y = sy->pa + yi;
4557 int sdd = sD->cn;
4558 double *pD = sD->pA;
4559 int ii, jj;
4560 for(jj=0; jj<kmax; jj++)
4561 {
4562 ii = idx[jj];
4563 pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
4564 }
4565 return;
4566 }
4567
4568
4569
4570 // add scaled strvec to row of strmat, sparse formulation
4571 void blasfeo_drowad_sp(int kmax, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dmat *sD, int di, int dj)
4572 {
4573
4574 // invalidate stored inverse diagonal
4575 sD->use_dA = 0;
4576
4577 const int bs = 4;
4578 double *x = sx->pa + xi;
4579 int sdd = sD->cn;
4580 double *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
4581 drowad_libsp(kmax, idx, alpha, x, pD);
4582 return;
4583 }
4584
4585
4586
4587 // add scaled strvec to strvec, sparse formulation
4588 void blasfeo_dvecad_sp(int m, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dvec *sz, int zi)
4589 {
4590 double *x = sx->pa + xi;
4591 double *z = sz->pa + zi;
4592 int ii;
4593 for(ii=0; ii<m; ii++)
4594 z[idx[ii]] += alpha * x[ii];
4595 return;
4596 }
4597
4598
4599
4600 // insert scaled strvec to strvec, sparse formulation
4601 void blasfeo_dvecin_sp(int m, double alpha, struct blasfeo_dvec *sx, int xi, int *idx, struct blasfeo_dvec *sz, int zi)
4602 {
4603 double *x = sx->pa + xi;
4604 double *z = sz->pa + zi;
4605 int ii;
4606 for(ii=0; ii<m; ii++)
4607 z[idx[ii]] = alpha * x[ii];
4608 return;
4609 }
4610
4611
4612
4613 // extract scaled strvec to strvec, sparse formulation
4614 void blasfeo_dvecex_sp(int m, double alpha, int *idx, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
4615 {
4616 double *x = sx->pa + xi;
4617 double *z = sz->pa + zi;
4618 int ii;
4619 for(ii=0; ii<m; ii++)
4620 z[ii] = alpha * x[idx[ii]];
4621 return;
4622 }
4623
4624
4625
4626 // clip strvec between two strvec
4627 void blasfeo_dveccl(int m, struct blasfeo_dvec *sxm, int xim, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sxp, int xip, struct blasfeo_dvec *sz, int zi)
4628 {
4629
4630 double *xm = sxm->pa + xim;
4631 double *x = sx->pa + xi;
4632 double *xp = sxp->pa + xip;
4633 double *z = sz->pa + zi;
4634
4635 int ii;
4636
4637 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4638 double d0;
4639
4640 __m256d
4641 xm0, x0, xp0, z0, tmp0, tmp1, ones, mones, mask1, mask2;
4642
4643 ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
4644 mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
4645 mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
4646
4647 for(ii=0; ii<m-3; ii+=4)
4648 {
4649 x0 = _mm256_loadu_pd( &x[ii] );
4650 xp0 = _mm256_loadu_pd( &xp[ii] );
4651 xm0 = _mm256_loadu_pd( &xm[ii] );
4652 tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
4653 tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
4654 z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
4655 z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
4656 _mm256_storeu_pd( &z[ii], z0 );
4657 }
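	// tail store: mask1 holds {0.5, 1.5, 2.5, 3.5}; subtracting the remaining length m-ii
	// leaves the sign bit set exactly in the first m-ii lanes, which maskstore then writes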
4658 if(ii<m)
4659 {
4660 d0 = (double) m-ii;
4661 mask2 = _mm256_broadcast_sd( &d0 );
4662 mask2 = _mm256_sub_pd( mask1, mask2 );
4663 x0 = _mm256_loadu_pd( &x[ii] );
4664 xp0 = _mm256_loadu_pd( &xp[ii] );
4665 xm0 = _mm256_loadu_pd( &xm[ii] );
4666 tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
4667 tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
4668 z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
4669 z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
4670 _mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
4671 }
4672 #else
4673 for(ii=0; ii<m; ii++)
4674 {
4675 if(x[ii]>=xp[ii])
4676 {
4677 z[ii] = xp[ii];
4678 }
4679 else if(x[ii]<=xm[ii])
4680 {
4681 z[ii] = xm[ii];
4682 }
4683 else
4684 {
4685 z[ii] = x[ii];
4686 }
4687 }
4688 #endif
4689
4690 return;
4691
4692 }
4693
4694
4695
4696 // clip strvec between two strvec, with mask
4697 void blasfeo_dveccl_mask(int m, struct blasfeo_dvec *sxm, int xim, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sxp, int xip, struct blasfeo_dvec *sz, int zi, struct blasfeo_dvec *sm, int mi)
4698 {
4699
4700 double *xm = sxm->pa + xim;
4701 double *x = sx->pa + xi;
4702 double *xp = sxp->pa + xip;
4703 double *z = sz->pa + zi;
4704 double *mask = sm->pa + mi;
4705
4706 int ii;
4707
4708 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4709 double d0;
4710
4711 __m256d
4712 xm0, x0, xp0, z0, mask0, tmp0, tmp1, ones, mones, mask1, mask2;
4713
4714 ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
4715 mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
4716 mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
4717
4718 for(ii=0; ii<m-3; ii+=4)
4719 {
4720 mask0 = _mm256_setzero_pd();
4721 x0 = _mm256_loadu_pd( &x[ii] );
4722 xp0 = _mm256_loadu_pd( &xp[ii] );
4723 xm0 = _mm256_loadu_pd( &xm[ii] );
4724 tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
4725 tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
4726 z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
4727 z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
4728 mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
4729 mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
4730 _mm256_storeu_pd( &z[ii], z0 );
4731 _mm256_storeu_pd( &mask[ii], mask0 );
4732 }
4733 if(ii<m)
4734 {
4735 d0 = (double) m-ii;
4736 mask2 = _mm256_broadcast_sd( &d0 );
4737 mask2 = _mm256_sub_pd( mask1, mask2 );
4738 mask0 = _mm256_setzero_pd();
4739 x0 = _mm256_loadu_pd( &x[ii] );
4740 xp0 = _mm256_loadu_pd( &xp[ii] );
4741 xm0 = _mm256_loadu_pd( &xm[ii] );
4742 tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
4743 tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
4744 z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
4745 z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
4746 mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
4747 mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
4748 _mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
4749 _mm256_maskstore_pd( &mask[ii], _mm256_castpd_si256( mask2 ), mask0 );
4750 }
4751 #else
4752 for(ii=0; ii<m; ii++)
4753 {
4754 if(x[ii]>=xp[ii])
4755 {
4756 z[ii] = xp[ii];
4757 mask[ii] = 1.0;
4758 }
4759 else if(x[ii]<=xm[ii])
4760 {
4761 z[ii] = xm[ii];
4762 mask[ii] = -1.0;
4763 }
4764 else
4765 {
4766 z[ii] = x[ii];
4767 mask[ii] = 0.0;
4768 }
4769 }
4770 #endif
4771
4772 return;
4773
4774 }
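// Note on the mask output of blasfeo_dveccl_mask above (added remark): on return mask[i]
// is 1.0 where the clip was active at the upper bound (x[i] >= xp[i]), -1.0 where it was
// active at the lower bound (x[i] <= xm[i]) and 0.0 where x[i] was left unchanged; the AVX
// path produces the same values by blending the ones/mones constants with the comparison
// results. This -1/0/1 encoding is also what the vectorized path of blasfeo_dvecze below
// relies on.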


// zero out strvec entries selected by a mask (copy v where mask==0, write zero elsewhere)
void blasfeo_dvecze(int m, struct blasfeo_dvec *sm, int mi, struct blasfeo_dvec *sv, int vi, struct blasfeo_dvec *se, int ei)
	{
	double *mask = sm->pa + mi;
	double *v = sv->pa + vi;
	double *e = se->pa + ei;

	int ii;

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	double d0;

	__m256d
		mask0, mask1, mask2, mask3, fives, zeros, e0, v0;

	fives = _mm256_set_pd( 0.5, 0.5, 0.5, 0.5 );
	zeros = _mm256_setzero_pd();
	mask3 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );

	for(ii=0; ii<m-3; ii+=4)
		{
		v0 = _mm256_loadu_pd( &v[ii] );
		mask0 = _mm256_loadu_pd( &mask[ii] );
		mask1 = mask0;
		mask0 = _mm256_sub_pd( mask0, fives);
		mask1 = _mm256_add_pd( mask1, fives);
		mask0 = _mm256_xor_pd( mask0, mask1);
		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
		_mm256_storeu_pd( &e[ii], e0 );
		}
	if(ii<m)
		{
		d0 = (double) m-ii;
		mask2 = _mm256_broadcast_sd( &d0 );
		mask2 = _mm256_sub_pd( mask3, mask2 );
		v0 = _mm256_loadu_pd( &v[ii] );
		mask0 = _mm256_loadu_pd( &mask[ii] );
		mask1 = mask0;
		mask0 = _mm256_sub_pd( mask0, fives);
		mask1 = _mm256_add_pd( mask1, fives);
		mask0 = _mm256_xor_pd( mask0, mask1);
		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
		_mm256_maskstore_pd( &e[ii], _mm256_castpd_si256( mask2 ), e0 );
		}
#else
	for(ii=0; ii<m; ii++)
		{
		if(mask[ii]==0)
			{
			e[ii] = v[ii];
			}
		else
			{
			e[ii] = 0;
			}
		}
#endif

	}
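// Note on the vectorized path of blasfeo_dvecze above (added remark): for mask values in
// {-1.0, 0.0, 1.0}, as produced by blasfeo_dveccl_mask, the two shifted copies mask-0.5 and
// mask+0.5 have opposite signs only when mask == 0.0, so their xor has the sign bit set
// exactly there and _mm256_blendv_pd keeps v[i] where mask[i] == 0.0 and writes 0.0
// elsewhere, matching the scalar branch.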


// compute inf norm of strvec
void blasfeo_dvecnrm_inf(int m, struct blasfeo_dvec *sx, int xi, double *ptr_norm)
	{
	int ii;
	double *x = sx->pa + xi;
	double norm = 0.0;
	double tmp;
	for(ii=0; ii<m; ii++)
		{
#ifdef USE_C99_MATH
		norm = fmax(norm, fabs(x[ii]));
#else
		tmp = fabs(x[ii]);
		norm = tmp>norm ? tmp : norm;
#endif
		}
	*ptr_norm = norm;
	return;
	}
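// Worked example (added remark): for m = 3 and x = { 1.0, -3.5, 2.0 } the loop above
// returns *ptr_norm = 3.5, i.e. max_i |x[i]|; for m <= 0 the loop never runs and the
// returned norm is 0.0.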



// permute elements of a vector struct
void blasfeo_dvecpe(int kmax, int *ipiv, struct blasfeo_dvec *sx, int xi)
	{
	int ii;
	double tmp;
	double *x = sx->pa + xi;
	for(ii=0; ii<kmax; ii++)
		{
		if(ipiv[ii]!=ii)
			{
			tmp = x[ipiv[ii]];
			x[ipiv[ii]] = x[ii];
			x[ii] = tmp;
			}
		}
	return;
	}
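// Note (added remark): ipiv holds 0-based pivot indices applied as successive swaps in
// increasing order of ii, in the style of an LU row-pivot vector. Worked example with a
// made-up ipiv = {1, 2, 2} and x = {a, b, c}: step 0 swaps x[0],x[1] -> {b, a, c};
// step 1 swaps x[1],x[2] -> {b, c, a}; step 2 is a no-op since ipiv[2] == 2.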



// inverse permute elements of a vector struct
void blasfeo_dvecpei(int kmax, int *ipiv, struct blasfeo_dvec *sx, int xi)
	{
	int ii;
	double tmp;
	double *x = sx->pa + xi;
	for(ii=kmax-1; ii>=0; ii--)
		{
		if(ipiv[ii]!=ii)
			{
			tmp = x[ipiv[ii]];
			x[ipiv[ii]] = x[ii];
			x[ii] = tmp;
			}
		}
	return;
	}
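// Note (added remark): this applies the same swaps as blasfeo_dvecpe but in decreasing
// order of ii, so it undoes the permutation; calling blasfeo_dvecpe and then
// blasfeo_dvecpei with the same ipiv restores the original vector.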



#else

#error : wrong LA choice

#endif
