1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <math.h>
39
40 #include "../include/blasfeo_common.h"
41 #include "../include/blasfeo_block_size.h"
42 #include "../include/blasfeo_s_kernel.h"
43
44
45
46 // scales and adds a strvec into a strvec
blasfeo_svecad(int m,float * alphap,struct blasfeo_svec * sa,int ai,struct blasfeo_svec * sc,int ci)47 void blasfeo_svecad(int m, float *alphap, struct blasfeo_svec *sa, int ai, struct blasfeo_svec *sc, int ci)
48 {
49 float alpha = alphap[0];
50 float *pa = sa->pa + ai;
51 float *pc = sc->pa + ci;
52 int ii;
53 ii = 0;
54 for(; ii<m-3; ii+=4)
55 {
56 pc[ii+0] += alpha*pa[ii+0];
57 pc[ii+1] += alpha*pa[ii+1];
58 pc[ii+2] += alpha*pa[ii+2];
59 pc[ii+3] += alpha*pa[ii+3];
60 }
61 for(; ii<m; ii++)
62 {
63 pc[ii+0] += alpha*pa[ii+0];
64 }
65 return;
66 }
67
68
69
70 // transpose general matrix; m and n are referred to the original matrix
sgetr_lib(int m,int n,float alpha,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)71 void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
72 {
73
74 /*
75
76 m = 5
77 n = 3
78 offsetA = 1
79 offsetC = 2
80
81 A =
82 x x x
83 -
84 x x x
85 x x x
86 x x x
87 x x x
88
89 C =
90 x x x x x
91 x x x x x
92 -
93 x x x x x
94
95 */
96
97 if(m<=0 || n<=0)
98 return;
99
100 const int bs = 4;
101
102 int mna = (bs-offsetA%bs)%bs;
103 mna = m<mna ? m : mna;
104 int nna = (bs-offsetC%bs)%bs;
105 nna = n<nna ? n : nna;
106
107 int ii;
108
109 ii = 0;
110
111 if(mna>0)
112 {
113 if(mna==1)
114 kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
115 else if(mna==2)
116 kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
117 else //if(mna==3)
118 kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
119 ii += mna;
120 pA += mna + bs*(sda-1);
121 pC += mna*bs;
122 }
123 for( ; ii<m-3; ii+=4)
124 // for( ; ii<m; ii+=4)
125 {
126 kernel_sgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
127 pA += bs*sda;
128 pC += bs*bs;
129 }
130
131 // clean-up at the end using smaller kernels
132 if(ii==m)
133 return;
134
135 if(m-ii==1)
136 kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
137 else if(m-ii==2)
138 kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
139 else if(m-ii==3)
140 kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
141
142 return;
143
144 }
145
146
147
148 // transpose lower triangular matrix
strtr_l_lib(int m,float alpha,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)149 void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
150 {
151
152 /*
153
154 A =
155 x
156 x x
157 x x x
158 x x x x
159
160 x x x x x
161 x x x x x x
162 x x x x x x x
163 x x x x x x x x
164
165 C =
166 x x x x x x x x
167
168 x x x x x x x
169 x x x x x x
170 x x x x x
171 x x x x
172
173 x x x
174 x x
175 x
176
177 */
178
179 int n = m;
180
181 if(m<=0 || n<=0)
182 return;
183
184 const int bs = 4;
185
186 int mna = (bs-offsetA%bs)%bs;
187 mna = m<mna ? m : mna;
188 int nna = (bs-offsetC%bs)%bs;
189 nna = n<nna ? n : nna;
190
191 int ii;
192
193 ii = 0;
194
195 if(mna>0)
196 {
197 if(mna==1)
198 {
199 pC[0] = alpha * pA[0];
200 }
201 else if(mna==2)
202 {
203 if(nna==1)
204 {
205 pC[0+bs*0] = alpha * pA[0+bs*0];
206 pC[0+bs*1] = alpha * pA[1+bs*0];
207 pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
208 }
209 else
210 {
211 pC[0+bs*0] = alpha * pA[0+bs*0];
212 pC[0+bs*1] = alpha * pA[1+bs*0];
213 pC[1+bs*1] = alpha * pA[1+bs*1];
214 }
215 }
216 else //if(mna==3)
217 {
218 if(nna==1)
219 {
220 pC[0+bs*0] = alpha * pA[0+bs*0];
221 pC[0+bs*1] = alpha * pA[1+bs*0];
222 pC[0+bs*2] = alpha * pA[2+bs*0];
223 pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
224 pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
225 pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
226 }
227 else if(nna==2)
228 {
229 pC[0+bs*0] = alpha * pA[0+bs*0];
230 pC[0+bs*1] = alpha * pA[1+bs*0];
231 pC[0+bs*2] = alpha * pA[2+bs*0];
232 pC[1+bs*1] = alpha * pA[1+bs*1];
233 pC[1+bs*2] = alpha * pA[2+bs*1];
234 pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
235 }
236 else
237 {
238 pC[0+bs*0] = alpha * pA[0+bs*0];
239 pC[0+bs*1] = alpha * pA[1+bs*0];
240 pC[0+bs*2] = alpha * pA[2+bs*0];
241 pC[1+bs*1] = alpha * pA[1+bs*1];
242 pC[1+bs*2] = alpha * pA[2+bs*1];
243 pC[2+bs*2] = alpha * pA[2+bs*2];
244 }
245 }
246 ii += mna;
247 pA += mna + bs*(sda-1);
248 pC += mna*bs;
249 }
250 for( ; ii<m-3; ii+=4)
251 {
252 kernel_sgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
253 pA += bs*sda;
254 pC += bs*bs;
255 }
256
257 // clean-up at the end using smaller kernels
258 if(ii==m)
259 return;
260
261 if(m-ii==1)
262 kernel_sgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
263 else if(m-ii==2)
264 kernel_sgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
265 else if(m-ii==3)
266 kernel_sgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
267
268 return;
269
270 }
271
272
273
274 // transpose an aligned upper triangular matrix into an aligned lower triangular matrix
strtr_u_lib(int m,float alpha,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)275 void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
276 {
277
278 /*
279
280 A =
281 x x x x x x x x
282 x x x x x x x
283
284 x x x x x x
285 x x x x x
286 x x x x
287 x x x
288 x x
289 x
290
291 C =
292 x
293
294 x x
295 x x x
296 x x x x
297 x x x x x
298 x x x x x x
299 x x x x x x x
300 x x x x x x x x
301
302 */
303
304 int n = m;
305
306 if(m<=0 || n<=0)
307 return;
308
309 const int bs = 4;
310
311 int mna = (bs-offsetA%bs)%bs;
312 mna = m<mna ? m : mna;
313 int nna = (bs-offsetC%bs)%bs;
314 nna = n<nna ? n : nna;
315 int tna = nna;
316
317 int ii;
318
319 ii = 0;
320
321 if(mna>0)
322 {
323 if(mna==1)
324 {
325 kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
326 if(nna!=1)
327 {
328 // pC[0+bs*0] = alpha * pA[0+bs*0];
329 pA += 1*bs;
330 pC += 1;
331 tna = (bs-(offsetC+1)%bs)%bs;
332 }
333 else //if(nna==1)
334 {
335 // pC[0+bs*0] = alpha * pA[0+bs*0];
336 pA += 1*bs;
337 pC += 1 + (sdc-1)*bs;
338 tna = 0; //(bs-(offsetC+1)%bs)%bs;
339 }
340 // kernel_sgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
341 }
342 else if(mna==2)
343 {
344 if(nna==0 || nna==3)
345 {
346 pC[0+bs*0] = alpha * pA[0+bs*0];
347 pC[1+bs*0] = alpha * pA[0+bs*1];
348 pC[1+bs*1] = alpha * pA[1+bs*1];
349 pA += 2*bs;
350 pC += 2;
351 tna = (bs-(offsetC+2)%bs)%bs;
352 kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
353 }
354 else if(nna==1)
355 {
356 pC[0+bs*0] = alpha * pA[0+bs*0];
357 pA += 1*bs;
358 pC += 1 + (sdc-1)*bs;
359 // pC[0+bs*0] = alpha * pA[0+bs*0];
360 // pC[0+bs*1] = alpha * pA[1+bs*0];
361 kernel_sgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
362 pA += 1*bs;
363 pC += 1;
364 tna = 3; //(bs-(offsetC+2)%bs)%bs;
365 // kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
366 }
367 else if(nna==2)
368 {
369 pC[0+bs*0] = alpha * pA[0+bs*0];
370 pC[1+bs*0] = alpha * pA[0+bs*1];
371 pC[1+bs*1] = alpha * pA[1+bs*1];
372 pA += 2*bs;
373 pC += 2 + (sdc-1)*bs;
374 tna = 0; //(bs-(offsetC+2)%bs)%bs;
375 kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
376 }
377 }
378 else //if(mna==3)
379 {
380 if(nna==0)
381 {
382 pC[0+bs*0] = alpha * pA[0+bs*0];
383 pC[1+bs*0] = alpha * pA[0+bs*1];
384 pC[1+bs*1] = alpha * pA[1+bs*1];
385 pC[2+bs*0] = alpha * pA[0+bs*2];
386 pC[2+bs*1] = alpha * pA[1+bs*2];
387 pC[2+bs*2] = alpha * pA[2+bs*2];
388 pA += 3*bs;
389 pC += 3;
390 tna = 1;
391 kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
392 }
393 else if(nna==1)
394 {
395 pC[0+bs*0] = alpha * pA[0+bs*0];
396 pA += bs;
397 pC += 1 + (sdc-1)*bs;
398 pC[0+bs*0] = alpha * pA[0+bs*0];
399 pC[0+bs*1] = alpha * pA[1+bs*0];
400 pC[1+bs*0] = alpha * pA[0+bs*1];
401 pC[1+bs*1] = alpha * pA[1+bs*1];
402 pC[1+bs*2] = alpha * pA[2+bs*1];
403 pA += 2*bs;
404 pC += 2;
405 tna = 2;
406 kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
407 }
408 else if(nna==2)
409 {
410 pC[0+bs*0] = alpha * pA[0+bs*0];
411 pC[1+bs*0] = alpha * pA[0+bs*1];
412 pC[1+bs*1] = alpha * pA[1+bs*1];
413 pA += 2*bs;
414 pC += 2 + (sdc-1)*bs;
415 // pC[0+bs*0] = alpha * pA[0+bs*0];
416 // pC[0+bs*1] = alpha * pA[1+bs*0];
417 // pC[0+bs*2] = alpha * pA[2+bs*0];
418 kernel_sgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
419 pA += 1*bs;
420 pC += 1;
421 tna = 3;
422 // kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
423 }
424 else //if(nna==3)
425 {
426 pC[0+bs*0] = alpha * pA[0+bs*0];
427 pC[1+bs*0] = alpha * pA[0+bs*1];
428 pC[1+bs*1] = alpha * pA[1+bs*1];
429 pC[2+bs*0] = alpha * pA[0+bs*2];
430 pC[2+bs*1] = alpha * pA[1+bs*2];
431 pC[2+bs*2] = alpha * pA[2+bs*2];
432 pA += 3*bs;
433 pC += 3 + (sdc-1)*bs;
434 tna = 0;
435 kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
436 }
437 }
438 ii += mna;
439 pA += mna + bs*(sda-1);
440 pC += mna*bs;
441 }
442 for( ; ii<m-3; ii+=4)
443 {
444 if(tna==0)
445 {
446 pC[0+bs*0] = alpha * pA[0+bs*0];
447 pC[1+bs*0] = alpha * pA[0+bs*1];
448 pC[1+bs*1] = alpha * pA[1+bs*1];
449 pC[2+bs*0] = alpha * pA[0+bs*2];
450 pC[2+bs*1] = alpha * pA[1+bs*2];
451 pC[2+bs*2] = alpha * pA[2+bs*2];
452 pC[3+bs*0] = alpha * pA[0+bs*3];
453 pC[3+bs*1] = alpha * pA[1+bs*3];
454 pC[3+bs*2] = alpha * pA[2+bs*3];
455 pC[3+bs*3] = alpha * pA[3+bs*3];
456 pA += 4*bs;
457 pC += sdc*bs;
458 kernel_sgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
459 }
460 else if(tna==1)
461 {
462 pC[0+bs*0] = alpha * pA[0+bs*0];
463 pA += bs;
464 pC += 1 + (sdc-1)*bs;
465 pC[0+bs*0] = alpha * pA[0+bs*0];
466 pC[0+bs*1] = alpha * pA[1+bs*0];
467 pC[1+bs*0] = alpha * pA[0+bs*1];
468 pC[1+bs*1] = alpha * pA[1+bs*1];
469 pC[1+bs*2] = alpha * pA[2+bs*1];
470 pC[2+bs*0] = alpha * pA[0+bs*2];
471 pC[2+bs*1] = alpha * pA[1+bs*2];
472 pC[2+bs*2] = alpha * pA[2+bs*2];
473 pC[2+bs*3] = alpha * pA[3+bs*2];
474 pA += 3*bs;
475 pC += 3;
476 kernel_sgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
477 }
478 else if(tna==2)
479 {
480 pC[0+bs*0] = alpha * pA[0+bs*0];
481 pC[1+bs*0] = alpha * pA[0+bs*1];
482 pC[1+bs*1] = alpha * pA[1+bs*1];
483 pA += 2*bs;
484 pC += 2 + (sdc-1)*bs;
485 pC[0+bs*0] = alpha * pA[0+bs*0];
486 pC[0+bs*1] = alpha * pA[1+bs*0];
487 pC[0+bs*2] = alpha * pA[2+bs*0];
488 pC[1+bs*0] = alpha * pA[0+bs*1];
489 pC[1+bs*1] = alpha * pA[1+bs*1];
490 pC[1+bs*2] = alpha * pA[2+bs*1];
491 pC[1+bs*3] = alpha * pA[3+bs*1];
492 pA += 2*bs;
493 pC += 2;
494 kernel_sgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
495 }
496 else //if(tna==3)
497 {
498 pC[0+bs*0] = alpha * pA[0+bs*0];
499 pC[1+bs*0] = alpha * pA[0+bs*1];
500 pC[1+bs*1] = alpha * pA[1+bs*1];
501 pC[2+bs*0] = alpha * pA[0+bs*2];
502 pC[2+bs*1] = alpha * pA[1+bs*2];
503 pC[2+bs*2] = alpha * pA[2+bs*2];
504 pA += 3*bs;
505 pC += 3 + (sdc-1)*bs;
506 kernel_sgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
507 // pC[0+bs*0] = alpha * pA[0+bs*0];
508 // pC[0+bs*1] = alpha * pA[1+bs*0];
509 // pC[0+bs*2] = alpha * pA[2+bs*0];
510 // pC[0+bs*3] = alpha * pA[3+bs*0];
511 pA += bs;
512 pC += 1;
513 // kernel_sgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
514 }
515 pA += bs*sda;
516 pC += bs*bs;
517 }
518
519 // clean-up at the end
520 if(ii==m)
521 return;
522
523 if(m-ii==1)
524 {
525 pC[0+bs*0] = alpha * pA[0+bs*0];
526 }
527 else if(m-ii==2)
528 {
529 if(tna!=1)
530 {
531 pC[0+bs*0] = alpha * pA[0+bs*0];
532 pC[1+bs*0] = alpha * pA[0+bs*1];
533 pC[1+bs*1] = alpha * pA[1+bs*1];
534 }
535 else //if(tna==1)
536 {
537 pC[0+bs*0] = alpha * pA[0+bs*0];
538 pA += bs;
539 pC += 1 + (sdc-1)*bs;
540 pC[0+bs*0] = alpha * pA[0+bs*0];
541 pC[0+bs*1] = alpha * pA[1+bs*0];
542 }
543 }
544 else if(m-ii==3)
545 {
546 if(tna==0 || tna==3)
547 {
548 pC[0+bs*0] = alpha * pA[0+bs*0];
549 pC[1+bs*0] = alpha * pA[0+bs*1];
550 pC[1+bs*1] = alpha * pA[1+bs*1];
551 pC[2+bs*0] = alpha * pA[0+bs*2];
552 pC[2+bs*1] = alpha * pA[1+bs*2];
553 pC[2+bs*2] = alpha * pA[2+bs*2];
554 }
555 else if(tna==1)
556 {
557 pC[0+bs*0] = alpha * pA[0+bs*0];
558 pA += bs;
559 pC += 1 + (sdc-1)*bs;
560 pC[0+bs*0] = alpha * pA[0+bs*0];
561 pC[0+bs*1] = alpha * pA[1+bs*0];
562 pC[1+bs*0] = alpha * pA[0+bs*0];
563 pC[1+bs*1] = alpha * pA[1+bs*1];
564 pC[1+bs*2] = alpha * pA[2+bs*1];
565 }
566 else //if(tna==2)
567 {
568 pC[0+bs*0] = alpha * pA[0+bs*0];
569 pC[1+bs*0] = alpha * pA[0+bs*1];
570 pC[1+bs*1] = alpha * pA[1+bs*1];
571 pA += 2*bs;
572 pC += 2 + (sdc-1)*bs;
573 pC[0+bs*0] = alpha * pA[0+bs*0];
574 pC[0+bs*1] = alpha * pA[1+bs*0];
575 pC[0+bs*2] = alpha * pA[2+bs*0];
576 }
577 }
578
579 return;
580
581 }
582
583
584
585 // regularize diagonal
sdiareg_lib(int kmax,float reg,int offset,float * pD,int sdd)586 void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
587 {
588
589 const int bs = 4;
590
591 int kna = (bs-offset%bs)%bs;
592 kna = kmax<kna ? kmax : kna;
593
594 int jj, ll;
595
596 if(kna>0)
597 {
598 for(ll=0; ll<kna; ll++)
599 {
600 pD[ll+bs*ll] += reg;
601 }
602 pD += kna + bs*(sdd-1) + kna*bs;
603 kmax -= kna;
604 }
605 for(jj=0; jj<kmax-3; jj+=4)
606 {
607 pD[jj*sdd+(jj+0)*bs+0] += reg;
608 pD[jj*sdd+(jj+1)*bs+1] += reg;
609 pD[jj*sdd+(jj+2)*bs+2] += reg;
610 pD[jj*sdd+(jj+3)*bs+3] += reg;
611 }
612 for(ll=0; ll<kmax-jj; ll++)
613 {
614 pD[jj*sdd+(jj+ll)*bs+ll] += reg;
615 }
616
617 }
618
619
620
621 // insert vector to diagonal
sdiain_lib(int kmax,float alpha,float * x,int offset,float * pD,int sdd)622 void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
623 {
624
625 const int bs = 4;
626
627 int kna = (bs-offset%bs)%bs;
628 kna = kmax<kna ? kmax : kna;
629
630 int jj, ll;
631
632 if(kna>0)
633 {
634 for(ll=0; ll<kna; ll++)
635 {
636 pD[ll+bs*ll] = alpha*x[ll];
637 }
638 pD += kna + bs*(sdd-1) + kna*bs;
639 x += kna;
640 kmax -= kna;
641 }
642 for(jj=0; jj<kmax-3; jj+=4)
643 {
644 pD[jj*sdd+(jj+0)*bs+0] = alpha*x[jj+0];
645 pD[jj*sdd+(jj+1)*bs+1] = alpha*x[jj+1];
646 pD[jj*sdd+(jj+2)*bs+2] = alpha*x[jj+2];
647 pD[jj*sdd+(jj+3)*bs+3] = alpha*x[jj+3];
648 }
649 for(ll=0; ll<kmax-jj; ll++)
650 {
651 pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
652 }
653
654 }
655
656
657
658 // insert sqrt of vector to diagonal
sdiain_sqrt_lib(int kmax,float * x,int offset,float * pD,int sdd)659 void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
660 {
661
662 const int bs = 4;
663
664 int kna = (bs-offset%bs)%bs;
665 kna = kmax<kna ? kmax : kna;
666
667 int jj, ll;
668
669 if(kna>0)
670 {
671 for(ll=0; ll<kna; ll++)
672 {
673 pD[ll+bs*ll] = sqrt(x[ll]);
674 }
675 pD += kna + bs*(sdd-1) + kna*bs;
676 x += kna;
677 kmax -= kna;
678 }
679 for(jj=0; jj<kmax-3; jj+=4)
680 {
681 pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
682 pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
683 pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
684 pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
685 }
686 for(ll=0; ll<kmax-jj; ll++)
687 {
688 pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
689 }
690
691 }
692
693
694
695 // extract diagonal to vector
sdiaex_lib(int kmax,float alpha,int offset,float * pD,int sdd,float * x)696 void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
697 {
698
699 const int bs = 4;
700
701 int kna = (bs-offset%bs)%bs;
702 kna = kmax<kna ? kmax : kna;
703
704 int jj, ll;
705
706 if(kna>0)
707 {
708 for(ll=0; ll<kna; ll++)
709 {
710 x[ll] = alpha * pD[ll+bs*ll];
711 }
712 pD += kna + bs*(sdd-1) + kna*bs;
713 x += kna;
714 kmax -= kna;
715 }
716 for(jj=0; jj<kmax-3; jj+=4)
717 {
718 x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
719 x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
720 x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
721 x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
722 }
723 for(ll=0; ll<kmax-jj; ll++)
724 {
725 x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
726 }
727
728 }
729
730
731
732 // add scaled vector to diagonal
sdiaad_lib(int kmax,float alpha,float * x,int offset,float * pD,int sdd)733 void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
734 {
735
736 const int bs = 4;
737
738 int kna = (bs-offset%bs)%bs;
739 kna = kmax<kna ? kmax : kna;
740
741 int jj, ll;
742
743 if(kna>0)
744 {
745 for(ll=0; ll<kna; ll++)
746 {
747 pD[ll+bs*ll] += alpha * x[ll];
748 }
749 pD += kna + bs*(sdd-1) + kna*bs;
750 x += kna;
751 kmax -= kna;
752 }
753 for(jj=0; jj<kmax-3; jj+=4)
754 {
755 pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
756 pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
757 pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
758 pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
759 }
760 for(ll=0; ll<kmax-jj; ll++)
761 {
762 pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
763 }
764
765 }
766
767
768
769 // insert vector to diagonal, sparse formulation
sdiain_libsp(int kmax,int * idx,float alpha,float * x,float * pD,int sdd)770 void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
771 {
772
773 const int bs = 4;
774
775 int ii, jj;
776
777 for(jj=0; jj<kmax; jj++)
778 {
779 ii = idx[jj];
780 pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
781 }
782
783 }
784
785
786
787 // extract diagonal to vector, sparse formulation
sdiaex_libsp(int kmax,int * idx,float alpha,float * pD,int sdd,float * x)788 void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
789 {
790
791 const int bs = 4;
792
793 int ii, jj;
794
795 for(jj=0; jj<kmax; jj++)
796 {
797 ii = idx[jj];
798 x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
799 }
800
801 }
802
803
804
805 // add scaled vector to diagonal, sparse formulation
sdiaad_libsp(int kmax,int * idx,float alpha,float * x,float * pD,int sdd)806 void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
807 {
808
809 const int bs = 4;
810
811 int ii, jj;
812
813 for(jj=0; jj<kmax; jj++)
814 {
815 ii = idx[jj];
816 pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
817 }
818
819 }
820
821
822
823 // add scaled vector to another vector and insert to diagonal, sparse formulation
sdiaadin_libsp(int kmax,int * idx,float alpha,float * x,float * y,float * pD,int sdd)824 void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
825 {
826
827 const int bs = 4;
828
829 int ii, jj;
830
831 for(jj=0; jj<kmax; jj++)
832 {
833 ii = idx[jj];
834 pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
835 }
836
837 }
838
839
840
841 // insert vector to row
srowin_lib(int kmax,float alpha,float * x,float * pD)842 void srowin_lib(int kmax, float alpha, float *x, float *pD)
843 {
844
845 const int bs = 4;
846
847 int jj, ll;
848
849 for(jj=0; jj<kmax-3; jj+=4)
850 {
851 pD[(jj+0)*bs] = alpha*x[jj+0];
852 pD[(jj+1)*bs] = alpha*x[jj+1];
853 pD[(jj+2)*bs] = alpha*x[jj+2];
854 pD[(jj+3)*bs] = alpha*x[jj+3];
855 }
856 for(; jj<kmax; jj++)
857 {
858 pD[(jj)*bs] = alpha*x[jj];
859 }
860
861 }
862
863
864
865 // extract row to vector
srowex_lib(int kmax,float alpha,float * pD,float * x)866 void srowex_lib(int kmax, float alpha, float *pD, float *x)
867 {
868
869 const int bs = 4;
870
871 int jj, ll;
872
873 for(jj=0; jj<kmax-3; jj+=4)
874 {
875 x[jj+0] = alpha*pD[(jj+0)*bs];
876 x[jj+1] = alpha*pD[(jj+1)*bs];
877 x[jj+2] = alpha*pD[(jj+2)*bs];
878 x[jj+3] = alpha*pD[(jj+3)*bs];
879 }
880 for(; jj<kmax; jj++)
881 {
882 x[jj] = alpha*pD[(jj)*bs];
883 }
884
885 }
886
887
888
889 // add scaled vector to row
srowad_lib(int kmax,float alpha,float * x,float * pD)890 void srowad_lib(int kmax, float alpha, float *x, float *pD)
891 {
892
893 const int bs = 4;
894
895 int jj, ll;
896
897 for(jj=0; jj<kmax-3; jj+=4)
898 {
899 pD[(jj+0)*bs] += alpha * x[jj+0];
900 pD[(jj+1)*bs] += alpha * x[jj+1];
901 pD[(jj+2)*bs] += alpha * x[jj+2];
902 pD[(jj+3)*bs] += alpha * x[jj+3];
903 }
904 for(; jj<kmax; jj++)
905 {
906 pD[(jj)*bs] += alpha * x[jj];
907 }
908
909 }
910
911
912
913 // insert vector to row, sparse formulation
srowin_libsp(int kmax,float alpha,int * idx,float * x,float * pD)914 void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
915 {
916
917 const int bs = 4;
918
919 int ii, jj;
920
921 for(jj=0; jj<kmax; jj++)
922 {
923 ii = idx[jj];
924 pD[ii*bs] = alpha*x[jj];
925 }
926
927 }
928
929
930
931 // add scaled vector to row, sparse formulation
srowad_libsp(int kmax,int * idx,float alpha,float * x,float * pD)932 void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
933 {
934
935 const int bs = 4;
936
937 int ii, jj;
938
939 for(jj=0; jj<kmax; jj++)
940 {
941 ii = idx[jj];
942 pD[ii*bs] += alpha * x[jj];
943 }
944
945 }
946
947
948
949 // add scaled vector to another vector and insert to row, sparse formulation
srowadin_libsp(int kmax,int * idx,float alpha,float * x,float * y,float * pD)950 void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
951 {
952
953 const int bs = 4;
954
955 int ii, jj;
956
957 for(jj=0; jj<kmax; jj++)
958 {
959 ii = idx[jj];
960 pD[ii*bs] = y[jj] + alpha * x[jj];
961 }
962
963 }
964
965
966
967 // swap two rows
srowsw_lib(int kmax,float * pA,float * pC)968 void srowsw_lib(int kmax, float *pA, float *pC)
969 {
970
971 const int bs = 4;
972
973 int ii;
974 float tmp;
975
976 for(ii=0; ii<kmax-3; ii+=4)
977 {
978 tmp = pA[0+bs*0];
979 pA[0+bs*0] = pC[0+bs*0];
980 pC[0+bs*0] = tmp;
981 tmp = pA[0+bs*1];
982 pA[0+bs*1] = pC[0+bs*1];
983 pC[0+bs*1] = tmp;
984 tmp = pA[0+bs*2];
985 pA[0+bs*2] = pC[0+bs*2];
986 pC[0+bs*2] = tmp;
987 tmp = pA[0+bs*3];
988 pA[0+bs*3] = pC[0+bs*3];
989 pC[0+bs*3] = tmp;
990 pA += 4*bs;
991 pC += 4*bs;
992 }
993 for( ; ii<kmax; ii++)
994 {
995 tmp = pA[0+bs*0];
996 pA[0+bs*0] = pC[0+bs*0];
997 pC[0+bs*0] = tmp;
998 pA += 1*bs;
999 pC += 1*bs;
1000 }
1001
1002 }
1003
1004
1005
1006 // extract vector from column
scolex_lib(int kmax,int offset,float * pD,int sdd,float * x)1007 void scolex_lib(int kmax, int offset, float *pD, int sdd, float *x)
1008 {
1009
1010 const int bs = 4;
1011
1012 int kna = (bs-offset%bs)%bs;
1013 kna = kmax<kna ? kmax : kna;
1014
1015 int jj, ll;
1016
1017 if(kna>0)
1018 {
1019 for(ll=0; ll<kna; ll++)
1020 {
1021 x[ll] = pD[ll];
1022 }
1023 pD += kna + bs*(sdd-1);
1024 x += kna;
1025 kmax -= kna;
1026 }
1027 for(jj=0; jj<kmax-3; jj+=4)
1028 {
1029 x[jj*sdd+0] = pD[jj+0];
1030 x[jj*sdd+1] = pD[jj+1];
1031 x[jj*sdd+2] = pD[jj+2];
1032 x[jj*sdd+3] = pD[jj+3];
1033 }
1034 for(ll=0; ll<kmax-jj; ll++)
1035 {
1036 x[jj*sdd+ll] = pD[jj+ll];
1037 }
1038
1039 }
1040
1041
1042
1043 // insert vector to column
scolin_lib(int kmax,float * x,int offset,float * pD,int sdd)1044 void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
1045 {
1046
1047 const int bs = 4;
1048
1049 int kna = (bs-offset%bs)%bs;
1050 kna = kmax<kna ? kmax : kna;
1051
1052 int jj, ll;
1053
1054 if(kna>0)
1055 {
1056 for(ll=0; ll<kna; ll++)
1057 {
1058 pD[ll] = x[ll];
1059 }
1060 pD += kna + bs*(sdd-1);
1061 x += kna;
1062 kmax -= kna;
1063 }
1064 for(jj=0; jj<kmax-3; jj+=4)
1065 {
1066 pD[jj*sdd+0] = x[jj+0];
1067 pD[jj*sdd+1] = x[jj+1];
1068 pD[jj*sdd+2] = x[jj+2];
1069 pD[jj*sdd+3] = x[jj+3];
1070 }
1071 for(ll=0; ll<kmax-jj; ll++)
1072 {
1073 pD[jj*sdd+ll] = x[jj+ll];
1074 }
1075
1076 }
1077
1078
1079
1080 // add scaled vector to column
scolad_lib(int kmax,float alpha,float * x,int offset,float * pD,int sdd)1081 void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
1082 {
1083
1084 const int bs = 4;
1085
1086 int kna = (bs-offset%bs)%bs;
1087 kna = kmax<kna ? kmax : kna;
1088
1089 int jj, ll;
1090
1091 if(kna>0)
1092 {
1093 for(ll=0; ll<kna; ll++)
1094 {
1095 pD[ll] += alpha * x[ll];
1096 }
1097 pD += kna + bs*(sdd-1);
1098 x += kna;
1099 kmax -= kna;
1100 }
1101 for(jj=0; jj<kmax-3; jj+=4)
1102 {
1103 pD[jj*sdd+0] += alpha * x[jj+0];
1104 pD[jj*sdd+1] += alpha * x[jj+1];
1105 pD[jj*sdd+2] += alpha * x[jj+2];
1106 pD[jj*sdd+3] += alpha * x[jj+3];
1107 }
1108 for(ll=0; ll<kmax-jj; ll++)
1109 {
1110 pD[jj*sdd+ll] += alpha * x[jj+ll];
1111 }
1112
1113 }
1114
1115
1116
1117 // insert vector to diagonal, sparse formulation
scolin_libsp(int kmax,int * idx,float * x,float * pD,int sdd)1118 void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
1119 {
1120
1121 const int bs = 4;
1122
1123 int ii, jj;
1124
1125 for(jj=0; jj<kmax; jj++)
1126 {
1127 ii = idx[jj];
1128 pD[ii/bs*bs*sdd+ii%bs] = x[jj];
1129 }
1130
1131 }
1132
1133
1134
1135 // add scaled vector to diagonal, sparse formulation
scolad_libsp(int kmax,float alpha,int * idx,float * x,float * pD,int sdd)1136 void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
1137 {
1138
1139 const int bs = 4;
1140
1141 int ii, jj;
1142
1143 for(jj=0; jj<kmax; jj++)
1144 {
1145 ii = idx[jj];
1146 pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
1147 }
1148
1149 }
1150
1151
1152
1153 // swaps two cols
scolsw_lib(int kmax,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)1154 void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
1155 {
1156
1157 const int bs = 4;
1158
1159 int ii;
1160
1161 float tmp;
1162
1163 if(offsetA==offsetC)
1164 {
1165 if(offsetA>0)
1166 {
1167 ii = 0;
1168 for(; ii<bs-offsetA; ii++)
1169 {
1170 tmp = pA[0+bs*0];
1171 pA[0+bs*0] = pC[0+bs*0];
1172 pC[0+bs*0] = tmp;
1173 pA += 1;
1174 pC += 1;
1175 }
1176 pA += bs*(sda-1);
1177 pC += bs*(sdc-1);
1178 kmax -= bs-offsetA;
1179 }
1180 ii = 0;
1181 for(; ii<kmax-3; ii+=4)
1182 {
1183 tmp = pA[0+bs*0];
1184 pA[0+bs*0] = pC[0+bs*0];
1185 pC[0+bs*0] = tmp;
1186 tmp = pA[1+bs*0];
1187 pA[1+bs*0] = pC[1+bs*0];
1188 pC[1+bs*0] = tmp;
1189 tmp = pA[2+bs*0];
1190 pA[2+bs*0] = pC[2+bs*0];
1191 pC[2+bs*0] = tmp;
1192 tmp = pA[3+bs*0];
1193 pA[3+bs*0] = pC[3+bs*0];
1194 pC[3+bs*0] = tmp;
1195 pA += bs*sda;
1196 pC += bs*sdc;
1197 }
1198 for(; ii<kmax; ii++)
1199 {
1200 tmp = pA[0+bs*0];
1201 pA[0+bs*0] = pC[0+bs*0];
1202 pC[0+bs*0] = tmp;
1203 pA += 1;
1204 pC += 1;
1205 }
1206 }
1207 else
1208 {
1209 printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
1210 exit(1);
1211 }
1212
1213 return;
1214
1215 }
1216
1217
1218
1219 // insert vector to vector, sparse formulation
svecin_libsp(int kmax,int * idx,float * x,float * y)1220 void svecin_libsp(int kmax, int *idx, float *x, float *y)
1221 {
1222
1223 int jj;
1224
1225 for(jj=0; jj<kmax; jj++)
1226 {
1227 y[idx[jj]] = x[jj];
1228 }
1229
1230 }
1231
1232
1233
1234 // adds vector to vector, sparse formulation
svecad_libsp(int kmax,int * idx,float alpha,float * x,float * y)1235 void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
1236 {
1237
1238 int jj;
1239
1240 for(jj=0; jj<kmax; jj++)
1241 {
1242 y[idx[jj]] += alpha * x[jj];
1243 }
1244
1245 }
1246
1247
1248
1249 /****************************
1250 * new interface
1251 ****************************/
1252
1253
1254
1255 #if defined(LA_HIGH_PERFORMANCE)
1256
1257
1258
1259 // return the memory size (in bytes) needed for a strmat
blasfeo_memsize_smat(int m,int n)1260 int blasfeo_memsize_smat(int m, int n)
1261 {
1262 const int bs = 4;
1263 int nc = S_NC;
1264 int al = bs*nc;
1265 int pm = (m+bs-1)/bs*bs;
1266 int cn = (n+nc-1)/nc*nc;
1267 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1268 int memsize = (pm*cn+tmp)*sizeof(float);
1269 return memsize;
1270 }
1271
1272
1273
blasfeo_memsize_smat_ps(int ps,int m,int n)1274 int blasfeo_memsize_smat_ps(int ps, int m, int n)
1275 {
1276 int nc = S_NC;
1277 int al = ps*nc;
1278 int pm = (m+ps-1)/ps*ps;
1279 int cn = (n+nc-1)/nc*nc;
1280 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1281 int memsize = (pm*cn+tmp)*sizeof(float);
1282 return memsize;
1283 }
1284
1285
1286
1287 // return the memory size (in bytes) needed for the digonal of a strmat
blasfeo_memsize_diag_smat(int m,int n)1288 int blasfeo_memsize_diag_smat(int m, int n)
1289 {
1290 const int bs = 4;
1291 int nc = S_NC;
1292 int al = bs*nc;
1293 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1294 int memsize = tmp*sizeof(float);
1295 return memsize;
1296 }
1297
1298
1299
1300 // create a matrix structure for a matrix of size m*n by using memory passed by a pointer
blasfeo_create_smat(int m,int n,struct blasfeo_smat * sA,void * memory)1301 void blasfeo_create_smat(int m, int n, struct blasfeo_smat *sA, void *memory)
1302 {
1303
1304 // invalidate stored inverse diagonal
1305 sA->use_dA = 0;
1306
1307 const int bs = 4;
1308 int nc = S_NC;
1309 int al = bs*nc;
1310 sA->m = m;
1311 sA->n = n;
1312 int pm = (m+bs-1)/bs*bs;
1313 int cn = (n+nc-1)/nc*nc;
1314 sA->pm = pm;
1315 sA->cn = cn;
1316 float *ptr = (float *) memory;
1317 sA->pA = ptr;
1318 ptr += pm*cn;
1319 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1320 sA->dA = ptr;
1321 ptr += tmp;
1322 sA->memsize = (pm*cn+tmp)*sizeof(float);
1323 return;
1324 }
1325
1326
1327
blasfeo_create_smat_ps(int ps,int m,int n,struct blasfeo_smat * sA,void * memory)1328 void blasfeo_create_smat_ps(int ps, int m, int n, struct blasfeo_smat *sA, void *memory)
1329 {
1330
1331 // invalidate stored inverse diagonal
1332 sA->use_dA = 0;
1333
1334 int nc = S_NC;
1335 int al = ps*nc;
1336 sA->m = m;
1337 sA->n = n;
1338 int pm = (m+ps-1)/ps*ps;
1339 int cn = (n+nc-1)/nc*nc;
1340 sA->pm = pm;
1341 sA->cn = cn;
1342 float *ptr = (float *) memory;
1343 sA->pA = ptr;
1344 ptr += pm*cn;
1345 int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1346 sA->dA = ptr;
1347 ptr += tmp;
1348 sA->memsize = (pm*cn+tmp)*sizeof(float);
1349 return;
1350 }
1351
1352
1353
1354 // return memory size (in bytes) needed for a strvec
blasfeo_memsize_svec(int m)1355 int blasfeo_memsize_svec(int m)
1356 {
1357 const int bs = 4;
1358 // int nc = S_NC;
1359 // int al = bs*nc;
1360 int pm = (m+bs-1)/bs*bs;
1361 int memsize = pm*sizeof(float);
1362 return memsize;
1363 }
1364
1365
1366
1367 // create a vector structure for a vector of size m by using memory passed by a pointer
blasfeo_create_svec(int m,struct blasfeo_svec * sa,void * memory)1368 void blasfeo_create_svec(int m, struct blasfeo_svec *sa, void *memory)
1369 {
1370 const int bs = 4;
1371 // int nc = S_NC;
1372 // int al = bs*nc;
1373 sa->m = m;
1374 int pm = (m+bs-1)/bs*bs;
1375 sa->pm = pm;
1376 float *ptr = (float *) memory;
1377 sa->pa = ptr;
1378 // ptr += pm;
1379 sa->memsize = pm*sizeof(float);
1380 return;
1381 }
1382
1383
1384
1385 // convert a matrix into a matrix structure
blasfeo_pack_smat(int m,int n,float * A,int lda,struct blasfeo_smat * sA,int ai,int aj)1386 void blasfeo_pack_smat(int m, int n, float *A, int lda, struct blasfeo_smat *sA, int ai, int aj)
1387 {
1388
1389 // invalidate stored inverse diagonal
1390 sA->use_dA = 0;
1391
1392 const int bs = 4;
1393 int sda = sA->cn;
1394 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1395 int i, ii, j, jj, m0, m1, m2;
1396 float *B, *pB;
1397 m0 = (bs-ai%bs)%bs;
1398 if(m0>m)
1399 m0 = m;
1400 m1 = m - m0;
1401 jj = 0;
1402 for( ; jj<n-3; jj+=4)
1403 {
1404 B = A + jj*lda;
1405 pB = pA + jj*bs;
1406 ii = 0;
1407 if(m0>0)
1408 {
1409 for( ; ii<m0; ii++)
1410 {
1411 pB[ii+bs*0] = B[ii+lda*0];
1412 pB[ii+bs*1] = B[ii+lda*1];
1413 pB[ii+bs*2] = B[ii+lda*2];
1414 pB[ii+bs*3] = B[ii+lda*3];
1415 }
1416 B += m0;
1417 pB += m0 + bs*(sda-1);
1418 }
1419 for( ; ii<m-3; ii+=4)
1420 {
1421 // col 0
1422 pB[0+bs*0] = B[0+lda*0];
1423 pB[1+bs*0] = B[1+lda*0];
1424 pB[2+bs*0] = B[2+lda*0];
1425 pB[3+bs*0] = B[3+lda*0];
1426 // col 1
1427 pB[0+bs*1] = B[0+lda*1];
1428 pB[1+bs*1] = B[1+lda*1];
1429 pB[2+bs*1] = B[2+lda*1];
1430 pB[3+bs*1] = B[3+lda*1];
1431 // col 2
1432 pB[0+bs*2] = B[0+lda*2];
1433 pB[1+bs*2] = B[1+lda*2];
1434 pB[2+bs*2] = B[2+lda*2];
1435 pB[3+bs*2] = B[3+lda*2];
1436 // col 3
1437 pB[0+bs*3] = B[0+lda*3];
1438 pB[1+bs*3] = B[1+lda*3];
1439 pB[2+bs*3] = B[2+lda*3];
1440 pB[3+bs*3] = B[3+lda*3];
1441 // update
1442 B += 4;
1443 pB += bs*sda;
1444 }
1445 for( ; ii<m; ii++)
1446 {
1447 // col 0
1448 pB[0+bs*0] = B[0+lda*0];
1449 // col 1
1450 pB[0+bs*1] = B[0+lda*1];
1451 // col 2
1452 pB[0+bs*2] = B[0+lda*2];
1453 // col 3
1454 pB[0+bs*3] = B[0+lda*3];
1455 // update
1456 B += 1;
1457 pB += 1;
1458 }
1459 }
1460 for( ; jj<n; jj++)
1461 {
1462
1463 B = A + jj*lda;
1464 pB = pA + jj*bs;
1465
1466 ii = 0;
1467 if(m0>0)
1468 {
1469 for( ; ii<m0; ii++)
1470 {
1471 pB[ii+bs*0] = B[ii+lda*0];
1472 }
1473 B += m0;
1474 pB += m0 + bs*(sda-1);
1475 }
1476 for( ; ii<m-3; ii+=4)
1477 {
1478 // col 0
1479 pB[0+bs*0] = B[0+lda*0];
1480 pB[1+bs*0] = B[1+lda*0];
1481 pB[2+bs*0] = B[2+lda*0];
1482 pB[3+bs*0] = B[3+lda*0];
1483 // update
1484 B += 4;
1485 pB += bs*sda;
1486 }
1487 for( ; ii<m; ii++)
1488 {
1489 // col 0
1490 pB[0+bs*0] = B[0+lda*0];
1491 // update
1492 B += 1;
1493 pB += 1;
1494 }
1495 }
1496 return;
1497 }
1498
1499
1500
1501 // convert and transpose a matrix into a matrix structure
blasfeo_pack_tran_smat(int m,int n,float * A,int lda,struct blasfeo_smat * sA,int ai,int aj)1502 void blasfeo_pack_tran_smat(int m, int n, float *A, int lda, struct blasfeo_smat *sA, int ai, int aj)
1503 {
1504
1505 // invalidate stored inverse diagonal
1506 sA->use_dA = 0;
1507
1508 const int bs = 4;
1509 int sda = sA->cn;
1510 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1511 int i, ii, j, m0, m1, m2;
1512 float *B, *pB;
1513 m0 = (bs-ai%bs)%bs;
1514 if(m0>n)
1515 m0 = n;
1516 m1 = n - m0;
1517 ii = 0;
1518 if(m0>0)
1519 {
1520 for(j=0; j<m; j++)
1521 {
1522 for(i=0; i<m0; i++)
1523 {
1524 pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
1525 }
1526 }
1527 A += m0*lda;
1528 pA += m0 + bs*(sda-1);
1529 }
1530 ii = 0;
1531 for(; ii<m1-3; ii+=bs)
1532 {
1533 j=0;
1534 B = A + ii*lda;
1535 pB = pA + ii*sda;
1536 for(; j<m-3; j+=4)
1537 {
1538 // unroll 0
1539 pB[0+0*bs] = B[0+0*lda];
1540 pB[1+0*bs] = B[0+1*lda];
1541 pB[2+0*bs] = B[0+2*lda];
1542 pB[3+0*bs] = B[0+3*lda];
1543 // unroll 1
1544 pB[0+1*bs] = B[1+0*lda];
1545 pB[1+1*bs] = B[1+1*lda];
1546 pB[2+1*bs] = B[1+2*lda];
1547 pB[3+1*bs] = B[1+3*lda];
1548 // unroll 2
1549 pB[0+2*bs] = B[2+0*lda];
1550 pB[1+2*bs] = B[2+1*lda];
1551 pB[2+2*bs] = B[2+2*lda];
1552 pB[3+2*bs] = B[2+3*lda];
1553 // unroll 3
1554 pB[0+3*bs] = B[3+0*lda];
1555 pB[1+3*bs] = B[3+1*lda];
1556 pB[2+3*bs] = B[3+2*lda];
1557 pB[3+3*bs] = B[3+3*lda];
1558 B += 4;
1559 pB += 4*bs;
1560 }
1561 for(; j<m; j++)
1562 {
1563 // unroll 0
1564 pB[0+0*bs] = B[0+0*lda];
1565 pB[1+0*bs] = B[0+1*lda];
1566 pB[2+0*bs] = B[0+2*lda];
1567 pB[3+0*bs] = B[0+3*lda];
1568 B += 1;
1569 pB += 1*bs;
1570 }
1571 }
1572 if(ii<m1)
1573 {
1574 m2 = m1-ii;
1575 if(bs<m2) m2 = bs;
1576 for(j=0; j<m; j++)
1577 {
1578 for(i=0; i<m2; i++)
1579 {
1580 pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
1581 }
1582 }
1583 }
1584 return;
1585 }
1586
1587
1588
1589 // convert a vector into a vector structure
blasfeo_pack_svec(int m,float * a,struct blasfeo_svec * sa,int ai)1590 void blasfeo_pack_svec(int m, float *a, struct blasfeo_svec *sa, int ai)
1591 {
1592 float *pa = sa->pa + ai;
1593 int ii;
1594 for(ii=0; ii<m; ii++)
1595 pa[ii] = a[ii];
1596 return;
1597 }
1598
1599
1600
1601 // convert a matrix structure into a matrix
blasfeo_unpack_smat(int m,int n,struct blasfeo_smat * sA,int ai,int aj,float * A,int lda)1602 void blasfeo_unpack_smat(int m, int n, struct blasfeo_smat *sA, int ai, int aj, float *A, int lda)
1603 {
1604 const int bs = 4;
1605 int sda = sA->cn;
1606 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1607 int i, ii, jj;
1608 int m0 = (bs-ai%bs)%bs;
1609 if(m0>m)
1610 m0 = m;
1611 float *ptr_pA;
1612 jj=0;
1613 for(; jj<n-3; jj+=4)
1614 {
1615 ptr_pA = pA + jj*bs;
1616 ii = 0;
1617 if(m0>0)
1618 {
1619 for(; ii<m0; ii++)
1620 {
1621 // unroll 0
1622 A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
1623 // unroll 1
1624 A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
1625 // unroll 2
1626 A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
1627 // unroll 3
1628 A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
1629 ptr_pA++;
1630 }
1631 ptr_pA += (sda-1)*bs;
1632 }
1633 for(; ii<m-bs+1; ii+=bs)
1634 {
1635 // unroll 0
1636 A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
1637 A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
1638 A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
1639 A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
1640 // unroll 0
1641 A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
1642 A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
1643 A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
1644 A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
1645 // unroll 0
1646 A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
1647 A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
1648 A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
1649 A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
1650 // unroll 0
1651 A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
1652 A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
1653 A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
1654 A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
1655 ptr_pA += sda*bs;
1656 }
1657 for(; ii<m; ii++)
1658 {
1659 // unroll 0
1660 A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
1661 // unroll 1
1662 A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
1663 // unroll 2
1664 A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
1665 // unroll 3
1666 A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
1667 ptr_pA++;
1668 }
1669 }
1670 for(; jj<n; jj++)
1671 {
1672 ptr_pA = pA + jj*bs;
1673 ii = 0;
1674 if(m0>0)
1675 {
1676 for(; ii<m0; ii++)
1677 {
1678 A[ii+lda*jj] = ptr_pA[0];
1679 ptr_pA++;
1680 }
1681 ptr_pA += (sda-1)*bs;
1682 }
1683 for(; ii<m-bs+1; ii+=bs)
1684 {
1685 A[0+ii+lda*jj] = ptr_pA[0];
1686 A[1+ii+lda*jj] = ptr_pA[1];
1687 A[2+ii+lda*jj] = ptr_pA[2];
1688 A[3+ii+lda*jj] = ptr_pA[3];
1689 ptr_pA += sda*bs;
1690 }
1691 for(; ii<m; ii++)
1692 {
1693 A[ii+lda*jj] = ptr_pA[0];
1694 ptr_pA++;
1695 }
1696 }
1697 return;
1698 }
1699
1700
1701
1702 // convert and transpose a matrix structure into a matrix
blasfeo_unpack_tran_smat(int m,int n,struct blasfeo_smat * sA,int ai,int aj,float * A,int lda)1703 void blasfeo_unpack_tran_smat(int m, int n, struct blasfeo_smat *sA, int ai, int aj, float *A, int lda)
1704 {
1705 const int bs = 4;
1706 int sda = sA->cn;
1707 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1708 int i, ii, jj;
1709 int m0 = (bs-ai%bs)%bs;
1710 if(m0>m)
1711 m0 = m;
1712 float *ptr_pA;
1713 jj=0;
1714 for(; jj<n-3; jj+=4)
1715 {
1716 ptr_pA = pA + jj*bs;
1717 ii = 0;
1718 if(m0>0)
1719 {
1720 for(; ii<m0; ii++)
1721 {
1722 // unroll 0
1723 A[jj+0+lda*ii] = ptr_pA[0+bs*0];
1724 // unroll 1
1725 A[jj+1+lda*ii] = ptr_pA[0+bs*1];
1726 // unroll 2
1727 A[jj+2+lda*ii] = ptr_pA[0+bs*2];
1728 // unroll 3
1729 A[jj+3+lda*ii] = ptr_pA[0+bs*3];
1730 ptr_pA++;
1731 }
1732 ptr_pA += (sda-1)*bs;
1733 }
1734 for(; ii<m-bs+1; ii+=bs)
1735 {
1736 // unroll 0
1737 A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
1738 A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
1739 A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
1740 A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
1741 // unroll 1
1742 A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
1743 A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
1744 A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
1745 A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
1746 // unroll 2
1747 A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
1748 A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
1749 A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
1750 A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
1751 // unroll 3
1752 A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
1753 A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
1754 A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
1755 A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
1756 ptr_pA += sda*bs;
1757 }
1758 for(; ii<m; ii++)
1759 {
1760 // unroll 0
1761 A[jj+0+lda*ii] = ptr_pA[0+bs*0];
1762 // unroll 1
1763 A[jj+1+lda*ii] = ptr_pA[0+bs*1];
1764 // unroll 2
1765 A[jj+2+lda*ii] = ptr_pA[0+bs*2];
1766 // unroll 3
1767 A[jj+3+lda*ii] = ptr_pA[0+bs*3];
1768 ptr_pA++;
1769 }
1770 }
1771 for(; jj<n; jj++)
1772 {
1773 ptr_pA = pA + jj*bs;
1774 ii = 0;
1775 if(m0>0)
1776 {
1777 for(; ii<m0; ii++)
1778 {
1779 A[jj+lda*ii] = ptr_pA[0];
1780 ptr_pA++;
1781 }
1782 ptr_pA += (sda-1)*bs;
1783 }
1784 for(; ii<m-bs+1; ii+=bs)
1785 {
1786 i=0;
1787 for(; i<bs; i++)
1788 {
1789 A[jj+lda*(i+ii)] = ptr_pA[0];
1790 ptr_pA++;
1791 }
1792 ptr_pA += (sda-1)*bs;
1793 }
1794 for(; ii<m; ii++)
1795 {
1796 A[jj+lda*ii] = ptr_pA[0];
1797 ptr_pA++;
1798 }
1799 }
1800 return;
1801 }
1802
1803
1804
1805 // convert a vector structure into a vector
blasfeo_unpack_svec(int m,struct blasfeo_svec * sa,int ai,float * a)1806 void blasfeo_unpack_svec(int m, struct blasfeo_svec *sa, int ai, float *a)
1807 {
1808 float *pa = sa->pa + ai;
1809 int ii;
1810 for(ii=0; ii<m; ii++)
1811 a[ii] = pa[ii];
1812 return;
1813 }
1814
1815
1816
1817 // cast a matrix into a matrix structure
s_cast_mat2strmat(float * A,struct blasfeo_smat * sA)1818 void s_cast_mat2strmat(float *A, struct blasfeo_smat *sA)
1819 {
1820
1821 // invalidate stored inverse diagonal
1822 sA->use_dA = 0;
1823
1824 sA->pA = A;
1825 return;
1826 }
1827
1828
1829
1830 // cast a matrix into the diagonal of a matrix structure
s_cast_diag_mat2strmat(float * dA,struct blasfeo_smat * sA)1831 void s_cast_diag_mat2strmat(float *dA, struct blasfeo_smat *sA)
1832 {
1833
1834 // invalidate stored inverse diagonal
1835 sA->use_dA = 0;
1836
1837 sA->dA = dA;
1838 return;
1839 }
1840
1841
1842
1843 // cast a vector into a vector structure
s_cast_vec2vecmat(float * a,struct blasfeo_svec * sa)1844 void s_cast_vec2vecmat(float *a, struct blasfeo_svec *sa)
1845 {
1846 sa->pa = a;
1847 return;
1848 }
1849
1850
1851
1852 // insert element into strmat
blasfeo_sgein1(float a,struct blasfeo_smat * sA,int ai,int aj)1853 void blasfeo_sgein1(float a, struct blasfeo_smat *sA, int ai, int aj)
1854 {
1855
1856 if (ai == aj)
1857 {
1858 // invalidate stored inverse diagonal
1859 sA->use_dA = 0;
1860 }
1861
1862 const int bs = 4;
1863 int sda = sA->cn;
1864 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1865 pA[0] = a;
1866 return;
1867 }
1868
1869
1870
1871 // extract element from strmat
blasfeo_sgeex1(struct blasfeo_smat * sA,int ai,int aj)1872 float blasfeo_sgeex1(struct blasfeo_smat *sA, int ai, int aj)
1873 {
1874 const int bs = 4;
1875 int sda = sA->cn;
1876 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1877 return pA[0];
1878 }
1879
1880
1881
1882 // insert element into strvec
blasfeo_svecin1(float a,struct blasfeo_svec * sx,int xi)1883 void blasfeo_svecin1(float a, struct blasfeo_svec *sx, int xi)
1884 {
1885 const int bs = 4;
1886 float *x = sx->pa + xi;
1887 x[0] = a;
1888 return;
1889 }
1890
1891
1892
1893 // extract element from strvec
blasfeo_svecex1(struct blasfeo_svec * sx,int xi)1894 float blasfeo_svecex1(struct blasfeo_svec *sx, int xi)
1895 {
1896 const int bs = 4;
1897 float *x = sx->pa + xi;
1898 return x[0];
1899 }
1900
1901
1902
1903 // set all elements of a strmat to a value
blasfeo_sgese(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj)1904 void blasfeo_sgese(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj)
1905 {
1906
1907 // invalidate stored inverse diagonal
1908 sA->use_dA = 0;
1909
1910 const int bs = 4;
1911 int sda = sA->cn;
1912 float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
1913 int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
1914 int ii, jj;
1915 if(m0>0)
1916 {
1917 for(ii=0; ii<m0; ii++)
1918 {
1919 for(jj=0; jj<n; jj++)
1920 {
1921 pA[jj*bs] = alpha;
1922 }
1923 pA += 1;
1924 }
1925 pA += bs*(sda-1);
1926 m -= m0;
1927 }
1928 for(ii=0; ii<m-3; ii+=4)
1929 {
1930 for(jj=0; jj<n; jj++)
1931 {
1932 pA[0+jj*bs] = alpha;
1933 pA[1+jj*bs] = alpha;
1934 pA[2+jj*bs] = alpha;
1935 pA[3+jj*bs] = alpha;
1936 }
1937 pA += bs*sda;
1938 }
1939 for( ; ii<m; ii++)
1940 {
1941 for(jj=0; jj<n; jj++)
1942 {
1943 pA[jj*bs] = alpha;
1944 }
1945 pA += 1;
1946 }
1947 return;
1948 }
1949
1950
1951
1952 // set all elements of a strvec to a value
blasfeo_svecse(int m,float alpha,struct blasfeo_svec * sx,int xi)1953 void blasfeo_svecse(int m, float alpha, struct blasfeo_svec *sx, int xi)
1954 {
1955 float *x = sx->pa + xi;
1956 int ii;
1957 for(ii=0; ii<m; ii++)
1958 x[ii] = alpha;
1959 return;
1960 }
1961
1962
1963
1964 // extract diagonal to vector
blasfeo_sdiaex(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_svec * sx,int xi)1965 void blasfeo_sdiaex(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi)
1966 {
1967 const int bs = 4;
1968 int sda = sA->cn;
1969 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1970 float *x = sx->pa + xi;
1971 sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
1972 return;
1973 }
1974
1975
1976
1977 // insert a vector into diagonal
blasfeo_sdiain(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)1978 void blasfeo_sdiain(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
1979 {
1980
1981 // invalidate stored inverse diagonal
1982 sA->use_dA = 0;
1983
1984 const int bs = 4;
1985 int sda = sA->cn;
1986 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1987 float *x = sx->pa + xi;
1988 int offsetA = ai%bs;
1989
1990 int kna = (bs-offsetA%bs)%bs;
1991 kna = kmax<kna ? kmax : kna;
1992
1993 int jj, ll;
1994
1995 if(kna>0)
1996 {
1997 for(ll=0; ll<kna; ll++)
1998 {
1999 pA[ll+bs*ll] = alpha*x[ll];
2000 }
2001 pA += kna + bs*(sda-1) + kna*bs;
2002 x += kna;
2003 kmax -= kna;
2004 }
2005 for(jj=0; jj<kmax-3; jj+=4)
2006 {
2007 pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
2008 pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
2009 pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
2010 pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
2011 }
2012 for(ll=0; ll<kmax-jj; ll++)
2013 {
2014 pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
2015 }
2016 return;
2017 }
2018
2019
2020
2021 // add scalar to diagonal
blasfeo_sdiare(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj)2022 void blasfeo_sdiare(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj)
2023 {
2024
2025 // invalidate stored inverse diagonal
2026 sA->use_dA = 0;
2027
2028 const int bs = 4;
2029 int sda = sA->cn;
2030 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2031 int offsetA = ai%bs;
2032
2033 int kna = (bs-offsetA%bs)%bs;
2034 kna = kmax<kna ? kmax : kna;
2035
2036 int jj, ll;
2037
2038 if(kna>0)
2039 {
2040 for(ll=0; ll<kna; ll++)
2041 {
2042 pA[ll+bs*ll] += alpha;
2043 }
2044 pA += kna + bs*(sda-1) + kna*bs;
2045 kmax -= kna;
2046 }
2047 for(jj=0; jj<kmax-3; jj+=4)
2048 {
2049 pA[jj*sda+(jj+0)*bs+0] += alpha;
2050 pA[jj*sda+(jj+1)*bs+1] += alpha;
2051 pA[jj*sda+(jj+2)*bs+2] += alpha;
2052 pA[jj*sda+(jj+3)*bs+3] += alpha;
2053 }
2054 for(ll=0; ll<kmax-jj; ll++)
2055 {
2056 pA[jj*sda+(jj+ll)*bs+ll] += alpha;
2057 }
2058 return;
2059 }
2060
2061
2062
2063 // swap two rows of two matrix structs
blasfeo_srowsw(int kmax,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)2064 void blasfeo_srowsw(int kmax, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
2065 {
2066
2067 // invalidate stored inverse diagonal
2068 sA->use_dA = 0;
2069 sC->use_dA = 0;
2070
2071 const int bs = 4;
2072 int sda = sA->cn;
2073 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2074 int sdc = sC->cn;
2075 float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
2076 srowsw_lib(kmax, pA, pC);
2077 return;
2078 }
2079
2080
2081
2082 // permute the rows of a matrix struct
blasfeo_srowpe(int kmax,int * ipiv,struct blasfeo_smat * sA)2083 void blasfeo_srowpe(int kmax, int *ipiv, struct blasfeo_smat *sA)
2084 {
2085
2086 // invalidate stored inverse diagonal
2087 sA->use_dA = 0;
2088
2089 int ii;
2090 for(ii=0; ii<kmax; ii++)
2091 {
2092 if(ipiv[ii]!=ii)
2093 blasfeo_srowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
2094 }
2095 return;
2096 }
2097
2098
2099 // inverse permute the rows of a matrix struct
blasfeo_srowpei(int kmax,int * ipiv,struct blasfeo_smat * sA)2100 void blasfeo_srowpei(int kmax, int *ipiv, struct blasfeo_smat *sA)
2101 {
2102
2103 // invalidate stored inverse diagonal
2104 sA->use_dA = 0;
2105
2106 int ii;
2107 for(ii=kmax-1; ii>=0; ii--)
2108 {
2109 if(ipiv[ii]!=ii)
2110 blasfeo_srowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
2111 }
2112 return;
2113 }
2114
2115
2116 // extract a row int a vector
blasfeo_srowex(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_svec * sx,int xi)2117 void blasfeo_srowex(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi)
2118 {
2119 const int bs = 4;
2120 int sda = sA->cn;
2121 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2122 float *x = sx->pa + xi;
2123 srowex_lib(kmax, alpha, pA, x);
2124 return;
2125 }
2126
2127
2128
2129 // insert a vector into a row
blasfeo_srowin(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2130 void blasfeo_srowin(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2131 {
2132
2133 // invalidate stored inverse diagonal
2134 sA->use_dA = 0;
2135
2136 const int bs = 4;
2137 int sda = sA->cn;
2138 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2139 float *x = sx->pa + xi;
2140 srowin_lib(kmax, alpha, x, pA);
2141 return;
2142 }
2143
2144
2145
2146 // add a vector to a row
blasfeo_srowad(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2147 void blasfeo_srowad(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2148 {
2149
2150 // invalidate stored inverse diagonal
2151 sA->use_dA = 0;
2152
2153 const int bs = 4;
2154 int sda = sA->cn;
2155 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2156 float *x = sx->pa + xi;
2157 srowad_lib(kmax, alpha, x, pA);
2158 return;
2159 }
2160
2161
2162
2163 // extract vector from column
blasfeo_scolex(int kmax,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_svec * sx,int xi)2164 void blasfeo_scolex(int kmax, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi)
2165 {
2166
2167 // invalidate stored inverse diagonal
2168 sA->use_dA = 0;
2169
2170 const int bs = 4;
2171 int sda = sA->cn;
2172 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2173 float *x = sx->pa + xi;
2174 scolex_lib(kmax, ai%bs, pA, sda, x);
2175 return;
2176 }
2177
2178
2179
2180 // insert as vector as a column
blasfeo_scolin(int kmax,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2181 void blasfeo_scolin(int kmax, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2182 {
2183
2184 // invalidate stored inverse diagonal
2185 sA->use_dA = 0;
2186
2187 const int bs = 4;
2188 int sda = sA->cn;
2189 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2190 float *x = sx->pa + xi;
2191 scolin_lib(kmax, x, ai%bs, pA, sda);
2192 return;
2193 }
2194
2195
2196
2197 // add scaled vector to column
blasfeo_scolad(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2198 void blasfeo_scolad(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2199 {
2200
2201 // invalidate stored inverse diagonal
2202 sA->use_dA = 0;
2203
2204 const int bs = 4;
2205 int sda = sA->cn;
2206 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2207 float *x = sx->pa + xi;
2208 scolad_lib(kmax, alpha, x, ai%bs, pA, sda);
2209 return;
2210 }
2211
2212
2213
2214 // scale a column
blasfeo_scolsc(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj)2215 void blasfeo_scolsc(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj)
2216 {
2217
2218 // invalidate stored inverse diagonal
2219 sA->use_dA = 0;
2220
2221 const int bs = 4;
2222
2223 int sda = sA->cn;
2224 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2225
2226 int kna = (bs-ai%bs)%bs;
2227 kna = kmax<kna ? kmax : kna;
2228
2229 int jj, ll;
2230
2231 if(kna>0)
2232 {
2233 for(ll=0; ll<kna; ll++)
2234 {
2235 pA[ll] *= alpha;
2236 }
2237 pA += kna + bs*(sda-1);
2238 kmax -= kna;
2239 }
2240 for(jj=0; jj<kmax-3; jj+=4)
2241 {
2242 pA[jj*sda+0] *= alpha;
2243 pA[jj*sda+1] *= alpha;
2244 pA[jj*sda+2] *= alpha;
2245 pA[jj*sda+3] *= alpha;
2246 }
2247 for(ll=0; ll<kmax-jj; ll++)
2248 {
2249 pA[jj*sda+ll] *= alpha;
2250 }
2251
2252 return;
2253 }
2254
2255
2256
2257 // swap two cols of two matrix structs
blasfeo_scolsw(int kmax,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)2258 void blasfeo_scolsw(int kmax, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
2259 {
2260
2261 // invalidate stored inverse diagonal
2262 sA->use_dA = 0;
2263 sC->use_dA = 0;
2264
2265 const int bs = 4;
2266 int sda = sA->cn;
2267 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2268 int sdc = sC->cn;
2269 float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
2270 scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
2271 return;
2272 }
2273
2274
2275
2276 // permute the cols of a matrix struct
blasfeo_scolpe(int kmax,int * ipiv,struct blasfeo_smat * sA)2277 void blasfeo_scolpe(int kmax, int *ipiv, struct blasfeo_smat *sA)
2278 {
2279
2280 // invalidate stored inverse diagonal
2281 sA->use_dA = 0;
2282
2283 int ii;
2284 for(ii=0; ii<kmax; ii++)
2285 {
2286 if(ipiv[ii]!=ii)
2287 blasfeo_scolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
2288 }
2289 return;
2290 }
2291
2292
2293
2294 // inverse permute the cols of a matrix struct
blasfeo_scolpei(int kmax,int * ipiv,struct blasfeo_smat * sA)2295 void blasfeo_scolpei(int kmax, int *ipiv, struct blasfeo_smat *sA)
2296 {
2297
2298 // invalidate stored inverse diagonal
2299 sA->use_dA = 0;
2300
2301 int ii;
2302 for(ii=kmax-1; ii>=0; ii--)
2303 {
2304 if(ipiv[ii]!=ii)
2305 blasfeo_scolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
2306 }
2307 return;
2308 }
2309
2310
2311
2312 // --- ge
2313
2314
2315 // scale a generic strmat
blasfeo_sgesc(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj)2316 void blasfeo_sgesc(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj)
2317 {
2318
2319 if(m<=0 | n<=0)
2320 return;
2321
2322 // invalidate stored inverse diagonal
2323 sA->use_dA = 0;
2324
2325 #if defined(DIM_CHECK)
2326 // non-negative size
2327 if(m<0) printf("\n****** blasfeo_sgesc : m<0 : %d<0 *****\n", m);
2328 if(n<0) printf("\n****** blasfeo_sgesc : n<0 : %d<0 *****\n", n);
2329 // non-negative offset
2330 if(ai<0) printf("\n****** blasfeo_sgesc : ai<0 : %d<0 *****\n", ai);
2331 if(aj<0) printf("\n****** blasfeo_sgesc : aj<0 : %d<0 *****\n", aj);
2332 // inside matrix
2333 // A: m x n
2334 if(ai+m > sA->m) printf("\n***** blasfeo_sgesc : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
2335 if(aj+n > sA->n) printf("\n***** blasfeo_sgesc : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
2336 #endif
2337
2338 const int bs = 4;
2339
2340 int mna, ii;
2341
2342 int sda = sA->cn;
2343 float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
2344 int offA = ai%bs;
2345
2346 // same alignment
2347 ii = 0;
2348 // clean up at the beginning
2349 mna = (4-offA)%bs;
2350 if(mna>0)
2351 {
2352 if(m<mna) // mna<=3 ==> m = { 1, 2 }
2353 {
2354 if(m==1)
2355 {
2356 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pA+offA);
2357 return;
2358 }
2359 else //if(m==2 && mna==3)
2360 {
2361 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pA+offA);
2362 return;
2363 }
2364 }
2365 if(mna==1)
2366 {
2367 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pA+offA);
2368 pA += 4*sda;
2369 ii += 1;
2370 }
2371 else if(mna==2)
2372 {
2373 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pA+offA);
2374 pA += 4*sda;
2375 ii += 2;
2376 }
2377 else // if(mna==3)
2378 {
2379 kernel_sgecpsc_3_0_lib4(n, &alpha, pA+offA, pA+offA);
2380 pA += 4*sda;
2381 ii += 3;
2382 }
2383 }
2384 // main loop
2385 for(; ii<m-3; ii+=4)
2386 {
2387 kernel_sgecpsc_4_0_lib4(n, &alpha, pA, pA);
2388 pA += 4*sda;
2389 }
2390 // clean up at the end
2391 if(ii<m)
2392 {
2393 if(m-ii==1)
2394 kernel_sgecpsc_1_0_lib4(n, &alpha, pA, pA);
2395 else if(m-ii==2)
2396 kernel_sgecpsc_2_0_lib4(n, &alpha, pA, pA);
2397 else // if(m-ii==3)
2398 kernel_sgecpsc_3_0_lib4(n, &alpha, pA, pA);
2399 }
2400
2401 return;
2402
2403 }
2404
2405
2406
2407 // copy and scale a generic strmat into a generic strmat
blasfeo_sgecpsc(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)2408 void blasfeo_sgecpsc(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
2409 {
2410
2411 if(m<=0 | n<=0)
2412 return;
2413
2414 // invalidate stored inverse diagonal
2415 sB->use_dA = 0;
2416
2417 #if defined(DIM_CHECK)
2418 // non-negative size
2419 if(m<0) printf("\n****** blasfeo_sgecpsc : m<0 : %d<0 *****\n", m);
2420 if(n<0) printf("\n****** blasfeo_sgecpsc : n<0 : %d<0 *****\n", n);
2421 // non-negative offset
2422 if(ai<0) printf("\n****** blasfeo_sgecpsc : ai<0 : %d<0 *****\n", ai);
2423 if(aj<0) printf("\n****** blasfeo_sgecpsc : aj<0 : %d<0 *****\n", aj);
2424 if(bi<0) printf("\n****** blasfeo_sgecpsc : bi<0 : %d<0 *****\n", bi);
2425 if(bj<0) printf("\n****** blasfeo_sgecpsc : bj<0 : %d<0 *****\n", bj);
2426 // inside matrix
2427 // A: m x n
2428 if(ai+m > sA->m) printf("\n***** blasfeo_sgecpsc : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
2429 if(aj+n > sA->n) printf("\n***** blasfeo_sgecpsc : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
2430 // B: m x n
2431 if(bi+m > sB->m) printf("\n***** blasfeo_sgecpsc : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
2432 if(bj+n > sB->n) printf("\n***** blasfeo_sgecpsc : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
2433 #endif
2434
2435 const int bs = 4;
2436
2437 int mna, ii;
2438
2439 int sda = sA->cn;
2440 int sdb = sB->cn;
2441 float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
2442 float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
2443 int offA = ai%bs;
2444 int offB = bi%bs;
2445
2446 // same alignment
2447 if(offA==offB)
2448 {
2449 ii = 0;
2450 // clean up at the beginning
2451 mna = (4-offB)%bs;
2452 if(mna>0)
2453 {
2454 if(m<mna) // mna<=3 ==> m = { 1, 2 }
2455 {
2456 if(m==1)
2457 {
2458 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2459 return;
2460 }
2461 else //if(m==2 && mna==3)
2462 {
2463 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2464 return;
2465 }
2466 }
2467 if(mna==1)
2468 {
2469 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2470 pA += 4*sda;
2471 pB += 4*sdb;
2472 ii += 1;
2473 }
2474 else if(mna==2)
2475 {
2476 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2477 pA += 4*sda;
2478 pB += 4*sdb;
2479 ii += 2;
2480 }
2481 else // if(mna==3)
2482 {
2483 kernel_sgecpsc_3_0_lib4(n, &alpha, pA+offA, pB+offB);
2484 pA += 4*sda;
2485 pB += 4*sdb;
2486 ii += 3;
2487 }
2488 }
2489 // main loop
2490 for(; ii<m-3; ii+=4)
2491 {
2492 kernel_sgecpsc_4_0_lib4(n, &alpha, pA, pB);
2493 pA += 4*sda;
2494 pB += 4*sdb;
2495 }
2496 // clean up at the end
2497 if(ii<m)
2498 {
2499 if(m-ii==1)
2500 kernel_sgecpsc_1_0_lib4(n, &alpha, pA, pB);
2501 else if(m-ii==2)
2502 kernel_sgecpsc_2_0_lib4(n, &alpha, pA, pB);
2503 else // if(m-ii==3)
2504 kernel_sgecpsc_3_0_lib4(n, &alpha, pA, pB);
2505 }
2506 }
2507 // skip one element of pA
2508 else if(offA==(offB+1)%bs)
2509 {
2510 ii = 0;
2511 // clean up at the beginning
2512 mna = (4-offB)%bs;
2513 if(mna>0)
2514 {
2515 if(m<mna) // mna<=3 ==> m = { 1, 2 }
2516 {
2517 if(m==1)
2518 {
2519 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2520 return;
2521 }
2522 else //if(m==2 && mna==3)
2523 {
2524 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2525 return;
2526 }
2527 }
2528 if(mna==1)
2529 {
2530 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2531 //pA += 4*sda;
2532 pB += 4*sdb;
2533 ii += 1;
2534 }
2535 else if(mna==2)
2536 {
2537 kernel_sgecpsc_2_3_lib4(n, &alpha, pA, sda, pB+2);
2538 pA += 4*sda;
2539 pB += 4*sdb;
2540 ii += 2;
2541 }
2542 else // if(mna==3)
2543 {
2544 kernel_sgecpsc_3_2_lib4(n, &alpha, pA, sda, pB+1);
2545 pA += 4*sda;
2546 pB += 4*sdb;
2547 ii += 3;
2548 }
2549 }
2550 // main loop
2551 for( ; ii<m-3; ii+=4)
2552 {
2553 kernel_sgecpsc_4_1_lib4(n, &alpha, pA, sda, pB);
2554 pA += 4*sda;
2555 pB += 4*sdb;
2556 }
2557 // clean up at the end
2558 if(ii<m)
2559 {
2560 if(m-ii==1)
2561 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+1, pB);
2562 else if(m-ii==2)
2563 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+1, pB);
2564 else // if(m-ii==3)
2565 kernel_sgecpsc_3_0_lib4(n, &alpha, pA+1, pB);
2566 }
2567 }
2568 // skip 2 elements of pA
2569 else if(offA==(offB+2)%bs)
2570 {
2571 ii = 0;
2572 // clean up at the beginning
2573 mna = (4-offB)%bs;
2574 if(mna>0)
2575 {
2576 if(m<mna)
2577 {
2578 if(m==1)
2579 {
2580 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2581 return;
2582 }
2583 else // if(m==2 && mna==3)
2584 {
2585 kernel_sgecpsc_2_3_lib4(n, &alpha, pA, sda, pB+1);
2586 return;
2587 }
2588 }
2589 if(mna==1)
2590 {
2591 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+1, pB+3);
2592 // pA += 4*sda;
2593 pB += 4*sdb;
2594 ii += 1;
2595 }
2596 else if(mna==2)
2597 {
2598 kernel_sgecpsc_2_0_lib4(n, &alpha, pA, pB+2);
2599 // pA += 4*sda;
2600 pB += 4*sdb;
2601 ii += 2;
2602 }
2603 else // if(mna==3)
2604 {
2605 kernel_sgecpsc_3_3_lib4(n, &alpha, pA, sda, pB+1);
2606 pA += 4*sda;
2607 pB += 4*sdb;
2608 ii += 3;
2609 }
2610 }
2611 // main loop
2612 for(; ii<m-3; ii+=4)
2613 {
2614 kernel_sgecpsc_4_2_lib4(n, &alpha, pA, sda, pB);
2615 pA += 4*sda;
2616 pB += 4*sdb;
2617 }
2618 // clean up at the end
2619 if(ii<m)
2620 {
2621 if(m-ii==1)
2622 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+2, pB);
2623 else if(m-ii==2)
2624 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+2, pB);
2625 else // if(m-ii==3)
2626 kernel_sgecpsc_3_2_lib4(n, &alpha, pA, sda, pB);
2627 }
2628 }
2629 // skip 3 elements of pA
2630 else // if(offA==(offB+3)%bs)
2631 {
2632 ii = 0;
2633 // clean up at the beginning
2634 mna = (4-offB)%bs;
2635
2636 if(mna>0)
2637 {
2638
2639 if(m<mna)
2640 {
2641 if(m==1)
2642 {
2643 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2644 return;
2645 }
2646 else // if(m==2 && mna==3)
2647 {
2648 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2649 return;
2650 }
2651 }
2652
2653 if(mna==1)
2654 {
2655 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2656 // pA += 4*sda;
2657 pB += 4*sdb;
2658 ii += 1;
2659 }
2660 else if(mna==2)
2661 {
2662 kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2663 // pA += 4*sda;
2664 pB += 4*sdb;
2665 ii += 2;
2666 }
2667 else // if(mna==3)
2668 {
2669 kernel_sgecpsc_3_0_lib4(n, &alpha, pA+offA, pB+offB);
2670 // pA += 4*sda;
2671 pB += 4*sdb;
2672 ii += 3;
2673 }
2674
2675 }
2676
2677 // main loop
2678
2679 for(; ii<m-3; ii+=4)
2680 {
2681 kernel_sgecpsc_4_3_lib4(n, &alpha, pA, sda, pB);
2682 pA += 4*sda;
2683 pB += 4*sdb;
2684 }
2685
2686 // clean up at the end
2687 if(ii<m)
2688 {
2689 if(m-ii==1)
2690 kernel_sgecpsc_1_0_lib4(n, &alpha, pA+3, pB);
2691 else if(m-ii==2)
2692 kernel_sgecpsc_2_3_lib4(n, &alpha, pA, sda, pB);
2693 else // if(m-ii==3)
2694 kernel_sgecpsc_3_3_lib4(n, &alpha, pA, sda, pB);
2695 }
2696 }
2697
2698 return;
2699
2700 }
2701
2702
2703
2704 // copy a generic strmat into a generic strmat
blasfeo_sgecp(int m,int n,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)2705 void blasfeo_sgecp(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
2706 {
2707
2708 if(m<=0 | n<=0)
2709 return;
2710
2711 // invalidate stored inverse diagonal
2712 sB->use_dA = 0;
2713
2714 #if defined(DIM_CHECK)
2715 // non-negative size
2716 if(m<0) printf("\n****** blasfeo_sgecp : m<0 : %d<0 *****\n", m);
2717 if(n<0) printf("\n****** blasfeo_sgecp : n<0 : %d<0 *****\n", n);
2718 // non-negative offset
2719 if(ai<0) printf("\n****** blasfeo_sgecp : ai<0 : %d<0 *****\n", ai);
2720 if(aj<0) printf("\n****** blasfeo_sgecp : aj<0 : %d<0 *****\n", aj);
2721 if(bi<0) printf("\n****** blasfeo_sgecp : bi<0 : %d<0 *****\n", bi);
2722 if(bj<0) printf("\n****** blasfeo_sgecp : bj<0 : %d<0 *****\n", bj);
2723 // inside matrix
2724 // A: m x n
2725 if(ai+m > sA->m) printf("\n***** blasfeo_sgecp : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
2726 if(aj+n > sA->n) printf("\n***** blasfeo_sgecp : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
2727 // B: m x n
2728 if(bi+m > sB->m) printf("\n***** blasfeo_sgecp : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
2729 if(bj+n > sB->n) printf("\n***** blasfeo_sgecp : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
2730 #endif
2731
2732 const int bs = 4;
2733
2734 int mna, ii;
2735
2736 int sda = sA->cn;
2737 int sdb = sB->cn;
2738 float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
2739 float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
2740 int offA = ai%bs;
2741 int offB = bi%bs;
2742
2743 // same alignment
2744 if(offA==offB)
2745 {
2746 ii = 0;
2747 // clean up at the beginning
2748 mna = (4-offB)%bs;
2749 if(mna>0)
2750 {
2751 if(m<mna) // mna<=3 ==> m = { 1, 2 }
2752 {
2753 if(m==1)
2754 {
2755 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2756 return;
2757 }
2758 else //if(m==2 && mna==3)
2759 {
2760 kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2761 return;
2762 }
2763 }
2764 if(mna==1)
2765 {
2766 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2767 pA += 4*sda;
2768 pB += 4*sdb;
2769 ii += 1;
2770 }
2771 else if(mna==2)
2772 {
2773 kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2774 pA += 4*sda;
2775 pB += 4*sdb;
2776 ii += 2;
2777 }
2778 else // if(mna==3)
2779 {
2780 kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
2781 pA += 4*sda;
2782 pB += 4*sdb;
2783 ii += 3;
2784 }
2785 }
2786 // main loop
2787 for(; ii<m-3; ii+=4)
2788 {
2789 kernel_sgecp_4_0_lib4(n, pA, pB);
2790 pA += 4*sda;
2791 pB += 4*sdb;
2792 }
2793 // clean up at the end
2794 if(ii<m)
2795 {
2796 if(m-ii==1)
2797 kernel_sgecp_1_0_lib4(n, pA, pB);
2798 else if(m-ii==2)
2799 kernel_sgecp_2_0_lib4(n, pA, pB);
2800 else // if(m-ii==3)
2801 kernel_sgecp_3_0_lib4(n, pA, pB);
2802 }
2803 }
2804 // skip one element of pA
2805 else if(offA==(offB+1)%bs)
2806 {
2807 ii = 0;
2808 // clean up at the beginning
2809 mna = (4-offB)%bs;
2810 if(mna>0)
2811 {
2812 if(m<mna) // mna<=3 ==> m = { 1, 2 }
2813 {
2814 if(m==1)
2815 {
2816 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2817 return;
2818 }
2819 else //if(m==2 && mna==3)
2820 {
2821 kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2822 return;
2823 }
2824 }
2825 if(mna==1)
2826 {
2827 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2828 //pA += 4*sda;
2829 pB += 4*sdb;
2830 ii += 1;
2831 }
2832 else if(mna==2)
2833 {
2834 kernel_sgecp_2_3_lib4(n, pA, sda, pB+2);
2835 pA += 4*sda;
2836 pB += 4*sdb;
2837 ii += 2;
2838 }
2839 else // if(mna==3)
2840 {
2841 kernel_sgecp_3_2_lib4(n, pA, sda, pB+1);
2842 pA += 4*sda;
2843 pB += 4*sdb;
2844 ii += 3;
2845 }
2846 }
2847 // main loop
2848 for( ; ii<m-3; ii+=4)
2849 {
2850 kernel_sgecp_4_1_lib4(n, pA, sda, pB);
2851 pA += 4*sda;
2852 pB += 4*sdb;
2853 }
2854 // clean up at the end
2855 if(ii<m)
2856 {
2857 if(m-ii==1)
2858 kernel_sgecp_1_0_lib4(n, pA+1, pB);
2859 else if(m-ii==2)
2860 kernel_sgecp_2_0_lib4(n, pA+1, pB);
2861 else // if(m-ii==3)
2862 kernel_sgecp_3_0_lib4(n, pA+1, pB);
2863 }
2864 }
2865 // skip 2 elements of pA
2866 else if(offA==(offB+2)%bs)
2867 {
2868 ii = 0;
2869 // clean up at the beginning
2870 mna = (4-offB)%bs;
2871 if(mna>0)
2872 {
2873 if(m<mna)
2874 {
2875 if(m==1)
2876 {
2877 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2878 return;
2879 }
2880 else // if(m==2 && mna==3)
2881 {
2882 kernel_sgecp_2_3_lib4(n, pA, sda, pB+1);
2883 return;
2884 }
2885 }
2886 if(mna==1)
2887 {
2888 kernel_sgecp_1_0_lib4(n, pA+1, pB+3);
2889 // pA += 4*sda;
2890 pB += 4*sdb;
2891 ii += 1;
2892 }
2893 else if(mna==2)
2894 {
2895 kernel_sgecp_2_0_lib4(n, pA, pB+2);
2896 // pA += 4*sda;
2897 pB += 4*sdb;
2898 ii += 2;
2899 }
2900 else // if(mna==3)
2901 {
2902 kernel_sgecp_3_3_lib4(n, pA, sda, pB+1);
2903 pA += 4*sda;
2904 pB += 4*sdb;
2905 ii += 3;
2906 }
2907 }
2908 // main loop
2909 for(; ii<m-3; ii+=4)
2910 {
2911 kernel_sgecp_4_2_lib4(n, pA, sda, pB);
2912 pA += 4*sda;
2913 pB += 4*sdb;
2914 }
2915 // clean up at the end
2916 if(ii<m)
2917 {
2918 if(m-ii==1)
2919 kernel_sgecp_1_0_lib4(n, pA+2, pB);
2920 else if(m-ii==2)
2921 kernel_sgecp_2_0_lib4(n, pA+2, pB);
2922 else // if(m-ii==3)
2923 kernel_sgecp_3_2_lib4(n, pA, sda, pB);
2924 }
2925 }
2926 // skip 3 elements of pA
2927 else // if(offA==(offB+3)%bs)
2928 {
2929 ii = 0;
2930 // clean up at the beginning
2931 mna = (4-offB)%bs;
2932 if(mna>0)
2933 {
2934 if(m<mna)
2935 {
2936 if(m==1)
2937 {
2938 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2939 return;
2940 }
2941 else // if(m==2 && mna==3)
2942 {
2943 kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2944 return;
2945 }
2946 }
2947 if(mna==1)
2948 {
2949 kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2950 // pA += 4*sda;
2951 pB += 4*sdb;
2952 ii += 1;
2953 }
2954 else if(mna==2)
2955 {
2956 kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2957 // pA += 4*sda;
2958 pB += 4*sdb;
2959 ii += 2;
2960 }
2961 else // if(mna==3)
2962 {
2963 kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
2964 // pA += 4*sda;
2965 pB += 4*sdb;
2966 ii += 3;
2967 }
2968 }
2969 // main loop
2970 for(; ii<m-3; ii+=4)
2971 {
2972 kernel_sgecp_4_3_lib4(n, pA, sda, pB);
2973 pA += 4*sda;
2974 pB += 4*sdb;
2975 }
2976 // clean up at the end
2977 if(ii<m)
2978 {
2979 if(m-ii==1)
2980 kernel_sgecp_1_0_lib4(n, pA+3, pB);
2981 else if(m-ii==2)
2982 kernel_sgecp_2_3_lib4(n, pA, sda, pB);
2983 else // if(m-ii==3)
2984 kernel_sgecp_3_3_lib4(n, pA, sda, pB);
2985 }
2986 }
2987
2988 return;
2989
2990 }
2991
2992
2993
2994 // scale a strvec
blasfeo_svecsc(int m,float alpha,struct blasfeo_svec * sa,int ai)2995 void blasfeo_svecsc(int m, float alpha, struct blasfeo_svec *sa, int ai)
2996 {
2997 float *pa = sa->pa + ai;
2998 int ii;
2999 ii = 0;
3000 for(; ii<m-3; ii+=4)
3001 {
3002 pa[ii+0] *= alpha;
3003 pa[ii+1] *= alpha;
3004 pa[ii+2] *= alpha;
3005 pa[ii+3] *= alpha;
3006 }
3007 for(; ii<m; ii++)
3008 {
3009 pa[ii+0] *= alpha;
3010 }
3011 return;
3012 }
3013
3014
3015
3016 // copy a strvec into a strvec
blasfeo_sveccp(int m,struct blasfeo_svec * sa,int ai,struct blasfeo_svec * sc,int ci)3017 void blasfeo_sveccp(int m, struct blasfeo_svec *sa, int ai, struct blasfeo_svec *sc, int ci)
3018 {
3019 float *pa = sa->pa + ai;
3020 float *pc = sc->pa + ci;
3021 int ii;
3022 ii = 0;
3023 for(; ii<m-3; ii+=4)
3024 {
3025 pc[ii+0] = pa[ii+0];
3026 pc[ii+1] = pa[ii+1];
3027 pc[ii+2] = pa[ii+2];
3028 pc[ii+3] = pa[ii+3];
3029 }
3030 for(; ii<m; ii++)
3031 {
3032 pc[ii+0] = pa[ii+0];
3033 }
3034 return;
3035 }
3036
3037
3038
3039 // copy and scale a strvec into a strvec
blasfeo_sveccpsc(int m,float alpha,struct blasfeo_svec * sa,int ai,struct blasfeo_svec * sc,int ci)3040 void blasfeo_sveccpsc(int m, float alpha, struct blasfeo_svec *sa, int ai, struct blasfeo_svec *sc, int ci)
3041 {
3042 float *pa = sa->pa + ai;
3043 float *pc = sc->pa + ci;
3044 int ii;
3045 ii = 0;
3046 for(; ii<m-3; ii+=4)
3047 {
3048 pc[ii+0] = alpha*pa[ii+0];
3049 pc[ii+1] = alpha*pa[ii+1];
3050 pc[ii+2] = alpha*pa[ii+2];
3051 pc[ii+3] = alpha*pa[ii+3];
3052 }
3053 for(; ii<m; ii++)
3054 {
3055 pc[ii+0] = alpha*pa[ii+0];
3056 }
3057 return;
3058 }
3059
3060
3061
3062 // copy a lower triangular strmat into a lower triangular strmat
blasfeo_strcp_l(int m,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)3063 void blasfeo_strcp_l(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
3064 {
3065
3066 if(m<=0)
3067 return;
3068
3069 // invalidate stored inverse diagonal
3070 sB->use_dA = 0;
3071
3072 const int bs = 4;
3073
3074 int sda = sA->cn;
3075 int sdb = sB->cn;
3076 float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
3077 float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
3078 int offA = ai%bs;
3079 int offB = bi%bs;
3080
3081 int ii, mna;
3082
3083 // same alignment
3084 if(offA==offB)
3085 {
3086 ii = 0;
3087 // clean up at the beginning
3088 mna = (4-offB)%bs;
3089 if(mna>0)
3090 {
3091 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3092 {
3093 if(m==1)
3094 {
3095 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3096 return;
3097 }
3098 else //if(m==2 && mna==3)
3099 {
3100 kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3101 return;
3102 }
3103 }
3104 if(mna==1)
3105 {
3106 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3107 pA += 4*sda;
3108 pB += 4*sdb;
3109 ii += 1;
3110 }
3111 else if(mna==2)
3112 {
3113 kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3114 pA += 4*sda;
3115 pB += 4*sdb;
3116 ii += 2;
3117 }
3118 else // if(mna==3)
3119 {
3120 kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
3121 pA += 4*sda;
3122 pB += 4*sdb;
3123 ii += 3;
3124 }
3125 }
3126 // main loop
3127 for(; ii<m-3; ii+=4)
3128 {
3129 kernel_strcp_l_4_0_lib4(ii, pA, pB);
3130 pA += 4*sda;
3131 pB += 4*sdb;
3132 }
3133 // clean up at the end
3134 if(ii<m)
3135 {
3136 if(m-ii==1)
3137 kernel_strcp_l_1_0_lib4(ii, pA, pB);
3138 else if(m-ii==2)
3139 kernel_strcp_l_2_0_lib4(ii, pA, pB);
3140 else // if(m-ii==3)
3141 kernel_strcp_l_3_0_lib4(ii, pA, pB);
3142 }
3143 }
3144 // skip one element of pA
3145 else if(offA==(offB+1)%bs)
3146 {
3147 ii = 0;
3148 // clean up at the beginning
3149 mna = (4-offB)%bs;
3150 if(mna>0)
3151 {
3152 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3153 {
3154 if(m==1)
3155 {
3156 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3157 return;
3158 }
3159 else //if(m==2 && mna==3)
3160 {
3161 kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3162 return;
3163 }
3164 }
3165 if(mna==1)
3166 {
3167 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3168 //pA += 4*sda;
3169 pB += 4*sdb;
3170 ii += 1;
3171 }
3172 else if(mna==2)
3173 {
3174 kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+2);
3175 pA += 4*sda;
3176 pB += 4*sdb;
3177 ii += 2;
3178 }
3179 else // if(mna==3)
3180 {
3181 kernel_strcp_l_3_2_lib4(ii, pA, sda, pB+1);
3182 pA += 4*sda;
3183 pB += 4*sdb;
3184 ii += 3;
3185 }
3186 }
3187 // main loop
3188 for( ; ii<m-3; ii+=4)
3189 {
3190 kernel_strcp_l_4_1_lib4(ii, pA, sda, pB);
3191 pA += 4*sda;
3192 pB += 4*sdb;
3193 }
3194 // clean up at the end
3195 if(ii<m)
3196 {
3197 if(m-ii==1)
3198 kernel_strcp_l_1_0_lib4(ii, pA+1, pB);
3199 else if(m-ii==2)
3200 kernel_strcp_l_2_0_lib4(ii, pA+1, pB);
3201 else // if(m-ii==3)
3202 kernel_strcp_l_3_0_lib4(ii, pA+1, pB);
3203 }
3204 }
3205 // skip 2 elements of pA
3206 else if(offA==(offB+2)%bs)
3207 {
3208 ii = 0;
3209 // clean up at the beginning
3210 mna = (4-offB)%bs;
3211 if(mna>0)
3212 {
3213 if(m<mna)
3214 {
3215 if(m==1)
3216 {
3217 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3218 return;
3219 }
3220 else // if(m==2 && mna==3)
3221 {
3222 kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+1);
3223 return;
3224 }
3225 }
3226 if(mna==1)
3227 {
3228 kernel_strcp_l_1_0_lib4(ii, pA+1, pB+3);
3229 // pA += 4*sda;
3230 pB += 4*sdb;
3231 ii += 1;
3232 }
3233 else if(mna==2)
3234 {
3235 kernel_strcp_l_2_0_lib4(ii, pA, pB+2);
3236 // pA += 4*sda;
3237 pB += 4*sdb;
3238 ii += 2;
3239 }
3240 else // if(mna==3)
3241 {
3242 kernel_strcp_l_3_3_lib4(ii, pA, sda, pB+1);
3243 pA += 4*sda;
3244 pB += 4*sdb;
3245 ii += 3;
3246 }
3247 }
3248 // main loop
3249 for(; ii<m-3; ii+=4)
3250 {
3251 kernel_strcp_l_4_2_lib4(ii, pA, sda, pB);
3252 pA += 4*sda;
3253 pB += 4*sdb;
3254 }
3255 // clean up at the end
3256 if(ii<m)
3257 {
3258 if(m-ii==1)
3259 kernel_strcp_l_1_0_lib4(ii, pA+2, pB);
3260 else if(m-ii==2)
3261 kernel_strcp_l_2_0_lib4(ii, pA+2, pB);
3262 else // if(m-ii==3)
3263 kernel_strcp_l_3_2_lib4(ii, pA, sda, pB);
3264 }
3265 }
3266 // skip 3 elements of pA
3267 else // if(offA==(offB+3)%bs)
3268 {
3269 ii = 0;
3270 // clean up at the beginning
3271 mna = (4-offB)%bs;
3272 if(mna>0)
3273 {
3274 if(m<mna)
3275 {
3276 if(m==1)
3277 {
3278 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3279 return;
3280 }
3281 else // if(m==2 && mna==3)
3282 {
3283 kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3284 return;
3285 }
3286 }
3287 if(mna==1)
3288 {
3289 kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3290 // pA += 4*sda;
3291 pB += 4*sdb;
3292 ii += 1;
3293 }
3294 else if(mna==2)
3295 {
3296 kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3297 // pA += 4*sda;
3298 pB += 4*sdb;
3299 ii += 2;
3300 }
3301 else // if(mna==3)
3302 {
3303 kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
3304 // pA += 4*sda;
3305 pB += 4*sdb;
3306 ii += 3;
3307 }
3308 }
3309 // main loop
3310 for(; ii<m-3; ii+=4)
3311 {
3312 kernel_strcp_l_4_3_lib4(ii, pA, sda, pB);
3313 pA += 4*sda;
3314 pB += 4*sdb;
3315 }
3316 // clean up at the end
3317 if(ii<m)
3318 {
3319 if(m-ii==1)
3320 kernel_strcp_l_1_0_lib4(ii, pA+3, pB);
3321 else if(m-ii==2)
3322 kernel_strcp_l_2_3_lib4(ii, pA, sda, pB);
3323 else // if(m-ii==3)
3324 kernel_strcp_l_3_3_lib4(ii, pA, sda, pB);
3325 }
3326 }
3327
3328 return;
3329
3330 }
3331
3332
3333
3334 // scale and add a generic strmat into a generic strmat
blasfeo_sgead(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)3335 void blasfeo_sgead(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
3336 {
3337
3338 if(m<=0 || n<=0)
3339 return;
3340
3341 // invalidate stored inverse diagonal
3342 sB->use_dA = 0;
3343
3344 const int bs = 4;
3345
3346 int sda = sA->cn;
3347 int sdb = sB->cn;
3348 float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
3349 float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
3350 int offA = ai%bs;
3351 int offB = bi%bs;
3352
3353 int ii, mna;
3354
3355 // same alignment
3356 if(offA==offB)
3357 {
3358 ii = 0;
3359 // clean up at the beginning
3360 mna = (4-offB)%bs;
3361 if(mna>0)
3362 {
3363 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3364 {
3365 if(m==1)
3366 {
3367 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3368 return;
3369 }
3370 else //if(m==2 && mna==3)
3371 {
3372 kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3373 return;
3374 }
3375 }
3376 if(mna==1)
3377 {
3378 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3379 pA += 4*sda;
3380 pB += 4*sdb;
3381 ii += 1;
3382 }
3383 else if(mna==2)
3384 {
3385 kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3386 pA += 4*sda;
3387 pB += 4*sdb;
3388 ii += 2;
3389 }
3390 else // if(mna==3)
3391 {
3392 kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
3393 pA += 4*sda;
3394 pB += 4*sdb;
3395 ii += 3;
3396 }
3397 }
3398 // main loop
3399 for(; ii<m-3; ii+=4)
3400 {
3401 kernel_sgead_4_0_lib4(n, &alpha, pA, pB);
3402 pA += 4*sda;
3403 pB += 4*sdb;
3404 }
3405 // clean up at the end
3406 if(ii<m)
3407 {
3408 if(m-ii==1)
3409 kernel_sgead_1_0_lib4(n, &alpha, pA, pB);
3410 else if(m-ii==2)
3411 kernel_sgead_2_0_lib4(n, &alpha, pA, pB);
3412 else // if(m-ii==3)
3413 kernel_sgead_3_0_lib4(n, &alpha, pA, pB);
3414 }
3415 }
3416 // skip one element of pA
3417 else if(offA==(offB+1)%bs)
3418 {
3419 ii = 0;
3420 // clean up at the beginning
3421 mna = (4-offB)%bs;
3422 if(mna>0)
3423 {
3424 if(m<mna) // mna<=3 ==> m = { 1, 2 }
3425 {
3426 if(m==1)
3427 {
3428 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3429 return;
3430 }
3431 else //if(m==2 && mna==3)
3432 {
3433 kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3434 return;
3435 }
3436 }
3437 if(mna==1)
3438 {
3439 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3440 //pA += 4*sda;
3441 pB += 4*sdb;
3442 ii += 1;
3443 }
3444 else if(mna==2)
3445 {
3446 kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+2);
3447 pA += 4*sda;
3448 pB += 4*sdb;
3449 ii += 2;
3450 }
3451 else // if(mna==3)
3452 {
3453 kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB+1);
3454 pA += 4*sda;
3455 pB += 4*sdb;
3456 ii += 3;
3457 }
3458 }
3459 // main loop
3460 for( ; ii<m-3; ii+=4)
3461 {
3462 kernel_sgead_4_1_lib4(n, &alpha, pA, sda, pB);
3463 pA += 4*sda;
3464 pB += 4*sdb;
3465 }
3466 // clean up at the end
3467 if(ii<m)
3468 {
3469 if(m-ii==1)
3470 kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB);
3471 else if(m-ii==2)
3472 kernel_sgead_2_0_lib4(n, &alpha, pA+1, pB);
3473 else // if(m-ii==3)
3474 kernel_sgead_3_0_lib4(n, &alpha, pA+1, pB);
3475 }
3476 }
3477 // skip 2 elements of pA
3478 else if(offA==(offB+2)%bs)
3479 {
3480 ii = 0;
3481 // clean up at the beginning
3482 mna = (4-offB)%bs;
3483 if(mna>0)
3484 {
3485 if(m<mna)
3486 {
3487 if(m==1)
3488 {
3489 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3490 return;
3491 }
3492 else // if(m==2 && mna==3)
3493 {
3494 kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+1);
3495 return;
3496 }
3497 }
3498 if(mna==1)
3499 {
3500 kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB+3);
3501 // pA += 4*sda;
3502 pB += 4*sdb;
3503 ii += 1;
3504 }
3505 else if(mna==2)
3506 {
3507 kernel_sgead_2_0_lib4(n, &alpha, pA, pB+2);
3508 // pA += 4*sda;
3509 pB += 4*sdb;
3510 ii += 2;
3511 }
3512 else // if(mna==3)
3513 {
3514 kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB+1);
3515 pA += 4*sda;
3516 pB += 4*sdb;
3517 ii += 3;
3518 }
3519 }
3520 // main loop
3521 for(; ii<m-3; ii+=4)
3522 {
3523 kernel_sgead_4_2_lib4(n, &alpha, pA, sda, pB);
3524 pA += 4*sda;
3525 pB += 4*sdb;
3526 }
3527 // clean up at the end
3528 if(ii<m)
3529 {
3530 if(m-ii==1)
3531 kernel_sgead_1_0_lib4(n, &alpha, pA+2, pB);
3532 else if(m-ii==2)
3533 kernel_sgead_2_0_lib4(n, &alpha, pA+2, pB);
3534 else // if(m-ii==3)
3535 kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB);
3536 }
3537 }
3538 // skip 3 elements of pA
3539 else // if(offA==(offB+3)%bs)
3540 {
3541 ii = 0;
3542 // clean up at the beginning
3543 mna = (4-offB)%bs;
3544 if(mna>0)
3545 {
3546 if(m<mna)
3547 {
3548 if(m==1)
3549 {
3550 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3551 return;
3552 }
3553 else // if(m==2 && mna==3)
3554 {
3555 kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3556 return;
3557 }
3558 }
3559 if(mna==1)
3560 {
3561 kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3562 // pA += 4*sda;
3563 pB += 4*sdb;
3564 ii += 1;
3565 }
3566 else if(mna==2)
3567 {
3568 kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3569 // pA += 4*sda;
3570 pB += 4*sdb;
3571 ii += 2;
3572 }
3573 else // if(mna==3)
3574 {
3575 kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
3576 // pA += 4*sda;
3577 pB += 4*sdb;
3578 ii += 3;
3579 }
3580 }
3581 // main loop
3582 for(; ii<m-3; ii+=4)
3583 {
3584 kernel_sgead_4_3_lib4(n, &alpha, pA, sda, pB);
3585 pA += 4*sda;
3586 pB += 4*sdb;
3587 }
3588 // clean up at the end
3589 if(ii<m)
3590 {
3591 if(m-ii==1)
3592 kernel_sgead_1_0_lib4(n, &alpha, pA+3, pB);
3593 else if(m-ii==2)
3594 kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB);
3595 else // if(m-ii==3)
3596 kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB);
3597 }
3598 }
3599
3600 return;
3601
3602 }
3603
3604
3605
3606 // copy and transpose a generic strmat into a generic strmat
blasfeo_sgetr(int m,int n,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)3607 void blasfeo_sgetr(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
3608 {
3609 if(m<=0 || n<=0)
3610 return;
3611
3612 // invalidate stored inverse diagonal
3613 sC->use_dA = 0;
3614
3615 const int bs = 4;
3616 int sda = sA->cn;
3617 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3618 int sdc = sC->cn;
3619 float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
3620 sgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
3621 return;
3622 }
3623
3624
3625
3626 // copy and transpose a lower triangular strmat into an upper triangular strmat
blasfeo_strtr_l(int m,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)3627 void blasfeo_strtr_l(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
3628 {
3629 if(m<=0)
3630 return;
3631
3632 // invalidate stored inverse diagonal
3633 sC->use_dA = 0;
3634
3635 const int bs = 4;
3636 int sda = sA->cn;
3637 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3638 int sdc = sC->cn;
3639 float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
3640 strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
3641 return;
3642 }
3643
3644
3645
3646 // copy and transpose an upper triangular strmat into a lower triangular strmat
blasfeo_strtr_u(int m,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)3647 void blasfeo_strtr_u(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
3648 {
3649 if(m<=0)
3650 return;
3651
3652 // invalidate stored inverse diagonal
3653 sC->use_dA = 0;
3654
3655 const int bs = 4;
3656 int sda = sA->cn;
3657 float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3658 int sdc = sC->cn;
3659 float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
3660 strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
3661 return;
3662 }
3663
3664
3665
3666 // insert a strvec to diagonal of strmat, sparse formulation
blasfeo_sdiain_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_smat * sD,int di,int dj)3667 void blasfeo_sdiain_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3668 {
3669 if(kmax<=0)
3670 return;
3671
3672 // invalidate stored inverse diagonal
3673 sD->use_dA = 0;
3674
3675 const int bs = 4;
3676 float *x = sx->pa + xi;
3677 int sdd = sD->cn;
3678 float *pD = sD->pA;
3679 int ii, jj;
3680 for(jj=0; jj<kmax; jj++)
3681 {
3682 ii = idx[jj];
3683 pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
3684 }
3685 return;
3686 }
3687
3688
3689
3690 // extract the diagonal of a strmat to a strvec, sparse formulation
blasfeo_sdiaex_sp(int kmax,float alpha,int * idx,struct blasfeo_smat * sD,int di,int dj,struct blasfeo_svec * sx,int xi)3691 void blasfeo_sdiaex_sp(int kmax, float alpha, int *idx, struct blasfeo_smat *sD, int di, int dj, struct blasfeo_svec *sx, int xi)
3692 {
3693 const int bs = 4;
3694 float *x = sx->pa + xi;
3695 int sdd = sD->cn;
3696 float *pD = sD->pA;
3697 int ii, jj;
3698 for(jj=0; jj<kmax; jj++)
3699 {
3700 ii = idx[jj];
3701 x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
3702 }
3703 return;
3704 }
3705
3706
3707
3708 // add scaled strvec to diagonal of strmat, sparse formulation
blasfeo_sdiaad_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_smat * sD,int di,int dj)3709 void blasfeo_sdiaad_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3710 {
3711 if(kmax<=0)
3712 return;
3713
3714 // invalidate stored inverse diagonal
3715 sD->use_dA = 0;
3716
3717 const int bs = 4;
3718 float *x = sx->pa + xi;
3719 int sdd = sD->cn;
3720 float *pD = sD->pA;
3721 int ii, jj;
3722 for(jj=0; jj<kmax; jj++)
3723 {
3724 ii = idx[jj];
3725 pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
3726 }
3727 return;
3728 }
3729
3730
3731
3732 // add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
blasfeo_sdiaadin_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_svec * sy,int yi,int * idx,struct blasfeo_smat * sD,int di,int dj)3733 void blasfeo_sdiaadin_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sy, int yi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3734 {
3735 if(kmax<=0)
3736 return;
3737
3738 // invalidate stored inverse diagonal
3739 sD->use_dA = 0;
3740
3741 const int bs = 4;
3742 float *x = sx->pa + xi;
3743 float *y = sy->pa + yi;
3744 int sdd = sD->cn;
3745 float *pD = sD->pA;
3746 int ii, jj;
3747 for(jj=0; jj<kmax; jj++)
3748 {
3749 ii = idx[jj];
3750 pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
3751 }
3752 return;
3753 }
3754
3755
3756
3757 // add scaled strvec to row of strmat, sparse formulation
blasfeo_srowad_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_smat * sD,int di,int dj)3758 void blasfeo_srowad_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3759 {
3760 if(kmax<=0)
3761 return;
3762
3763 // invalidate stored inverse diagonal
3764 sD->use_dA = 0;
3765
3766 const int bs = 4;
3767 float *x = sx->pa + xi;
3768 int sdd = sD->cn;
3769 float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
3770 srowad_libsp(kmax, idx, alpha, x, pD);
3771 return;
3772 }
3773
3774
3775
3776 // adds strvec to strvec, sparse formulation
blasfeo_svecad_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_svec * sy,int yi)3777 void blasfeo_svecad_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_svec *sy, int yi)
3778 {
3779 float *x = sx->pa + xi;
3780 float *y = sy->pa + yi;
3781 svecad_libsp(kmax, idx, alpha, x, y);
3782 return;
3783 }
3784
3785
3786
blasfeo_svecin_sp(int m,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_svec * sz,int zi)3787 void blasfeo_svecin_sp(int m, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_svec *sz, int zi)
3788 {
3789 float *x = sx->pa + xi;
3790 float *z = sz->pa + zi;
3791 int ii;
3792 for(ii=0; ii<m; ii++)
3793 z[idx[ii]] = alpha * x[ii];
3794 return;
3795 }
3796
3797
3798
blasfeo_svecex_sp(int m,float alpha,int * idx,struct blasfeo_svec * sx,int xi,struct blasfeo_svec * sz,int zi)3799 void blasfeo_svecex_sp(int m, float alpha, int *idx, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi)
3800 {
3801 float *x = sx->pa + xi;
3802 float *z = sz->pa + zi;
3803 int ii;
3804 for(ii=0; ii<m; ii++)
3805 z[ii] = alpha * x[idx[ii]];
3806 return;
3807 }
3808
3809
3810
blasfeo_svecnrm_inf(int m,struct blasfeo_svec * sx,int xi,float * ptr_norm)3811 void blasfeo_svecnrm_inf(int m, struct blasfeo_svec *sx, int xi, float *ptr_norm)
3812 {
3813 int ii;
3814 float *x = sx->pa + xi;
3815 float norm = 0.0;
3816 float tmp;
3817 for(ii=0; ii<m; ii++)
3818 {
3819 #ifdef USE_C99_MATH
3820 norm = fmaxf(norm, fabsf(x[ii]));
3821 #else
3822 tmp = fabs(x[ii]);
3823 norm = tmp>norm ? tmp : norm;
3824 #endif
3825 }
3826 *ptr_norm = norm;
3827 return;
3828 }
3829
3830
3831 // permute elements of a vector struct
blasfeo_svecpe(int kmax,int * ipiv,struct blasfeo_svec * sx,int xi)3832 void blasfeo_svecpe(int kmax, int *ipiv, struct blasfeo_svec *sx, int xi)
3833 {
3834 int ii;
3835 float tmp;
3836 float *x = sx->pa + xi;
3837 for(ii=0; ii<kmax; ii++)
3838 {
3839 if(ipiv[ii]!=ii)
3840 {
3841 tmp = x[ipiv[ii]];
3842 x[ipiv[ii]] = x[ii];
3843 x[ii] = tmp;
3844 }
3845 }
3846 return;
3847 }
3848
3849
3850
3851 // inverse permute elements of a vector struct
blasfeo_svecpei(int kmax,int * ipiv,struct blasfeo_svec * sx,int xi)3852 void blasfeo_svecpei(int kmax, int *ipiv, struct blasfeo_svec *sx, int xi)
3853 {
3854 int ii;
3855 float tmp;
3856 float *x = sx->pa + xi;
3857 for(ii=kmax-1; ii>=0; ii--)
3858 {
3859 if(ipiv[ii]!=ii)
3860 {
3861 tmp = x[ipiv[ii]];
3862 x[ipiv[ii]] = x[ii];
3863 x[ii] = tmp;
3864 }
3865 }
3866 return;
3867 }
3868
3869
3870
3871 #else
3872
3873 #error : wrong LA choice
3874
3875 #endif
3876
3877
3878
3879