1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <math.h>
39 
40 #include "../include/blasfeo_common.h"
41 #include "../include/blasfeo_block_size.h"
42 #include "../include/blasfeo_s_kernel.h"
43 
44 
45 
46 // scales and adds a strvec into a strvec
blasfeo_svecad(int m,float * alphap,struct blasfeo_svec * sa,int ai,struct blasfeo_svec * sc,int ci)47 void blasfeo_svecad(int m, float *alphap, struct blasfeo_svec *sa, int ai, struct blasfeo_svec *sc, int ci)
48 	{
49 	float alpha = alphap[0];
50 	float *pa = sa->pa + ai;
51 	float *pc = sc->pa + ci;
52 	int ii;
53 	ii = 0;
54 	for(; ii<m-3; ii+=4)
55 		{
56 		pc[ii+0] += alpha*pa[ii+0];
57 		pc[ii+1] += alpha*pa[ii+1];
58 		pc[ii+2] += alpha*pa[ii+2];
59 		pc[ii+3] += alpha*pa[ii+3];
60 		}
61 	for(; ii<m; ii++)
62 		{
63 		pc[ii+0] += alpha*pa[ii+0];
64 		}
65 	return;
66 	}
67 
68 
69 
70 // transpose general matrix; m and n are referred to the original matrix
sgetr_lib(int m,int n,float alpha,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)71 void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
72 	{
73 
74 /*
75 
76 m = 5
77 n = 3
78 offsetA = 1
79 offsetC = 2
80 
81 A =
82  x x x
83  -
84  x x x
85  x x x
86  x x x
87  x x x
88 
89 C =
90  x x x x x
91  x x x x x
92  -
93  x x x x x
94 
95 */
96 
97 	if(m<=0 || n<=0)
98 		return;
99 
100 	const int bs = 4;
101 
102 	int mna = (bs-offsetA%bs)%bs;
103 	mna = m<mna ? m : mna;
104 	int nna = (bs-offsetC%bs)%bs;
105 	nna = n<nna ? n : nna;
106 
107 	int ii;
108 
109 	ii = 0;
110 
111 	if(mna>0)
112 		{
113 		if(mna==1)
114 			kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
115 		else if(mna==2)
116 			kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
117 		else //if(mna==3)
118 			kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
119 		ii += mna;
120 		pA += mna + bs*(sda-1);
121 		pC += mna*bs;
122 		}
123 	for( ; ii<m-3; ii+=4)
124 //	for( ; ii<m; ii+=4)
125 		{
126 		kernel_sgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
127 		pA += bs*sda;
128 		pC += bs*bs;
129 		}
130 
131 	// clean-up at the end using smaller kernels
132 	if(ii==m)
133 		return;
134 
135 	if(m-ii==1)
136 		kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
137 	else if(m-ii==2)
138 		kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
139 	else if(m-ii==3)
140 		kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
141 
142 	return;
143 
144 	}
145 
146 
147 
148 // transpose lower triangular matrix
strtr_l_lib(int m,float alpha,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)149 void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
150 	{
151 
152 /*
153 
154 A =
155  x
156  x x
157  x x x
158  x x x x
159 
160  x x x x x
161  x x x x x x
162  x x x x x x x
163  x x x x x x x x
164 
165 C =
166  x x x x x x x x
167 
168    x x x x x x x
169      x x x x x x
170 	   x x x x x
171 	     x x x x
172 
173 	       x x x
174 	         x x
175 	           x
176 
177 */
178 
179 	int n = m;
180 
181 	if(m<=0 || n<=0)
182 		return;
183 
184 	const int bs = 4;
185 
186 	int mna = (bs-offsetA%bs)%bs;
187 	mna = m<mna ? m : mna;
188 	int nna = (bs-offsetC%bs)%bs;
189 	nna = n<nna ? n : nna;
190 
191 	int ii;
192 
193 	ii = 0;
194 
195 	if(mna>0)
196 		{
197 		if(mna==1)
198 			{
199 			pC[0] = alpha * pA[0];
200 			}
201 		else if(mna==2)
202 			{
203 			if(nna==1)
204 				{
205 				pC[0+bs*0] = alpha * pA[0+bs*0];
206 				pC[0+bs*1] = alpha * pA[1+bs*0];
207 				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
208 				}
209 			else
210 				{
211 				pC[0+bs*0] = alpha * pA[0+bs*0];
212 				pC[0+bs*1] = alpha * pA[1+bs*0];
213 				pC[1+bs*1] = alpha * pA[1+bs*1];
214 				}
215 			}
216 		else //if(mna==3)
217 			{
218 			if(nna==1)
219 				{
220 				pC[0+bs*0] = alpha * pA[0+bs*0];
221 				pC[0+bs*1] = alpha * pA[1+bs*0];
222 				pC[0+bs*2] = alpha * pA[2+bs*0];
223 				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
224 				pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
225 				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
226 				}
227 			else if(nna==2)
228 				{
229 				pC[0+bs*0] = alpha * pA[0+bs*0];
230 				pC[0+bs*1] = alpha * pA[1+bs*0];
231 				pC[0+bs*2] = alpha * pA[2+bs*0];
232 				pC[1+bs*1] = alpha * pA[1+bs*1];
233 				pC[1+bs*2] = alpha * pA[2+bs*1];
234 				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
235 				}
236 			else
237 				{
238 				pC[0+bs*0] = alpha * pA[0+bs*0];
239 				pC[0+bs*1] = alpha * pA[1+bs*0];
240 				pC[0+bs*2] = alpha * pA[2+bs*0];
241 				pC[1+bs*1] = alpha * pA[1+bs*1];
242 				pC[1+bs*2] = alpha * pA[2+bs*1];
243 				pC[2+bs*2] = alpha * pA[2+bs*2];
244 				}
245 			}
246 		ii += mna;
247 		pA += mna + bs*(sda-1);
248 		pC += mna*bs;
249 		}
250 	for( ; ii<m-3; ii+=4)
251 		{
252 		kernel_sgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
253 		pA += bs*sda;
254 		pC += bs*bs;
255 		}
256 
257 	// clean-up at the end using smaller kernels
258 	if(ii==m)
259 		return;
260 
261 	if(m-ii==1)
262 		kernel_sgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
263 	else if(m-ii==2)
264 		kernel_sgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
265 	else if(m-ii==3)
266 		kernel_sgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
267 
268 	return;
269 
270 	}
271 
272 
273 
274 // transpose an aligned upper triangular matrix into an aligned lower triangular matrix
strtr_u_lib(int m,float alpha,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)275 void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
276 	{
277 
278 /*
279 
280 A =
281  x x x x x x x x
282    x x x x x x x
283 
284      x x x x x x
285        x x x x x
286          x x x x
287            x x x
288              x x
289                x
290 
291 C =
292  x
293 
294  x x
295  x x x
296  x x x x
297  x x x x x
298  x x x x x x
299  x x x x x x x
300  x x x x x x x x
301 
302 */
303 
304 	int n = m;
305 
306 	if(m<=0 || n<=0)
307 		return;
308 
309 	const int bs = 4;
310 
311 	int mna = (bs-offsetA%bs)%bs;
312 	mna = m<mna ? m : mna;
313 	int nna = (bs-offsetC%bs)%bs;
314 	nna = n<nna ? n : nna;
315 	int tna = nna;
316 
317 	int ii;
318 
319 	ii = 0;
320 
321 	if(mna>0)
322 		{
323 		if(mna==1)
324 			{
325 			kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
326 			if(nna!=1)
327 				{
328 //				pC[0+bs*0] = alpha * pA[0+bs*0];
329 				pA += 1*bs;
330 				pC += 1;
331 				tna = (bs-(offsetC+1)%bs)%bs;
332 				}
333 			else //if(nna==1)
334 				{
335 //				pC[0+bs*0] = alpha * pA[0+bs*0];
336 				pA += 1*bs;
337 				pC += 1 + (sdc-1)*bs;
338 				tna = 0; //(bs-(offsetC+1)%bs)%bs;
339 				}
340 //			kernel_sgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
341 			}
342 		else if(mna==2)
343 			{
344 			if(nna==0 || nna==3)
345 				{
346 				pC[0+bs*0] = alpha * pA[0+bs*0];
347 				pC[1+bs*0] = alpha * pA[0+bs*1];
348 				pC[1+bs*1] = alpha * pA[1+bs*1];
349 				pA += 2*bs;
350 				pC += 2;
351 				tna = (bs-(offsetC+2)%bs)%bs;
352 				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
353 				}
354 			else if(nna==1)
355 				{
356 				pC[0+bs*0] = alpha * pA[0+bs*0];
357 				pA += 1*bs;
358 				pC += 1 + (sdc-1)*bs;
359 //				pC[0+bs*0] = alpha * pA[0+bs*0];
360 //				pC[0+bs*1] = alpha * pA[1+bs*0];
361 				kernel_sgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
362 				pA += 1*bs;
363 				pC += 1;
364 				tna = 3; //(bs-(offsetC+2)%bs)%bs;
365 //				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
366 				}
367 			else if(nna==2)
368 				{
369 				pC[0+bs*0] = alpha * pA[0+bs*0];
370 				pC[1+bs*0] = alpha * pA[0+bs*1];
371 				pC[1+bs*1] = alpha * pA[1+bs*1];
372 				pA += 2*bs;
373 				pC += 2 + (sdc-1)*bs;
374 				tna = 0; //(bs-(offsetC+2)%bs)%bs;
375 				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
376 				}
377 			}
378 		else //if(mna==3)
379 			{
380 			if(nna==0)
381 				{
382 				pC[0+bs*0] = alpha * pA[0+bs*0];
383 				pC[1+bs*0] = alpha * pA[0+bs*1];
384 				pC[1+bs*1] = alpha * pA[1+bs*1];
385 				pC[2+bs*0] = alpha * pA[0+bs*2];
386 				pC[2+bs*1] = alpha * pA[1+bs*2];
387 				pC[2+bs*2] = alpha * pA[2+bs*2];
388 				pA += 3*bs;
389 				pC += 3;
390 				tna = 1;
391 				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
392 				}
393 			else if(nna==1)
394 				{
395 				pC[0+bs*0] = alpha * pA[0+bs*0];
396 				pA += bs;
397 				pC += 1 + (sdc-1)*bs;
398 				pC[0+bs*0] = alpha * pA[0+bs*0];
399 				pC[0+bs*1] = alpha * pA[1+bs*0];
400 				pC[1+bs*0] = alpha * pA[0+bs*1];
401 				pC[1+bs*1] = alpha * pA[1+bs*1];
402 				pC[1+bs*2] = alpha * pA[2+bs*1];
403 				pA += 2*bs;
404 				pC += 2;
405 				tna = 2;
406 				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
407 				}
408 			else if(nna==2)
409 				{
410 				pC[0+bs*0] = alpha * pA[0+bs*0];
411 				pC[1+bs*0] = alpha * pA[0+bs*1];
412 				pC[1+bs*1] = alpha * pA[1+bs*1];
413 				pA += 2*bs;
414 				pC += 2 + (sdc-1)*bs;
415 //				pC[0+bs*0] = alpha * pA[0+bs*0];
416 //				pC[0+bs*1] = alpha * pA[1+bs*0];
417 //				pC[0+bs*2] = alpha * pA[2+bs*0];
418 				kernel_sgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
419 				pA += 1*bs;
420 				pC += 1;
421 				tna = 3;
422 //				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
423 				}
424 			else //if(nna==3)
425 				{
426 				pC[0+bs*0] = alpha * pA[0+bs*0];
427 				pC[1+bs*0] = alpha * pA[0+bs*1];
428 				pC[1+bs*1] = alpha * pA[1+bs*1];
429 				pC[2+bs*0] = alpha * pA[0+bs*2];
430 				pC[2+bs*1] = alpha * pA[1+bs*2];
431 				pC[2+bs*2] = alpha * pA[2+bs*2];
432 				pA += 3*bs;
433 				pC += 3 + (sdc-1)*bs;
434 				tna = 0;
435 				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
436 				}
437 			}
438 		ii += mna;
439 		pA += mna + bs*(sda-1);
440 		pC += mna*bs;
441 		}
442 	for( ; ii<m-3; ii+=4)
443 		{
444 		if(tna==0)
445 			{
446 			pC[0+bs*0] = alpha * pA[0+bs*0];
447 			pC[1+bs*0] = alpha * pA[0+bs*1];
448 			pC[1+bs*1] = alpha * pA[1+bs*1];
449 			pC[2+bs*0] = alpha * pA[0+bs*2];
450 			pC[2+bs*1] = alpha * pA[1+bs*2];
451 			pC[2+bs*2] = alpha * pA[2+bs*2];
452 			pC[3+bs*0] = alpha * pA[0+bs*3];
453 			pC[3+bs*1] = alpha * pA[1+bs*3];
454 			pC[3+bs*2] = alpha * pA[2+bs*3];
455 			pC[3+bs*3] = alpha * pA[3+bs*3];
456 			pA += 4*bs;
457 			pC += sdc*bs;
458 			kernel_sgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
459 			}
460 		else if(tna==1)
461 			{
462 			pC[0+bs*0] = alpha * pA[0+bs*0];
463 			pA += bs;
464 			pC += 1 + (sdc-1)*bs;
465 			pC[0+bs*0] = alpha * pA[0+bs*0];
466 			pC[0+bs*1] = alpha * pA[1+bs*0];
467 			pC[1+bs*0] = alpha * pA[0+bs*1];
468 			pC[1+bs*1] = alpha * pA[1+bs*1];
469 			pC[1+bs*2] = alpha * pA[2+bs*1];
470 			pC[2+bs*0] = alpha * pA[0+bs*2];
471 			pC[2+bs*1] = alpha * pA[1+bs*2];
472 			pC[2+bs*2] = alpha * pA[2+bs*2];
473 			pC[2+bs*3] = alpha * pA[3+bs*2];
474 			pA += 3*bs;
475 			pC += 3;
476 			kernel_sgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
477 			}
478 		else if(tna==2)
479 			{
480 			pC[0+bs*0] = alpha * pA[0+bs*0];
481 			pC[1+bs*0] = alpha * pA[0+bs*1];
482 			pC[1+bs*1] = alpha * pA[1+bs*1];
483 			pA += 2*bs;
484 			pC += 2 + (sdc-1)*bs;
485 			pC[0+bs*0] = alpha * pA[0+bs*0];
486 			pC[0+bs*1] = alpha * pA[1+bs*0];
487 			pC[0+bs*2] = alpha * pA[2+bs*0];
488 			pC[1+bs*0] = alpha * pA[0+bs*1];
489 			pC[1+bs*1] = alpha * pA[1+bs*1];
490 			pC[1+bs*2] = alpha * pA[2+bs*1];
491 			pC[1+bs*3] = alpha * pA[3+bs*1];
492 			pA += 2*bs;
493 			pC += 2;
494 			kernel_sgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
495 			}
496 		else //if(tna==3)
497 			{
498 			pC[0+bs*0] = alpha * pA[0+bs*0];
499 			pC[1+bs*0] = alpha * pA[0+bs*1];
500 			pC[1+bs*1] = alpha * pA[1+bs*1];
501 			pC[2+bs*0] = alpha * pA[0+bs*2];
502 			pC[2+bs*1] = alpha * pA[1+bs*2];
503 			pC[2+bs*2] = alpha * pA[2+bs*2];
504 			pA += 3*bs;
505 			pC += 3 + (sdc-1)*bs;
506 			kernel_sgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
507 //			pC[0+bs*0] = alpha * pA[0+bs*0];
508 //			pC[0+bs*1] = alpha * pA[1+bs*0];
509 //			pC[0+bs*2] = alpha * pA[2+bs*0];
510 //			pC[0+bs*3] = alpha * pA[3+bs*0];
511 			pA += bs;
512 			pC += 1;
513 //			kernel_sgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
514 			}
515 		pA += bs*sda;
516 		pC += bs*bs;
517 		}
518 
519 	// clean-up at the end
520 	if(ii==m)
521 		return;
522 
523 	if(m-ii==1)
524 		{
525 		pC[0+bs*0] = alpha * pA[0+bs*0];
526 		}
527 	else if(m-ii==2)
528 		{
529 		if(tna!=1)
530 			{
531 			pC[0+bs*0] = alpha * pA[0+bs*0];
532 			pC[1+bs*0] = alpha * pA[0+bs*1];
533 			pC[1+bs*1] = alpha * pA[1+bs*1];
534 			}
535 		else //if(tna==1)
536 			{
537 			pC[0+bs*0] = alpha * pA[0+bs*0];
538 			pA += bs;
539 			pC += 1 + (sdc-1)*bs;
540 			pC[0+bs*0] = alpha * pA[0+bs*0];
541 			pC[0+bs*1] = alpha * pA[1+bs*0];
542 			}
543 		}
544 	else if(m-ii==3)
545 		{
546 		if(tna==0 || tna==3)
547 			{
548 			pC[0+bs*0] = alpha * pA[0+bs*0];
549 			pC[1+bs*0] = alpha * pA[0+bs*1];
550 			pC[1+bs*1] = alpha * pA[1+bs*1];
551 			pC[2+bs*0] = alpha * pA[0+bs*2];
552 			pC[2+bs*1] = alpha * pA[1+bs*2];
553 			pC[2+bs*2] = alpha * pA[2+bs*2];
554 			}
555 		else if(tna==1)
556 			{
557 			pC[0+bs*0] = alpha * pA[0+bs*0];
558 			pA += bs;
559 			pC += 1 + (sdc-1)*bs;
560 			pC[0+bs*0] = alpha * pA[0+bs*0];
561 			pC[0+bs*1] = alpha * pA[1+bs*0];
562 			pC[1+bs*0] = alpha * pA[0+bs*0];
563 			pC[1+bs*1] = alpha * pA[1+bs*1];
564 			pC[1+bs*2] = alpha * pA[2+bs*1];
565 			}
566 		else //if(tna==2)
567 			{
568 			pC[0+bs*0] = alpha * pA[0+bs*0];
569 			pC[1+bs*0] = alpha * pA[0+bs*1];
570 			pC[1+bs*1] = alpha * pA[1+bs*1];
571 			pA += 2*bs;
572 			pC += 2 + (sdc-1)*bs;
573 			pC[0+bs*0] = alpha * pA[0+bs*0];
574 			pC[0+bs*1] = alpha * pA[1+bs*0];
575 			pC[0+bs*2] = alpha * pA[2+bs*0];
576 			}
577 		}
578 
579 	return;
580 
581 	}
582 
583 
584 
585 // regularize diagonal
sdiareg_lib(int kmax,float reg,int offset,float * pD,int sdd)586 void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
587 	{
588 
589 	const int bs = 4;
590 
591 	int kna = (bs-offset%bs)%bs;
592 	kna = kmax<kna ? kmax : kna;
593 
594 	int jj, ll;
595 
596 	if(kna>0)
597 		{
598 		for(ll=0; ll<kna; ll++)
599 			{
600 			pD[ll+bs*ll] += reg;
601 			}
602 		pD += kna + bs*(sdd-1) + kna*bs;
603 		kmax -= kna;
604 		}
605 	for(jj=0; jj<kmax-3; jj+=4)
606 		{
607 		pD[jj*sdd+(jj+0)*bs+0] += reg;
608 		pD[jj*sdd+(jj+1)*bs+1] += reg;
609 		pD[jj*sdd+(jj+2)*bs+2] += reg;
610 		pD[jj*sdd+(jj+3)*bs+3] += reg;
611 		}
612 	for(ll=0; ll<kmax-jj; ll++)
613 		{
614 		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
615 		}
616 
617 	}
618 
619 
620 
621 // insert vector to diagonal
sdiain_lib(int kmax,float alpha,float * x,int offset,float * pD,int sdd)622 void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
623 	{
624 
625 	const int bs = 4;
626 
627 	int kna = (bs-offset%bs)%bs;
628 	kna = kmax<kna ? kmax : kna;
629 
630 	int jj, ll;
631 
632 	if(kna>0)
633 		{
634 		for(ll=0; ll<kna; ll++)
635 			{
636 			pD[ll+bs*ll] = alpha*x[ll];
637 			}
638 		pD += kna + bs*(sdd-1) + kna*bs;
639 		x  += kna;
640 		kmax -= kna;
641 		}
642 	for(jj=0; jj<kmax-3; jj+=4)
643 		{
644 		pD[jj*sdd+(jj+0)*bs+0] = alpha*x[jj+0];
645 		pD[jj*sdd+(jj+1)*bs+1] = alpha*x[jj+1];
646 		pD[jj*sdd+(jj+2)*bs+2] = alpha*x[jj+2];
647 		pD[jj*sdd+(jj+3)*bs+3] = alpha*x[jj+3];
648 		}
649 	for(ll=0; ll<kmax-jj; ll++)
650 		{
651 		pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
652 		}
653 
654 	}
655 
656 
657 
658 // insert sqrt of vector to diagonal
sdiain_sqrt_lib(int kmax,float * x,int offset,float * pD,int sdd)659 void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
660 	{
661 
662 	const int bs = 4;
663 
664 	int kna = (bs-offset%bs)%bs;
665 	kna = kmax<kna ? kmax : kna;
666 
667 	int jj, ll;
668 
669 	if(kna>0)
670 		{
671 		for(ll=0; ll<kna; ll++)
672 			{
673 			pD[ll+bs*ll] = sqrt(x[ll]);
674 			}
675 		pD += kna + bs*(sdd-1) + kna*bs;
676 		x  += kna;
677 		kmax -= kna;
678 		}
679 	for(jj=0; jj<kmax-3; jj+=4)
680 		{
681 		pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
682 		pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
683 		pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
684 		pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
685 		}
686 	for(ll=0; ll<kmax-jj; ll++)
687 		{
688 		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
689 		}
690 
691 	}
692 
693 
694 
695 // extract diagonal to vector
sdiaex_lib(int kmax,float alpha,int offset,float * pD,int sdd,float * x)696 void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
697 	{
698 
699 	const int bs = 4;
700 
701 	int kna = (bs-offset%bs)%bs;
702 	kna = kmax<kna ? kmax : kna;
703 
704 	int jj, ll;
705 
706 	if(kna>0)
707 		{
708 		for(ll=0; ll<kna; ll++)
709 			{
710 			x[ll] = alpha * pD[ll+bs*ll];
711 			}
712 		pD += kna + bs*(sdd-1) + kna*bs;
713 		x  += kna;
714 		kmax -= kna;
715 		}
716 	for(jj=0; jj<kmax-3; jj+=4)
717 		{
718 		x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
719 		x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
720 		x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
721 		x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
722 		}
723 	for(ll=0; ll<kmax-jj; ll++)
724 		{
725 		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
726 		}
727 
728 	}
729 
730 
731 
732 // add scaled vector to diagonal
sdiaad_lib(int kmax,float alpha,float * x,int offset,float * pD,int sdd)733 void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
734 	{
735 
736 	const int bs = 4;
737 
738 	int kna = (bs-offset%bs)%bs;
739 	kna = kmax<kna ? kmax : kna;
740 
741 	int jj, ll;
742 
743 	if(kna>0)
744 		{
745 		for(ll=0; ll<kna; ll++)
746 			{
747 			pD[ll+bs*ll] += alpha * x[ll];
748 			}
749 		pD += kna + bs*(sdd-1) + kna*bs;
750 		x  += kna;
751 		kmax -= kna;
752 		}
753 	for(jj=0; jj<kmax-3; jj+=4)
754 		{
755 		pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
756 		pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
757 		pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
758 		pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
759 		}
760 	for(ll=0; ll<kmax-jj; ll++)
761 		{
762 		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
763 		}
764 
765 	}
766 
767 
768 
769 // insert vector to diagonal, sparse formulation
sdiain_libsp(int kmax,int * idx,float alpha,float * x,float * pD,int sdd)770 void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
771 	{
772 
773 	const int bs = 4;
774 
775 	int ii, jj;
776 
777 	for(jj=0; jj<kmax; jj++)
778 		{
779 		ii = idx[jj];
780 		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
781 		}
782 
783 	}
784 
785 
786 
787 // extract diagonal to vector, sparse formulation
sdiaex_libsp(int kmax,int * idx,float alpha,float * pD,int sdd,float * x)788 void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
789 	{
790 
791 	const int bs = 4;
792 
793 	int ii, jj;
794 
795 	for(jj=0; jj<kmax; jj++)
796 		{
797 		ii = idx[jj];
798 		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
799 		}
800 
801 	}
802 
803 
804 
805 // add scaled vector to diagonal, sparse formulation
sdiaad_libsp(int kmax,int * idx,float alpha,float * x,float * pD,int sdd)806 void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
807 	{
808 
809 	const int bs = 4;
810 
811 	int ii, jj;
812 
813 	for(jj=0; jj<kmax; jj++)
814 		{
815 		ii = idx[jj];
816 		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
817 		}
818 
819 	}
820 
821 
822 
823 // add scaled vector to another vector and insert to diagonal, sparse formulation
sdiaadin_libsp(int kmax,int * idx,float alpha,float * x,float * y,float * pD,int sdd)824 void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
825 	{
826 
827 	const int bs = 4;
828 
829 	int ii, jj;
830 
831 	for(jj=0; jj<kmax; jj++)
832 		{
833 		ii = idx[jj];
834 		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
835 		}
836 
837 	}
838 
839 
840 
841 // insert vector to row
srowin_lib(int kmax,float alpha,float * x,float * pD)842 void srowin_lib(int kmax, float alpha, float *x, float *pD)
843 	{
844 
845 	const int bs = 4;
846 
847 	int jj, ll;
848 
849 	for(jj=0; jj<kmax-3; jj+=4)
850 		{
851 		pD[(jj+0)*bs] = alpha*x[jj+0];
852 		pD[(jj+1)*bs] = alpha*x[jj+1];
853 		pD[(jj+2)*bs] = alpha*x[jj+2];
854 		pD[(jj+3)*bs] = alpha*x[jj+3];
855 		}
856 	for(; jj<kmax; jj++)
857 		{
858 		pD[(jj)*bs] = alpha*x[jj];
859 		}
860 
861 	}
862 
863 
864 
865 // extract row to vector
srowex_lib(int kmax,float alpha,float * pD,float * x)866 void srowex_lib(int kmax, float alpha, float *pD, float *x)
867 	{
868 
869 	const int bs = 4;
870 
871 	int jj, ll;
872 
873 	for(jj=0; jj<kmax-3; jj+=4)
874 		{
875 		x[jj+0] = alpha*pD[(jj+0)*bs];
876 		x[jj+1] = alpha*pD[(jj+1)*bs];
877 		x[jj+2] = alpha*pD[(jj+2)*bs];
878 		x[jj+3] = alpha*pD[(jj+3)*bs];
879 		}
880 	for(; jj<kmax; jj++)
881 		{
882 		x[jj] = alpha*pD[(jj)*bs];
883 		}
884 
885 	}
886 
887 
888 
889 // add scaled vector to row
srowad_lib(int kmax,float alpha,float * x,float * pD)890 void srowad_lib(int kmax, float alpha, float *x, float *pD)
891 	{
892 
893 	const int bs = 4;
894 
895 	int jj, ll;
896 
897 	for(jj=0; jj<kmax-3; jj+=4)
898 		{
899 		pD[(jj+0)*bs] += alpha * x[jj+0];
900 		pD[(jj+1)*bs] += alpha * x[jj+1];
901 		pD[(jj+2)*bs] += alpha * x[jj+2];
902 		pD[(jj+3)*bs] += alpha * x[jj+3];
903 		}
904 	for(; jj<kmax; jj++)
905 		{
906 		pD[(jj)*bs] += alpha * x[jj];
907 		}
908 
909 	}
910 
911 
912 
913 // insert vector to row, sparse formulation
srowin_libsp(int kmax,float alpha,int * idx,float * x,float * pD)914 void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
915 	{
916 
917 	const int bs = 4;
918 
919 	int ii, jj;
920 
921 	for(jj=0; jj<kmax; jj++)
922 		{
923 		ii = idx[jj];
924 		pD[ii*bs] = alpha*x[jj];
925 		}
926 
927 	}
928 
929 
930 
931 // add scaled vector to row, sparse formulation
srowad_libsp(int kmax,int * idx,float alpha,float * x,float * pD)932 void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
933 	{
934 
935 	const int bs = 4;
936 
937 	int ii, jj;
938 
939 	for(jj=0; jj<kmax; jj++)
940 		{
941 		ii = idx[jj];
942 		pD[ii*bs] += alpha * x[jj];
943 		}
944 
945 	}
946 
947 
948 
949 // add scaled vector to another vector and insert to row, sparse formulation
srowadin_libsp(int kmax,int * idx,float alpha,float * x,float * y,float * pD)950 void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
951 	{
952 
953 	const int bs = 4;
954 
955 	int ii, jj;
956 
957 	for(jj=0; jj<kmax; jj++)
958 		{
959 		ii = idx[jj];
960 		pD[ii*bs] = y[jj] + alpha * x[jj];
961 		}
962 
963 	}
964 
965 
966 
967 // swap two rows
srowsw_lib(int kmax,float * pA,float * pC)968 void srowsw_lib(int kmax, float *pA, float *pC)
969 	{
970 
971 	const int bs = 4;
972 
973 	int ii;
974 	float tmp;
975 
976 	for(ii=0; ii<kmax-3; ii+=4)
977 		{
978 		tmp = pA[0+bs*0];
979 		pA[0+bs*0] = pC[0+bs*0];
980 		pC[0+bs*0] = tmp;
981 		tmp = pA[0+bs*1];
982 		pA[0+bs*1] = pC[0+bs*1];
983 		pC[0+bs*1] = tmp;
984 		tmp = pA[0+bs*2];
985 		pA[0+bs*2] = pC[0+bs*2];
986 		pC[0+bs*2] = tmp;
987 		tmp = pA[0+bs*3];
988 		pA[0+bs*3] = pC[0+bs*3];
989 		pC[0+bs*3] = tmp;
990 		pA += 4*bs;
991 		pC += 4*bs;
992 		}
993 	for( ; ii<kmax; ii++)
994 		{
995 		tmp = pA[0+bs*0];
996 		pA[0+bs*0] = pC[0+bs*0];
997 		pC[0+bs*0] = tmp;
998 		pA += 1*bs;
999 		pC += 1*bs;
1000 		}
1001 
1002 	}
1003 
1004 
1005 
1006 // extract vector from column
scolex_lib(int kmax,int offset,float * pD,int sdd,float * x)1007 void scolex_lib(int kmax, int offset, float *pD, int sdd, float *x)
1008 	{
1009 
1010 	const int bs = 4;
1011 
1012 	int kna = (bs-offset%bs)%bs;
1013 	kna = kmax<kna ? kmax : kna;
1014 
1015 	int jj, ll;
1016 
1017 	if(kna>0)
1018 		{
1019 		for(ll=0; ll<kna; ll++)
1020 			{
1021 			x[ll] = pD[ll];
1022 			}
1023 		pD += kna + bs*(sdd-1);
1024 		x  += kna;
1025 		kmax -= kna;
1026 		}
1027 	for(jj=0; jj<kmax-3; jj+=4)
1028 		{
1029 		x[jj*sdd+0] = pD[jj+0];
1030 		x[jj*sdd+1] = pD[jj+1];
1031 		x[jj*sdd+2] = pD[jj+2];
1032 		x[jj*sdd+3] = pD[jj+3];
1033 		}
1034 	for(ll=0; ll<kmax-jj; ll++)
1035 		{
1036 		x[jj*sdd+ll] = pD[jj+ll];
1037 		}
1038 
1039 	}
1040 
1041 
1042 
1043 // insert vector to column
scolin_lib(int kmax,float * x,int offset,float * pD,int sdd)1044 void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
1045 	{
1046 
1047 	const int bs = 4;
1048 
1049 	int kna = (bs-offset%bs)%bs;
1050 	kna = kmax<kna ? kmax : kna;
1051 
1052 	int jj, ll;
1053 
1054 	if(kna>0)
1055 		{
1056 		for(ll=0; ll<kna; ll++)
1057 			{
1058 			pD[ll] = x[ll];
1059 			}
1060 		pD += kna + bs*(sdd-1);
1061 		x  += kna;
1062 		kmax -= kna;
1063 		}
1064 	for(jj=0; jj<kmax-3; jj+=4)
1065 		{
1066 		pD[jj*sdd+0] = x[jj+0];
1067 		pD[jj*sdd+1] = x[jj+1];
1068 		pD[jj*sdd+2] = x[jj+2];
1069 		pD[jj*sdd+3] = x[jj+3];
1070 		}
1071 	for(ll=0; ll<kmax-jj; ll++)
1072 		{
1073 		pD[jj*sdd+ll] = x[jj+ll];
1074 		}
1075 
1076 	}
1077 
1078 
1079 
1080 // add scaled vector to column
scolad_lib(int kmax,float alpha,float * x,int offset,float * pD,int sdd)1081 void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
1082 	{
1083 
1084 	const int bs = 4;
1085 
1086 	int kna = (bs-offset%bs)%bs;
1087 	kna = kmax<kna ? kmax : kna;
1088 
1089 	int jj, ll;
1090 
1091 	if(kna>0)
1092 		{
1093 		for(ll=0; ll<kna; ll++)
1094 			{
1095 			pD[ll] += alpha * x[ll];
1096 			}
1097 		pD += kna + bs*(sdd-1);
1098 		x  += kna;
1099 		kmax -= kna;
1100 		}
1101 	for(jj=0; jj<kmax-3; jj+=4)
1102 		{
1103 		pD[jj*sdd+0] += alpha * x[jj+0];
1104 		pD[jj*sdd+1] += alpha * x[jj+1];
1105 		pD[jj*sdd+2] += alpha * x[jj+2];
1106 		pD[jj*sdd+3] += alpha * x[jj+3];
1107 		}
1108 	for(ll=0; ll<kmax-jj; ll++)
1109 		{
1110 		pD[jj*sdd+ll] += alpha * x[jj+ll];
1111 		}
1112 
1113 	}
1114 
1115 
1116 
1117 // insert vector to diagonal, sparse formulation
scolin_libsp(int kmax,int * idx,float * x,float * pD,int sdd)1118 void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
1119 	{
1120 
1121 	const int bs = 4;
1122 
1123 	int ii, jj;
1124 
1125 	for(jj=0; jj<kmax; jj++)
1126 		{
1127 		ii = idx[jj];
1128 		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
1129 		}
1130 
1131 	}
1132 
1133 
1134 
1135 // add scaled vector to diagonal, sparse formulation
scolad_libsp(int kmax,float alpha,int * idx,float * x,float * pD,int sdd)1136 void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
1137 	{
1138 
1139 	const int bs = 4;
1140 
1141 	int ii, jj;
1142 
1143 	for(jj=0; jj<kmax; jj++)
1144 		{
1145 		ii = idx[jj];
1146 		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
1147 		}
1148 
1149 	}
1150 
1151 
1152 
1153 // swaps two cols
scolsw_lib(int kmax,int offsetA,float * pA,int sda,int offsetC,float * pC,int sdc)1154 void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
1155 	{
1156 
1157 	const int bs = 4;
1158 
1159 	int ii;
1160 
1161 	float tmp;
1162 
1163 	if(offsetA==offsetC)
1164 		{
1165 		if(offsetA>0)
1166 			{
1167 			ii = 0;
1168 			for(; ii<bs-offsetA; ii++)
1169 				{
1170 				tmp = pA[0+bs*0];
1171 				pA[0+bs*0] = pC[0+bs*0];
1172 				pC[0+bs*0] = tmp;
1173 				pA += 1;
1174 				pC += 1;
1175 				}
1176 			pA += bs*(sda-1);
1177 			pC += bs*(sdc-1);
1178 			kmax -= bs-offsetA;
1179 			}
1180 		ii = 0;
1181 		for(; ii<kmax-3; ii+=4)
1182 			{
1183 			tmp = pA[0+bs*0];
1184 			pA[0+bs*0] = pC[0+bs*0];
1185 			pC[0+bs*0] = tmp;
1186 			tmp = pA[1+bs*0];
1187 			pA[1+bs*0] = pC[1+bs*0];
1188 			pC[1+bs*0] = tmp;
1189 			tmp = pA[2+bs*0];
1190 			pA[2+bs*0] = pC[2+bs*0];
1191 			pC[2+bs*0] = tmp;
1192 			tmp = pA[3+bs*0];
1193 			pA[3+bs*0] = pC[3+bs*0];
1194 			pC[3+bs*0] = tmp;
1195 			pA += bs*sda;
1196 			pC += bs*sdc;
1197 			}
1198 		for(; ii<kmax; ii++)
1199 			{
1200 			tmp = pA[0+bs*0];
1201 			pA[0+bs*0] = pC[0+bs*0];
1202 			pC[0+bs*0] = tmp;
1203 			pA += 1;
1204 			pC += 1;
1205 			}
1206 		}
1207 	else
1208 		{
1209 		printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
1210 		exit(1);
1211 		}
1212 
1213 	return;
1214 
1215 	}
1216 
1217 
1218 
1219 // insert vector to vector, sparse formulation
svecin_libsp(int kmax,int * idx,float * x,float * y)1220 void svecin_libsp(int kmax, int *idx, float *x, float *y)
1221 	{
1222 
1223 	int jj;
1224 
1225 	for(jj=0; jj<kmax; jj++)
1226 		{
1227 		y[idx[jj]] = x[jj];
1228 		}
1229 
1230 	}
1231 
1232 
1233 
1234 // adds vector to vector, sparse formulation
svecad_libsp(int kmax,int * idx,float alpha,float * x,float * y)1235 void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
1236 	{
1237 
1238 	int jj;
1239 
1240 	for(jj=0; jj<kmax; jj++)
1241 		{
1242 		y[idx[jj]] += alpha * x[jj];
1243 		}
1244 
1245 	}
1246 
1247 
1248 
1249 /****************************
1250 * new interface
1251 ****************************/
1252 
1253 
1254 
1255 #if defined(LA_HIGH_PERFORMANCE)
1256 
1257 
1258 
1259 // return the memory size (in bytes) needed for a strmat
blasfeo_memsize_smat(int m,int n)1260 int blasfeo_memsize_smat(int m, int n)
1261 	{
1262 	const int bs = 4;
1263 	int nc = S_NC;
1264 	int al = bs*nc;
1265 	int pm = (m+bs-1)/bs*bs;
1266 	int cn = (n+nc-1)/nc*nc;
1267 	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1268 	int memsize = (pm*cn+tmp)*sizeof(float);
1269 	return memsize;
1270 	}
1271 
1272 
1273 
blasfeo_memsize_smat_ps(int ps,int m,int n)1274 int blasfeo_memsize_smat_ps(int ps, int m, int n)
1275 	{
1276 	int nc = S_NC;
1277 	int al = ps*nc;
1278 	int pm = (m+ps-1)/ps*ps;
1279 	int cn = (n+nc-1)/nc*nc;
1280 	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1281 	int memsize = (pm*cn+tmp)*sizeof(float);
1282 	return memsize;
1283 	}
1284 
1285 
1286 
1287 // return the memory size (in bytes) needed for the digonal of a strmat
blasfeo_memsize_diag_smat(int m,int n)1288 int blasfeo_memsize_diag_smat(int m, int n)
1289 	{
1290 	const int bs = 4;
1291 	int nc = S_NC;
1292 	int al = bs*nc;
1293 	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1294 	int memsize = tmp*sizeof(float);
1295 	return memsize;
1296 	}
1297 
1298 
1299 
1300 // create a matrix structure for a matrix of size m*n by using memory passed by a pointer
blasfeo_create_smat(int m,int n,struct blasfeo_smat * sA,void * memory)1301 void blasfeo_create_smat(int m, int n, struct blasfeo_smat *sA, void *memory)
1302 	{
1303 
1304 	// invalidate stored inverse diagonal
1305 	sA->use_dA = 0;
1306 
1307 	const int bs = 4;
1308 	int nc = S_NC;
1309 	int al = bs*nc;
1310 	sA->m = m;
1311 	sA->n = n;
1312 	int pm = (m+bs-1)/bs*bs;
1313 	int cn = (n+nc-1)/nc*nc;
1314 	sA->pm = pm;
1315 	sA->cn = cn;
1316 	float *ptr = (float *) memory;
1317 	sA->pA = ptr;
1318 	ptr += pm*cn;
1319 	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1320 	sA->dA = ptr;
1321 	ptr += tmp;
1322 	sA->memsize = (pm*cn+tmp)*sizeof(float);
1323 	return;
1324 	}
1325 
1326 
1327 
blasfeo_create_smat_ps(int ps,int m,int n,struct blasfeo_smat * sA,void * memory)1328 void blasfeo_create_smat_ps(int ps, int m, int n, struct blasfeo_smat *sA, void *memory)
1329 	{
1330 
1331 	// invalidate stored inverse diagonal
1332 	sA->use_dA = 0;
1333 
1334 	int nc = S_NC;
1335 	int al = ps*nc;
1336 	sA->m = m;
1337 	sA->n = n;
1338 	int pm = (m+ps-1)/ps*ps;
1339 	int cn = (n+nc-1)/nc*nc;
1340 	sA->pm = pm;
1341 	sA->cn = cn;
1342 	float *ptr = (float *) memory;
1343 	sA->pA = ptr;
1344 	ptr += pm*cn;
1345 	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
1346 	sA->dA = ptr;
1347 	ptr += tmp;
1348 	sA->memsize = (pm*cn+tmp)*sizeof(float);
1349 	return;
1350 	}
1351 
1352 
1353 
1354 // return memory size (in bytes) needed for a strvec
blasfeo_memsize_svec(int m)1355 int blasfeo_memsize_svec(int m)
1356 	{
1357 	const int bs = 4;
1358 //	int nc = S_NC;
1359 //	int al = bs*nc;
1360 	int pm = (m+bs-1)/bs*bs;
1361 	int memsize = pm*sizeof(float);
1362 	return memsize;
1363 	}
1364 
1365 
1366 
1367 // create a vector structure for a vector of size m by using memory passed by a pointer
blasfeo_create_svec(int m,struct blasfeo_svec * sa,void * memory)1368 void blasfeo_create_svec(int m, struct blasfeo_svec *sa, void *memory)
1369 	{
1370 	const int bs = 4;
1371 //	int nc = S_NC;
1372 //	int al = bs*nc;
1373 	sa->m = m;
1374 	int pm = (m+bs-1)/bs*bs;
1375 	sa->pm = pm;
1376 	float *ptr = (float *) memory;
1377 	sa->pa = ptr;
1378 //	ptr += pm;
1379 	sa->memsize = pm*sizeof(float);
1380 	return;
1381 	}
1382 
1383 
1384 
1385 // convert a matrix into a matrix structure
blasfeo_pack_smat(int m,int n,float * A,int lda,struct blasfeo_smat * sA,int ai,int aj)1386 void blasfeo_pack_smat(int m, int n, float *A, int lda, struct blasfeo_smat *sA, int ai, int aj)
1387 	{
1388 
1389 	// invalidate stored inverse diagonal
1390 	sA->use_dA = 0;
1391 
1392 	const int bs = 4;
1393 	int sda = sA->cn;
1394 	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1395 	int i, ii, j, jj, m0, m1, m2;
1396 	float *B, *pB;
1397 	m0 = (bs-ai%bs)%bs;
1398 	if(m0>m)
1399 		m0 = m;
1400 	m1 = m - m0;
1401 	jj = 0;
1402 	for( ; jj<n-3; jj+=4)
1403 		{
1404 		B  =  A + jj*lda;
1405 		pB = pA + jj*bs;
1406 		ii = 0;
1407 		if(m0>0)
1408 			{
1409 			for( ; ii<m0; ii++)
1410 				{
1411 				pB[ii+bs*0] = B[ii+lda*0];
1412 				pB[ii+bs*1] = B[ii+lda*1];
1413 				pB[ii+bs*2] = B[ii+lda*2];
1414 				pB[ii+bs*3] = B[ii+lda*3];
1415 				}
1416 			B  += m0;
1417 			pB += m0 + bs*(sda-1);
1418 			}
1419 		for( ; ii<m-3; ii+=4)
1420 			{
1421 			// col 0
1422 			pB[0+bs*0] = B[0+lda*0];
1423 			pB[1+bs*0] = B[1+lda*0];
1424 			pB[2+bs*0] = B[2+lda*0];
1425 			pB[3+bs*0] = B[3+lda*0];
1426 			// col 1
1427 			pB[0+bs*1] = B[0+lda*1];
1428 			pB[1+bs*1] = B[1+lda*1];
1429 			pB[2+bs*1] = B[2+lda*1];
1430 			pB[3+bs*1] = B[3+lda*1];
1431 			// col 2
1432 			pB[0+bs*2] = B[0+lda*2];
1433 			pB[1+bs*2] = B[1+lda*2];
1434 			pB[2+bs*2] = B[2+lda*2];
1435 			pB[3+bs*2] = B[3+lda*2];
1436 			// col 3
1437 			pB[0+bs*3] = B[0+lda*3];
1438 			pB[1+bs*3] = B[1+lda*3];
1439 			pB[2+bs*3] = B[2+lda*3];
1440 			pB[3+bs*3] = B[3+lda*3];
1441 			// update
1442 			B  += 4;
1443 			pB += bs*sda;
1444 			}
1445 		for( ; ii<m; ii++)
1446 			{
1447 			// col 0
1448 			pB[0+bs*0] = B[0+lda*0];
1449 			// col 1
1450 			pB[0+bs*1] = B[0+lda*1];
1451 			// col 2
1452 			pB[0+bs*2] = B[0+lda*2];
1453 			// col 3
1454 			pB[0+bs*3] = B[0+lda*3];
1455 			// update
1456 			B  += 1;
1457 			pB += 1;
1458 			}
1459 		}
1460 	for( ; jj<n; jj++)
1461 		{
1462 
1463 		B  =  A + jj*lda;
1464 		pB = pA + jj*bs;
1465 
1466 		ii = 0;
1467 		if(m0>0)
1468 			{
1469 			for( ; ii<m0; ii++)
1470 				{
1471 				pB[ii+bs*0] = B[ii+lda*0];
1472 				}
1473 			B  += m0;
1474 			pB += m0 + bs*(sda-1);
1475 			}
1476 		for( ; ii<m-3; ii+=4)
1477 			{
1478 			// col 0
1479 			pB[0+bs*0] = B[0+lda*0];
1480 			pB[1+bs*0] = B[1+lda*0];
1481 			pB[2+bs*0] = B[2+lda*0];
1482 			pB[3+bs*0] = B[3+lda*0];
1483 			// update
1484 			B  += 4;
1485 			pB += bs*sda;
1486 			}
1487 		for( ; ii<m; ii++)
1488 			{
1489 			// col 0
1490 			pB[0+bs*0] = B[0+lda*0];
1491 			// update
1492 			B  += 1;
1493 			pB += 1;
1494 			}
1495 		}
1496 	return;
1497 	}
1498 
1499 
1500 
1501 // convert and transpose a matrix into a matrix structure
blasfeo_pack_tran_smat(int m,int n,float * A,int lda,struct blasfeo_smat * sA,int ai,int aj)1502 void blasfeo_pack_tran_smat(int m, int n, float *A, int lda, struct blasfeo_smat *sA, int ai, int aj)
1503 	{
1504 
1505 	// invalidate stored inverse diagonal
1506 	sA->use_dA = 0;
1507 
1508 	const int bs = 4;
1509 	int sda = sA->cn;
1510 	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1511 	int i, ii, j, m0, m1, m2;
1512 	float 	*B, *pB;
1513 	m0 = (bs-ai%bs)%bs;
1514 	if(m0>n)
1515 		m0 = n;
1516 	m1 = n - m0;
1517 	ii = 0;
1518 	if(m0>0)
1519 		{
1520 		for(j=0; j<m; j++)
1521 			{
1522 			for(i=0; i<m0; i++)
1523 				{
1524 				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
1525 				}
1526 			}
1527 		A  += m0*lda;
1528 		pA += m0 + bs*(sda-1);
1529 		}
1530 	ii = 0;
1531 	for(; ii<m1-3; ii+=bs)
1532 		{
1533 		j=0;
1534 		B  = A + ii*lda;
1535 		pB = pA + ii*sda;
1536 		for(; j<m-3; j+=4)
1537 			{
1538 			// unroll 0
1539 			pB[0+0*bs] = B[0+0*lda];
1540 			pB[1+0*bs] = B[0+1*lda];
1541 			pB[2+0*bs] = B[0+2*lda];
1542 			pB[3+0*bs] = B[0+3*lda];
1543 			// unroll 1
1544 			pB[0+1*bs] = B[1+0*lda];
1545 			pB[1+1*bs] = B[1+1*lda];
1546 			pB[2+1*bs] = B[1+2*lda];
1547 			pB[3+1*bs] = B[1+3*lda];
1548 			// unroll 2
1549 			pB[0+2*bs] = B[2+0*lda];
1550 			pB[1+2*bs] = B[2+1*lda];
1551 			pB[2+2*bs] = B[2+2*lda];
1552 			pB[3+2*bs] = B[2+3*lda];
1553 			// unroll 3
1554 			pB[0+3*bs] = B[3+0*lda];
1555 			pB[1+3*bs] = B[3+1*lda];
1556 			pB[2+3*bs] = B[3+2*lda];
1557 			pB[3+3*bs] = B[3+3*lda];
1558 			B  += 4;
1559 			pB += 4*bs;
1560 			}
1561 		for(; j<m; j++)
1562 			{
1563 			// unroll 0
1564 			pB[0+0*bs] = B[0+0*lda];
1565 			pB[1+0*bs] = B[0+1*lda];
1566 			pB[2+0*bs] = B[0+2*lda];
1567 			pB[3+0*bs] = B[0+3*lda];
1568 			B  += 1;
1569 			pB += 1*bs;
1570 			}
1571 		}
1572 	if(ii<m1)
1573 		{
1574 		m2 = m1-ii;
1575 		if(bs<m2) m2 = bs;
1576 		for(j=0; j<m; j++)
1577 			{
1578 			for(i=0; i<m2; i++)
1579 				{
1580 				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
1581 				}
1582 			}
1583 		}
1584 	return;
1585 	}
1586 
1587 
1588 
1589 // convert a vector into a vector structure
blasfeo_pack_svec(int m,float * a,struct blasfeo_svec * sa,int ai)1590 void blasfeo_pack_svec(int m, float *a, struct blasfeo_svec *sa, int ai)
1591 	{
1592 	float *pa = sa->pa + ai;
1593 	int ii;
1594 	for(ii=0; ii<m; ii++)
1595 		pa[ii] = a[ii];
1596 	return;
1597 	}
1598 
1599 
1600 
1601 // convert a matrix structure into a matrix
blasfeo_unpack_smat(int m,int n,struct blasfeo_smat * sA,int ai,int aj,float * A,int lda)1602 void blasfeo_unpack_smat(int m, int n, struct blasfeo_smat *sA, int ai, int aj, float *A, int lda)
1603 	{
1604 	const int bs = 4;
1605 	int sda = sA->cn;
1606 	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1607 	int i, ii, jj;
1608 	int m0 = (bs-ai%bs)%bs;
1609 	if(m0>m)
1610 		m0 = m;
1611 	float *ptr_pA;
1612 	jj=0;
1613 	for(; jj<n-3; jj+=4)
1614 		{
1615 		ptr_pA = pA + jj*bs;
1616 		ii = 0;
1617 		if(m0>0)
1618 			{
1619 			for(; ii<m0; ii++)
1620 				{
1621 				// unroll 0
1622 				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
1623 				// unroll 1
1624 				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
1625 				// unroll 2
1626 				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
1627 				// unroll 3
1628 				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
1629 				ptr_pA++;
1630 				}
1631 			ptr_pA += (sda-1)*bs;
1632 			}
1633 		for(; ii<m-bs+1; ii+=bs)
1634 			{
1635 			// unroll 0
1636 			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
1637 			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
1638 			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
1639 			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
1640 			// unroll 0
1641 			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
1642 			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
1643 			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
1644 			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
1645 			// unroll 0
1646 			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
1647 			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
1648 			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
1649 			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
1650 			// unroll 0
1651 			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
1652 			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
1653 			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
1654 			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
1655 			ptr_pA += sda*bs;
1656 			}
1657 		for(; ii<m; ii++)
1658 			{
1659 			// unroll 0
1660 			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
1661 			// unroll 1
1662 			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
1663 			// unroll 2
1664 			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
1665 			// unroll 3
1666 			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
1667 			ptr_pA++;
1668 			}
1669 		}
1670 	for(; jj<n; jj++)
1671 		{
1672 		ptr_pA = pA + jj*bs;
1673 		ii = 0;
1674 		if(m0>0)
1675 			{
1676 			for(; ii<m0; ii++)
1677 				{
1678 				A[ii+lda*jj] = ptr_pA[0];
1679 				ptr_pA++;
1680 				}
1681 			ptr_pA += (sda-1)*bs;
1682 			}
1683 		for(; ii<m-bs+1; ii+=bs)
1684 			{
1685 			A[0+ii+lda*jj] = ptr_pA[0];
1686 			A[1+ii+lda*jj] = ptr_pA[1];
1687 			A[2+ii+lda*jj] = ptr_pA[2];
1688 			A[3+ii+lda*jj] = ptr_pA[3];
1689 			ptr_pA += sda*bs;
1690 			}
1691 		for(; ii<m; ii++)
1692 			{
1693 			A[ii+lda*jj] = ptr_pA[0];
1694 			ptr_pA++;
1695 			}
1696 		}
1697 	return;
1698 	}
1699 
1700 
1701 
1702 // convert and transpose a matrix structure into a matrix
blasfeo_unpack_tran_smat(int m,int n,struct blasfeo_smat * sA,int ai,int aj,float * A,int lda)1703 void blasfeo_unpack_tran_smat(int m, int n, struct blasfeo_smat *sA, int ai, int aj, float *A, int lda)
1704 	{
1705 	const int bs = 4;
1706 	int sda = sA->cn;
1707 	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
1708 	int i, ii, jj;
1709 	int m0 = (bs-ai%bs)%bs;
1710 	if(m0>m)
1711 		m0 = m;
1712 	float *ptr_pA;
1713 	jj=0;
1714 	for(; jj<n-3; jj+=4)
1715 		{
1716 		ptr_pA = pA + jj*bs;
1717 		ii = 0;
1718 		if(m0>0)
1719 			{
1720 			for(; ii<m0; ii++)
1721 				{
1722 				// unroll 0
1723 				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
1724 				// unroll 1
1725 				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
1726 				// unroll 2
1727 				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
1728 				// unroll 3
1729 				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
1730 				ptr_pA++;
1731 				}
1732 			ptr_pA += (sda-1)*bs;
1733 			}
1734 		for(; ii<m-bs+1; ii+=bs)
1735 			{
1736 			// unroll 0
1737 			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
1738 			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
1739 			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
1740 			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
1741 			// unroll 1
1742 			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
1743 			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
1744 			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
1745 			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
1746 			// unroll 2
1747 			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
1748 			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
1749 			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
1750 			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
1751 			// unroll 3
1752 			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
1753 			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
1754 			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
1755 			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
1756 			ptr_pA += sda*bs;
1757 			}
1758 		for(; ii<m; ii++)
1759 			{
1760 			// unroll 0
1761 			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
1762 			// unroll 1
1763 			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
1764 			// unroll 2
1765 			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
1766 			// unroll 3
1767 			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
1768 			ptr_pA++;
1769 			}
1770 		}
1771 	for(; jj<n; jj++)
1772 		{
1773 		ptr_pA = pA + jj*bs;
1774 		ii = 0;
1775 		if(m0>0)
1776 			{
1777 			for(; ii<m0; ii++)
1778 				{
1779 				A[jj+lda*ii] = ptr_pA[0];
1780 				ptr_pA++;
1781 				}
1782 			ptr_pA += (sda-1)*bs;
1783 			}
1784 		for(; ii<m-bs+1; ii+=bs)
1785 			{
1786 			i=0;
1787 			for(; i<bs; i++)
1788 				{
1789 				A[jj+lda*(i+ii)] = ptr_pA[0];
1790 				ptr_pA++;
1791 				}
1792 			ptr_pA += (sda-1)*bs;
1793 			}
1794 		for(; ii<m; ii++)
1795 			{
1796 			A[jj+lda*ii] = ptr_pA[0];
1797 			ptr_pA++;
1798 			}
1799 		}
1800 	return;
1801 	}
1802 
1803 
1804 
1805 // convert a vector structure into a vector
blasfeo_unpack_svec(int m,struct blasfeo_svec * sa,int ai,float * a)1806 void blasfeo_unpack_svec(int m, struct blasfeo_svec *sa, int ai, float *a)
1807 	{
1808 	float *pa = sa->pa + ai;
1809 	int ii;
1810 	for(ii=0; ii<m; ii++)
1811 		a[ii] = pa[ii];
1812 	return;
1813 	}
1814 
1815 
1816 
1817 // cast a matrix into a matrix structure
s_cast_mat2strmat(float * A,struct blasfeo_smat * sA)1818 void s_cast_mat2strmat(float *A, struct blasfeo_smat *sA)
1819 	{
1820 
1821 	// invalidate stored inverse diagonal
1822 	sA->use_dA = 0;
1823 
1824 	sA->pA = A;
1825 	return;
1826 	}
1827 
1828 
1829 
1830 // cast a matrix into the diagonal of a matrix structure
s_cast_diag_mat2strmat(float * dA,struct blasfeo_smat * sA)1831 void s_cast_diag_mat2strmat(float *dA, struct blasfeo_smat *sA)
1832 	{
1833 
1834 	// invalidate stored inverse diagonal
1835 	sA->use_dA = 0;
1836 
1837 	sA->dA = dA;
1838 	return;
1839 	}
1840 
1841 
1842 
1843 // cast a vector into a vector structure
s_cast_vec2vecmat(float * a,struct blasfeo_svec * sa)1844 void s_cast_vec2vecmat(float *a, struct blasfeo_svec *sa)
1845 	{
1846 	sa->pa = a;
1847 	return;
1848 	}
1849 
1850 
1851 
1852 // insert element into strmat
blasfeo_sgein1(float a,struct blasfeo_smat * sA,int ai,int aj)1853 void blasfeo_sgein1(float a, struct blasfeo_smat *sA, int ai, int aj)
1854 	{
1855 
1856 	if (ai == aj)
1857 		{
1858 		// invalidate stored inverse diagonal
1859 		sA->use_dA = 0;
1860 		}
1861 
1862 	const int bs = 4;
1863 	int sda = sA->cn;
1864 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1865 	pA[0] = a;
1866 	return;
1867 	}
1868 
1869 
1870 
1871 // extract element from strmat
blasfeo_sgeex1(struct blasfeo_smat * sA,int ai,int aj)1872 float blasfeo_sgeex1(struct blasfeo_smat *sA, int ai, int aj)
1873 	{
1874 	const int bs = 4;
1875 	int sda = sA->cn;
1876 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1877 	return pA[0];
1878 	}
1879 
1880 
1881 
1882 // insert element into strvec
blasfeo_svecin1(float a,struct blasfeo_svec * sx,int xi)1883 void blasfeo_svecin1(float a, struct blasfeo_svec *sx, int xi)
1884 	{
1885 	const int bs = 4;
1886 	float *x = sx->pa + xi;
1887 	x[0] = a;
1888 	return;
1889 	}
1890 
1891 
1892 
1893 // extract element from strvec
blasfeo_svecex1(struct blasfeo_svec * sx,int xi)1894 float blasfeo_svecex1(struct blasfeo_svec *sx, int xi)
1895 	{
1896 	const int bs = 4;
1897 	float *x = sx->pa + xi;
1898 	return x[0];
1899 	}
1900 
1901 
1902 
1903 // set all elements of a strmat to a value
blasfeo_sgese(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj)1904 void blasfeo_sgese(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj)
1905 	{
1906 
1907 	// invalidate stored inverse diagonal
1908 	sA->use_dA = 0;
1909 
1910 	const int bs = 4;
1911 	int sda = sA->cn;
1912 	float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
1913 	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
1914 	int ii, jj;
1915 	if(m0>0)
1916 		{
1917 		for(ii=0; ii<m0; ii++)
1918 			{
1919 			for(jj=0; jj<n; jj++)
1920 				{
1921 				pA[jj*bs] = alpha;
1922 				}
1923 			pA += 1;
1924 			}
1925 		pA += bs*(sda-1);
1926 		m -= m0;
1927 		}
1928 	for(ii=0; ii<m-3; ii+=4)
1929 		{
1930 		for(jj=0; jj<n; jj++)
1931 			{
1932 			pA[0+jj*bs] = alpha;
1933 			pA[1+jj*bs] = alpha;
1934 			pA[2+jj*bs] = alpha;
1935 			pA[3+jj*bs] = alpha;
1936 			}
1937 		pA += bs*sda;
1938 		}
1939 	for( ; ii<m; ii++)
1940 		{
1941 		for(jj=0; jj<n; jj++)
1942 			{
1943 			pA[jj*bs] = alpha;
1944 			}
1945 		pA += 1;
1946 		}
1947 	return;
1948 	}
1949 
1950 
1951 
1952 // set all elements of a strvec to a value
blasfeo_svecse(int m,float alpha,struct blasfeo_svec * sx,int xi)1953 void blasfeo_svecse(int m, float alpha, struct blasfeo_svec *sx, int xi)
1954 	{
1955 	float *x = sx->pa + xi;
1956 	int ii;
1957 	for(ii=0; ii<m; ii++)
1958 		x[ii] = alpha;
1959 	return;
1960 	}
1961 
1962 
1963 
1964 // extract diagonal to vector
blasfeo_sdiaex(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_svec * sx,int xi)1965 void blasfeo_sdiaex(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi)
1966 	{
1967 	const int bs = 4;
1968 	int sda = sA->cn;
1969 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1970 	float *x = sx->pa + xi;
1971 	sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
1972 	return;
1973 	}
1974 
1975 
1976 
1977 // insert a vector into diagonal
blasfeo_sdiain(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)1978 void blasfeo_sdiain(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
1979 	{
1980 
1981 	// invalidate stored inverse diagonal
1982 	sA->use_dA = 0;
1983 
1984 	const int bs = 4;
1985 	int sda = sA->cn;
1986 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
1987 	float *x = sx->pa + xi;
1988 	int offsetA = ai%bs;
1989 
1990 	int kna = (bs-offsetA%bs)%bs;
1991 	kna = kmax<kna ? kmax : kna;
1992 
1993 	int jj, ll;
1994 
1995 	if(kna>0)
1996 		{
1997 		for(ll=0; ll<kna; ll++)
1998 			{
1999 			pA[ll+bs*ll] = alpha*x[ll];
2000 			}
2001 		pA += kna + bs*(sda-1) + kna*bs;
2002 		x  += kna;
2003 		kmax -= kna;
2004 		}
2005 	for(jj=0; jj<kmax-3; jj+=4)
2006 		{
2007 		pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
2008 		pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
2009 		pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
2010 		pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
2011 		}
2012 	for(ll=0; ll<kmax-jj; ll++)
2013 		{
2014 		pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
2015 		}
2016 	return;
2017 	}
2018 
2019 
2020 
2021 // add scalar to diagonal
blasfeo_sdiare(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj)2022 void blasfeo_sdiare(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj)
2023 	{
2024 
2025 	// invalidate stored inverse diagonal
2026 	sA->use_dA = 0;
2027 
2028 	const int bs = 4;
2029 	int sda = sA->cn;
2030 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2031 	int offsetA = ai%bs;
2032 
2033 	int kna = (bs-offsetA%bs)%bs;
2034 	kna = kmax<kna ? kmax : kna;
2035 
2036 	int jj, ll;
2037 
2038 	if(kna>0)
2039 		{
2040 		for(ll=0; ll<kna; ll++)
2041 			{
2042 			pA[ll+bs*ll] += alpha;
2043 			}
2044 		pA += kna + bs*(sda-1) + kna*bs;
2045 		kmax -= kna;
2046 		}
2047 	for(jj=0; jj<kmax-3; jj+=4)
2048 		{
2049 		pA[jj*sda+(jj+0)*bs+0] += alpha;
2050 		pA[jj*sda+(jj+1)*bs+1] += alpha;
2051 		pA[jj*sda+(jj+2)*bs+2] += alpha;
2052 		pA[jj*sda+(jj+3)*bs+3] += alpha;
2053 		}
2054 	for(ll=0; ll<kmax-jj; ll++)
2055 		{
2056 		pA[jj*sda+(jj+ll)*bs+ll] += alpha;
2057 		}
2058 	return;
2059 	}
2060 
2061 
2062 
2063 // swap two rows of two matrix structs
blasfeo_srowsw(int kmax,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)2064 void blasfeo_srowsw(int kmax, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
2065 	{
2066 
2067 	// invalidate stored inverse diagonal
2068 	sA->use_dA = 0;
2069 	sC->use_dA = 0;
2070 
2071 	const int bs = 4;
2072 	int sda = sA->cn;
2073 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2074 	int sdc = sC->cn;
2075 	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
2076 	srowsw_lib(kmax, pA, pC);
2077 	return;
2078 	}
2079 
2080 
2081 
2082 // permute the rows of a matrix struct
blasfeo_srowpe(int kmax,int * ipiv,struct blasfeo_smat * sA)2083 void blasfeo_srowpe(int kmax, int *ipiv, struct blasfeo_smat *sA)
2084 	{
2085 
2086 	// invalidate stored inverse diagonal
2087 	sA->use_dA = 0;
2088 
2089 	int ii;
2090 	for(ii=0; ii<kmax; ii++)
2091 		{
2092 		if(ipiv[ii]!=ii)
2093 			blasfeo_srowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
2094 		}
2095 	return;
2096 	}
2097 
2098 
2099 // inverse permute the rows of a matrix struct
blasfeo_srowpei(int kmax,int * ipiv,struct blasfeo_smat * sA)2100 void blasfeo_srowpei(int kmax, int *ipiv, struct blasfeo_smat *sA)
2101 	{
2102 
2103 	// invalidate stored inverse diagonal
2104 	sA->use_dA = 0;
2105 
2106 	int ii;
2107 	for(ii=kmax-1; ii>=0; ii--)
2108 		{
2109 		if(ipiv[ii]!=ii)
2110 			blasfeo_srowsw(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
2111 		}
2112 	return;
2113 	}
2114 
2115 
2116 // extract a row int a vector
blasfeo_srowex(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_svec * sx,int xi)2117 void blasfeo_srowex(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi)
2118 	{
2119 	const int bs = 4;
2120 	int sda = sA->cn;
2121 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2122 	float *x = sx->pa + xi;
2123 	srowex_lib(kmax, alpha, pA, x);
2124 	return;
2125 	}
2126 
2127 
2128 
2129 // insert a vector into a row
blasfeo_srowin(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2130 void blasfeo_srowin(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2131 	{
2132 
2133 	// invalidate stored inverse diagonal
2134 	sA->use_dA = 0;
2135 
2136 	const int bs = 4;
2137 	int sda = sA->cn;
2138 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2139 	float *x = sx->pa + xi;
2140 	srowin_lib(kmax, alpha, x, pA);
2141 	return;
2142 	}
2143 
2144 
2145 
2146 // add a vector to a row
blasfeo_srowad(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2147 void blasfeo_srowad(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2148 	{
2149 
2150 	// invalidate stored inverse diagonal
2151 	sA->use_dA = 0;
2152 
2153 	const int bs = 4;
2154 	int sda = sA->cn;
2155 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2156 	float *x = sx->pa + xi;
2157 	srowad_lib(kmax, alpha, x, pA);
2158 	return;
2159 	}
2160 
2161 
2162 
2163 // extract vector from column
blasfeo_scolex(int kmax,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_svec * sx,int xi)2164 void blasfeo_scolex(int kmax, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi)
2165 	{
2166 
2167 	// invalidate stored inverse diagonal
2168 	sA->use_dA = 0;
2169 
2170 	const int bs = 4;
2171 	int sda = sA->cn;
2172 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2173 	float *x = sx->pa + xi;
2174 	scolex_lib(kmax, ai%bs, pA, sda, x);
2175 	return;
2176 	}
2177 
2178 
2179 
2180 // insert as vector as a column
blasfeo_scolin(int kmax,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2181 void blasfeo_scolin(int kmax, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2182 	{
2183 
2184 	// invalidate stored inverse diagonal
2185 	sA->use_dA = 0;
2186 
2187 	const int bs = 4;
2188 	int sda = sA->cn;
2189 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2190 	float *x = sx->pa + xi;
2191 	scolin_lib(kmax, x, ai%bs, pA, sda);
2192 	return;
2193 	}
2194 
2195 
2196 
2197 // add scaled vector to column
blasfeo_scolad(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_smat * sA,int ai,int aj)2198 void blasfeo_scolad(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_smat *sA, int ai, int aj)
2199 	{
2200 
2201 	// invalidate stored inverse diagonal
2202 	sA->use_dA = 0;
2203 
2204 	const int bs = 4;
2205 	int sda = sA->cn;
2206 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2207 	float *x = sx->pa + xi;
2208 	scolad_lib(kmax, alpha, x, ai%bs, pA, sda);
2209 	return;
2210 	}
2211 
2212 
2213 
2214 // scale a column
blasfeo_scolsc(int kmax,float alpha,struct blasfeo_smat * sA,int ai,int aj)2215 void blasfeo_scolsc(int kmax, float alpha, struct blasfeo_smat *sA, int ai, int aj)
2216 	{
2217 
2218 	// invalidate stored inverse diagonal
2219 	sA->use_dA = 0;
2220 
2221 	const int bs = 4;
2222 
2223 	int sda = sA->cn;
2224 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2225 
2226 	int kna = (bs-ai%bs)%bs;
2227 	kna = kmax<kna ? kmax : kna;
2228 
2229 	int jj, ll;
2230 
2231 	if(kna>0)
2232 		{
2233 		for(ll=0; ll<kna; ll++)
2234 			{
2235 			pA[ll] *= alpha;
2236 			}
2237 		pA += kna + bs*(sda-1);
2238 		kmax -= kna;
2239 		}
2240 	for(jj=0; jj<kmax-3; jj+=4)
2241 		{
2242 		pA[jj*sda+0] *= alpha;
2243 		pA[jj*sda+1] *= alpha;
2244 		pA[jj*sda+2] *= alpha;
2245 		pA[jj*sda+3] *= alpha;
2246 		}
2247 	for(ll=0; ll<kmax-jj; ll++)
2248 		{
2249 		pA[jj*sda+ll] *= alpha;
2250 		}
2251 
2252 	return;
2253 	}
2254 
2255 
2256 
2257 // swap two cols of two matrix structs
blasfeo_scolsw(int kmax,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)2258 void blasfeo_scolsw(int kmax, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
2259 	{
2260 
2261 	// invalidate stored inverse diagonal
2262 	sA->use_dA = 0;
2263 	sC->use_dA = 0;
2264 
2265 	const int bs = 4;
2266 	int sda = sA->cn;
2267 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
2268 	int sdc = sC->cn;
2269 	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
2270 	scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
2271 	return;
2272 	}
2273 
2274 
2275 
2276 // permute the cols of a matrix struct
blasfeo_scolpe(int kmax,int * ipiv,struct blasfeo_smat * sA)2277 void blasfeo_scolpe(int kmax, int *ipiv, struct blasfeo_smat *sA)
2278 	{
2279 
2280 	// invalidate stored inverse diagonal
2281 	sA->use_dA = 0;
2282 
2283 	int ii;
2284 	for(ii=0; ii<kmax; ii++)
2285 		{
2286 		if(ipiv[ii]!=ii)
2287 			blasfeo_scolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
2288 		}
2289 	return;
2290 	}
2291 
2292 
2293 
2294 // inverse permute the cols of a matrix struct
blasfeo_scolpei(int kmax,int * ipiv,struct blasfeo_smat * sA)2295 void blasfeo_scolpei(int kmax, int *ipiv, struct blasfeo_smat *sA)
2296 	{
2297 
2298 	// invalidate stored inverse diagonal
2299 	sA->use_dA = 0;
2300 
2301 	int ii;
2302 	for(ii=kmax-1; ii>=0; ii--)
2303 		{
2304 		if(ipiv[ii]!=ii)
2305 			blasfeo_scolsw(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
2306 		}
2307 	return;
2308 	}
2309 
2310 
2311 
2312 // --- ge
2313 
2314 
2315 // scale a generic strmat
blasfeo_sgesc(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj)2316 void blasfeo_sgesc(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj)
2317 	{
2318 
2319 	if(m<=0 | n<=0)
2320 		return;
2321 
2322 	// invalidate stored inverse diagonal
2323 	sA->use_dA = 0;
2324 
2325 #if defined(DIM_CHECK)
2326 	// non-negative size
2327 	if(m<0) printf("\n****** blasfeo_sgesc : m<0 : %d<0 *****\n", m);
2328 	if(n<0) printf("\n****** blasfeo_sgesc : n<0 : %d<0 *****\n", n);
2329 	// non-negative offset
2330 	if(ai<0) printf("\n****** blasfeo_sgesc : ai<0 : %d<0 *****\n", ai);
2331 	if(aj<0) printf("\n****** blasfeo_sgesc : aj<0 : %d<0 *****\n", aj);
2332 	// inside matrix
2333 	// A: m x n
2334 	if(ai+m > sA->m) printf("\n***** blasfeo_sgesc : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
2335 	if(aj+n > sA->n) printf("\n***** blasfeo_sgesc : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
2336 #endif
2337 
2338 	const int bs = 4;
2339 
2340 	int mna, ii;
2341 
2342 	int sda = sA->cn;
2343 	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
2344 	int offA = ai%bs;
2345 
2346 	// same alignment
2347 	ii = 0;
2348 	// clean up at the beginning
2349 	mna = (4-offA)%bs;
2350 	if(mna>0)
2351 		{
2352 		if(m<mna) // mna<=3  ==>  m = { 1, 2 }
2353 			{
2354 			if(m==1)
2355 				{
2356 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pA+offA);
2357 				return;
2358 				}
2359 			else //if(m==2 && mna==3)
2360 				{
2361 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pA+offA);
2362 				return;
2363 				}
2364 			}
2365 		if(mna==1)
2366 			{
2367 			kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pA+offA);
2368 			pA += 4*sda;
2369 			ii += 1;
2370 			}
2371 		else if(mna==2)
2372 			{
2373 			kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pA+offA);
2374 			pA += 4*sda;
2375 			ii += 2;
2376 			}
2377 		else // if(mna==3)
2378 			{
2379 			kernel_sgecpsc_3_0_lib4(n, &alpha, pA+offA, pA+offA);
2380 			pA += 4*sda;
2381 			ii += 3;
2382 			}
2383 		}
2384 	// main loop
2385 	for(; ii<m-3; ii+=4)
2386 		{
2387 		kernel_sgecpsc_4_0_lib4(n, &alpha, pA, pA);
2388 		pA += 4*sda;
2389 		}
2390 	// clean up at the end
2391 	if(ii<m)
2392 		{
2393 		if(m-ii==1)
2394 			kernel_sgecpsc_1_0_lib4(n, &alpha, pA, pA);
2395 		else if(m-ii==2)
2396 			kernel_sgecpsc_2_0_lib4(n, &alpha, pA, pA);
2397 		else // if(m-ii==3)
2398 			kernel_sgecpsc_3_0_lib4(n, &alpha, pA, pA);
2399 		}
2400 
2401 	return;
2402 
2403 	}
2404 
2405 
2406 
2407 // copy and scale a generic strmat into a generic strmat
blasfeo_sgecpsc(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)2408 void blasfeo_sgecpsc(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
2409 	{
2410 
2411 	if(m<=0 | n<=0)
2412 		return;
2413 
2414 	// invalidate stored inverse diagonal
2415 	sB->use_dA = 0;
2416 
2417 #if defined(DIM_CHECK)
2418 	// non-negative size
2419 	if(m<0) printf("\n****** blasfeo_sgecpsc : m<0 : %d<0 *****\n", m);
2420 	if(n<0) printf("\n****** blasfeo_sgecpsc : n<0 : %d<0 *****\n", n);
2421 	// non-negative offset
2422 	if(ai<0) printf("\n****** blasfeo_sgecpsc : ai<0 : %d<0 *****\n", ai);
2423 	if(aj<0) printf("\n****** blasfeo_sgecpsc : aj<0 : %d<0 *****\n", aj);
2424 	if(bi<0) printf("\n****** blasfeo_sgecpsc : bi<0 : %d<0 *****\n", bi);
2425 	if(bj<0) printf("\n****** blasfeo_sgecpsc : bj<0 : %d<0 *****\n", bj);
2426 	// inside matrix
2427 	// A: m x n
2428 	if(ai+m > sA->m) printf("\n***** blasfeo_sgecpsc : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
2429 	if(aj+n > sA->n) printf("\n***** blasfeo_sgecpsc : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
2430 	// B: m x n
2431 	if(bi+m > sB->m) printf("\n***** blasfeo_sgecpsc : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
2432 	if(bj+n > sB->n) printf("\n***** blasfeo_sgecpsc : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
2433 #endif
2434 
2435 	const int bs = 4;
2436 
2437 	int mna, ii;
2438 
2439 	int sda = sA->cn;
2440 	int sdb = sB->cn;
2441 	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
2442 	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
2443 	int offA = ai%bs;
2444 	int offB = bi%bs;
2445 
2446 	// same alignment
2447 	if(offA==offB)
2448 		{
2449 		ii = 0;
2450 		// clean up at the beginning
2451 		mna = (4-offB)%bs;
2452 		if(mna>0)
2453 			{
2454 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
2455 				{
2456 				if(m==1)
2457 					{
2458 					kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2459 					return;
2460 					}
2461 				else //if(m==2 && mna==3)
2462 					{
2463 					kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2464 					return;
2465 					}
2466 				}
2467 			if(mna==1)
2468 				{
2469 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2470 				pA += 4*sda;
2471 				pB += 4*sdb;
2472 				ii += 1;
2473 				}
2474 			else if(mna==2)
2475 				{
2476 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2477 				pA += 4*sda;
2478 				pB += 4*sdb;
2479 				ii += 2;
2480 				}
2481 			else // if(mna==3)
2482 				{
2483 				kernel_sgecpsc_3_0_lib4(n, &alpha, pA+offA, pB+offB);
2484 				pA += 4*sda;
2485 				pB += 4*sdb;
2486 				ii += 3;
2487 				}
2488 			}
2489 		// main loop
2490 		for(; ii<m-3; ii+=4)
2491 			{
2492 			kernel_sgecpsc_4_0_lib4(n, &alpha, pA, pB);
2493 			pA += 4*sda;
2494 			pB += 4*sdb;
2495 			}
2496 		// clean up at the end
2497 		if(ii<m)
2498 			{
2499 			if(m-ii==1)
2500 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA, pB);
2501 			else if(m-ii==2)
2502 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA, pB);
2503 			else // if(m-ii==3)
2504 				kernel_sgecpsc_3_0_lib4(n, &alpha, pA, pB);
2505 			}
2506 		}
2507 	// skip one element of pA
2508 	else if(offA==(offB+1)%bs)
2509 		{
2510 		ii = 0;
2511 		// clean up at the beginning
2512 		mna = (4-offB)%bs;
2513 		if(mna>0)
2514 			{
2515 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
2516 				{
2517 				if(m==1)
2518 					{
2519 					kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2520 					return;
2521 					}
2522 				else //if(m==2 && mna==3)
2523 					{
2524 					kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2525 					return;
2526 					}
2527 				}
2528 			if(mna==1)
2529 				{
2530 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2531 				//pA += 4*sda;
2532 				pB += 4*sdb;
2533 				ii += 1;
2534 				}
2535 			else if(mna==2)
2536 				{
2537 				kernel_sgecpsc_2_3_lib4(n, &alpha, pA, sda, pB+2);
2538 				pA += 4*sda;
2539 				pB += 4*sdb;
2540 				ii += 2;
2541 				}
2542 			else // if(mna==3)
2543 				{
2544 				kernel_sgecpsc_3_2_lib4(n, &alpha, pA, sda, pB+1);
2545 				pA += 4*sda;
2546 				pB += 4*sdb;
2547 				ii += 3;
2548 				}
2549 			}
2550 		// main loop
2551 		for( ; ii<m-3; ii+=4)
2552 			{
2553 			kernel_sgecpsc_4_1_lib4(n, &alpha, pA, sda, pB);
2554 			pA += 4*sda;
2555 			pB += 4*sdb;
2556 			}
2557 		// clean up at the end
2558 		if(ii<m)
2559 			{
2560 			if(m-ii==1)
2561 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+1, pB);
2562 			else if(m-ii==2)
2563 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA+1, pB);
2564 			else // if(m-ii==3)
2565 				kernel_sgecpsc_3_0_lib4(n, &alpha, pA+1, pB);
2566 			}
2567 		}
2568 	// skip 2 elements of pA
2569 	else if(offA==(offB+2)%bs)
2570 		{
2571 		ii = 0;
2572 		// clean up at the beginning
2573 		mna = (4-offB)%bs;
2574 		if(mna>0)
2575 			{
2576 			if(m<mna)
2577 				{
2578 				if(m==1)
2579 					{
2580 					kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2581 					return;
2582 					}
2583 				else // if(m==2 && mna==3)
2584 					{
2585 					kernel_sgecpsc_2_3_lib4(n, &alpha, pA, sda, pB+1);
2586 					return;
2587 					}
2588 				}
2589 			if(mna==1)
2590 				{
2591 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+1, pB+3);
2592 				// pA += 4*sda;
2593 				pB += 4*sdb;
2594 				ii += 1;
2595 				}
2596 			else if(mna==2)
2597 				{
2598 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA, pB+2);
2599 				// pA += 4*sda;
2600 				pB += 4*sdb;
2601 				ii += 2;
2602 				}
2603 			else // if(mna==3)
2604 				{
2605 				kernel_sgecpsc_3_3_lib4(n, &alpha, pA, sda, pB+1);
2606 				pA += 4*sda;
2607 				pB += 4*sdb;
2608 				ii += 3;
2609 				}
2610 			}
2611 		// main loop
2612 		for(; ii<m-3; ii+=4)
2613 			{
2614 			kernel_sgecpsc_4_2_lib4(n, &alpha, pA, sda, pB);
2615 			pA += 4*sda;
2616 			pB += 4*sdb;
2617 			}
2618 		// clean up at the end
2619 		if(ii<m)
2620 			{
2621 			if(m-ii==1)
2622 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+2, pB);
2623 			else if(m-ii==2)
2624 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA+2, pB);
2625 			else // if(m-ii==3)
2626 				kernel_sgecpsc_3_2_lib4(n, &alpha, pA, sda, pB);
2627 			}
2628 		}
2629 	// skip 3 elements of pA
2630 	else // if(offA==(offB+3)%bs)
2631 		{
2632 		ii = 0;
2633 		// clean up at the beginning
2634 		mna = (4-offB)%bs;
2635 
2636 		if(mna>0)
2637 			{
2638 
2639 			if(m<mna)
2640 				{
2641 				if(m==1)
2642 					{
2643 					kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2644 					return;
2645 					}
2646 				else // if(m==2 && mna==3)
2647 					{
2648 					kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2649 					return;
2650 					}
2651 				}
2652 
2653 			if(mna==1)
2654 				{
2655 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+offA, pB+offB);
2656 				// pA += 4*sda;
2657 				pB += 4*sdb;
2658 				ii += 1;
2659 				}
2660 			else if(mna==2)
2661 				{
2662 				kernel_sgecpsc_2_0_lib4(n, &alpha, pA+offA, pB+offB);
2663 				// pA += 4*sda;
2664 				pB += 4*sdb;
2665 				ii += 2;
2666 				}
2667 			else // if(mna==3)
2668 				{
2669 				kernel_sgecpsc_3_0_lib4(n, &alpha, pA+offA, pB+offB);
2670 				// pA += 4*sda;
2671 				pB += 4*sdb;
2672 				ii += 3;
2673 				}
2674 
2675 			}
2676 
2677 		// main loop
2678 
2679 		for(; ii<m-3; ii+=4)
2680 			{
2681 			kernel_sgecpsc_4_3_lib4(n, &alpha, pA, sda, pB);
2682 			pA += 4*sda;
2683 			pB += 4*sdb;
2684 			}
2685 
2686 		// clean up at the end
2687 		if(ii<m)
2688 			{
2689 			if(m-ii==1)
2690 				kernel_sgecpsc_1_0_lib4(n, &alpha, pA+3, pB);
2691 			else if(m-ii==2)
2692 				kernel_sgecpsc_2_3_lib4(n, &alpha, pA, sda, pB);
2693 			else // if(m-ii==3)
2694 				kernel_sgecpsc_3_3_lib4(n, &alpha, pA, sda, pB);
2695 			}
2696 		}
2697 
2698 	return;
2699 
2700 	}
2701 
2702 
2703 
2704 // copy a generic strmat into a generic strmat
blasfeo_sgecp(int m,int n,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)2705 void blasfeo_sgecp(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
2706 	{
2707 
2708 	if(m<=0 | n<=0)
2709 		return;
2710 
2711 	// invalidate stored inverse diagonal
2712 	sB->use_dA = 0;
2713 
2714 #if defined(DIM_CHECK)
2715 	// non-negative size
2716 	if(m<0) printf("\n****** blasfeo_sgecp : m<0 : %d<0 *****\n", m);
2717 	if(n<0) printf("\n****** blasfeo_sgecp : n<0 : %d<0 *****\n", n);
2718 	// non-negative offset
2719 	if(ai<0) printf("\n****** blasfeo_sgecp : ai<0 : %d<0 *****\n", ai);
2720 	if(aj<0) printf("\n****** blasfeo_sgecp : aj<0 : %d<0 *****\n", aj);
2721 	if(bi<0) printf("\n****** blasfeo_sgecp : bi<0 : %d<0 *****\n", bi);
2722 	if(bj<0) printf("\n****** blasfeo_sgecp : bj<0 : %d<0 *****\n", bj);
2723 	// inside matrix
2724 	// A: m x n
2725 	if(ai+m > sA->m) printf("\n***** blasfeo_sgecp : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
2726 	if(aj+n > sA->n) printf("\n***** blasfeo_sgecp : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
2727 	// B: m x n
2728 	if(bi+m > sB->m) printf("\n***** blasfeo_sgecp : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
2729 	if(bj+n > sB->n) printf("\n***** blasfeo_sgecp : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
2730 #endif
2731 
2732 	const int bs = 4;
2733 
2734 	int mna, ii;
2735 
2736 	int sda = sA->cn;
2737 	int sdb = sB->cn;
2738 	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
2739 	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
2740 	int offA = ai%bs;
2741 	int offB = bi%bs;
2742 
2743 	// same alignment
2744 	if(offA==offB)
2745 		{
2746 		ii = 0;
2747 		// clean up at the beginning
2748 		mna = (4-offB)%bs;
2749 		if(mna>0)
2750 			{
2751 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
2752 				{
2753 				if(m==1)
2754 					{
2755 					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2756 					return;
2757 					}
2758 				else //if(m==2 && mna==3)
2759 					{
2760 					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2761 					return;
2762 					}
2763 				}
2764 			if(mna==1)
2765 				{
2766 				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2767 				pA += 4*sda;
2768 				pB += 4*sdb;
2769 				ii += 1;
2770 				}
2771 			else if(mna==2)
2772 				{
2773 				kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2774 				pA += 4*sda;
2775 				pB += 4*sdb;
2776 				ii += 2;
2777 				}
2778 			else // if(mna==3)
2779 				{
2780 				kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
2781 				pA += 4*sda;
2782 				pB += 4*sdb;
2783 				ii += 3;
2784 				}
2785 			}
2786 		// main loop
2787 		for(; ii<m-3; ii+=4)
2788 			{
2789 			kernel_sgecp_4_0_lib4(n, pA, pB);
2790 			pA += 4*sda;
2791 			pB += 4*sdb;
2792 			}
2793 		// clean up at the end
2794 		if(ii<m)
2795 			{
2796 			if(m-ii==1)
2797 				kernel_sgecp_1_0_lib4(n, pA, pB);
2798 			else if(m-ii==2)
2799 				kernel_sgecp_2_0_lib4(n, pA, pB);
2800 			else // if(m-ii==3)
2801 				kernel_sgecp_3_0_lib4(n, pA, pB);
2802 			}
2803 		}
2804 	// skip one element of pA
2805 	else if(offA==(offB+1)%bs)
2806 		{
2807 		ii = 0;
2808 		// clean up at the beginning
2809 		mna = (4-offB)%bs;
2810 		if(mna>0)
2811 			{
2812 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
2813 				{
2814 				if(m==1)
2815 					{
2816 					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2817 					return;
2818 					}
2819 				else //if(m==2 && mna==3)
2820 					{
2821 					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2822 					return;
2823 					}
2824 				}
2825 			if(mna==1)
2826 				{
2827 				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2828 				//pA += 4*sda;
2829 				pB += 4*sdb;
2830 				ii += 1;
2831 				}
2832 			else if(mna==2)
2833 				{
2834 				kernel_sgecp_2_3_lib4(n, pA, sda, pB+2);
2835 				pA += 4*sda;
2836 				pB += 4*sdb;
2837 				ii += 2;
2838 				}
2839 			else // if(mna==3)
2840 				{
2841 				kernel_sgecp_3_2_lib4(n, pA, sda, pB+1);
2842 				pA += 4*sda;
2843 				pB += 4*sdb;
2844 				ii += 3;
2845 				}
2846 			}
2847 		// main loop
2848 		for( ; ii<m-3; ii+=4)
2849 			{
2850 			kernel_sgecp_4_1_lib4(n, pA, sda, pB);
2851 			pA += 4*sda;
2852 			pB += 4*sdb;
2853 			}
2854 		// clean up at the end
2855 		if(ii<m)
2856 			{
2857 			if(m-ii==1)
2858 				kernel_sgecp_1_0_lib4(n, pA+1, pB);
2859 			else if(m-ii==2)
2860 				kernel_sgecp_2_0_lib4(n, pA+1, pB);
2861 			else // if(m-ii==3)
2862 				kernel_sgecp_3_0_lib4(n, pA+1, pB);
2863 			}
2864 		}
2865 	// skip 2 elements of pA
2866 	else if(offA==(offB+2)%bs)
2867 		{
2868 		ii = 0;
2869 		// clean up at the beginning
2870 		mna = (4-offB)%bs;
2871 		if(mna>0)
2872 			{
2873 			if(m<mna)
2874 				{
2875 				if(m==1)
2876 					{
2877 					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2878 					return;
2879 					}
2880 				else // if(m==2 && mna==3)
2881 					{
2882 					kernel_sgecp_2_3_lib4(n, pA, sda, pB+1);
2883 					return;
2884 					}
2885 				}
2886 			if(mna==1)
2887 				{
2888 				kernel_sgecp_1_0_lib4(n, pA+1, pB+3);
2889 				// pA += 4*sda;
2890 				pB += 4*sdb;
2891 				ii += 1;
2892 				}
2893 			else if(mna==2)
2894 				{
2895 				kernel_sgecp_2_0_lib4(n, pA, pB+2);
2896 				// pA += 4*sda;
2897 				pB += 4*sdb;
2898 				ii += 2;
2899 				}
2900 			else // if(mna==3)
2901 				{
2902 				kernel_sgecp_3_3_lib4(n, pA, sda, pB+1);
2903 				pA += 4*sda;
2904 				pB += 4*sdb;
2905 				ii += 3;
2906 				}
2907 			}
2908 		// main loop
2909 		for(; ii<m-3; ii+=4)
2910 			{
2911 			kernel_sgecp_4_2_lib4(n, pA, sda, pB);
2912 			pA += 4*sda;
2913 			pB += 4*sdb;
2914 			}
2915 		// clean up at the end
2916 		if(ii<m)
2917 			{
2918 			if(m-ii==1)
2919 				kernel_sgecp_1_0_lib4(n, pA+2, pB);
2920 			else if(m-ii==2)
2921 				kernel_sgecp_2_0_lib4(n, pA+2, pB);
2922 			else // if(m-ii==3)
2923 				kernel_sgecp_3_2_lib4(n, pA, sda, pB);
2924 			}
2925 		}
2926 	// skip 3 elements of pA
2927 	else // if(offA==(offB+3)%bs)
2928 		{
2929 		ii = 0;
2930 		// clean up at the beginning
2931 		mna = (4-offB)%bs;
2932 		if(mna>0)
2933 			{
2934 			if(m<mna)
2935 				{
2936 				if(m==1)
2937 					{
2938 					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2939 					return;
2940 					}
2941 				else // if(m==2 && mna==3)
2942 					{
2943 					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2944 					return;
2945 					}
2946 				}
2947 			if(mna==1)
2948 				{
2949 				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
2950 				// pA += 4*sda;
2951 				pB += 4*sdb;
2952 				ii += 1;
2953 				}
2954 			else if(mna==2)
2955 				{
2956 				kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
2957 				// pA += 4*sda;
2958 				pB += 4*sdb;
2959 				ii += 2;
2960 				}
2961 			else // if(mna==3)
2962 				{
2963 				kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
2964 				// pA += 4*sda;
2965 				pB += 4*sdb;
2966 				ii += 3;
2967 				}
2968 			}
2969 		// main loop
2970 		for(; ii<m-3; ii+=4)
2971 			{
2972 			kernel_sgecp_4_3_lib4(n, pA, sda, pB);
2973 			pA += 4*sda;
2974 			pB += 4*sdb;
2975 			}
2976 		// clean up at the end
2977 		if(ii<m)
2978 			{
2979 			if(m-ii==1)
2980 				kernel_sgecp_1_0_lib4(n, pA+3, pB);
2981 			else if(m-ii==2)
2982 				kernel_sgecp_2_3_lib4(n, pA, sda, pB);
2983 			else // if(m-ii==3)
2984 				kernel_sgecp_3_3_lib4(n, pA, sda, pB);
2985 			}
2986 		}
2987 
2988 	return;
2989 
2990 	}
2991 
2992 
2993 
2994 // scale a strvec
blasfeo_svecsc(int m,float alpha,struct blasfeo_svec * sa,int ai)2995 void blasfeo_svecsc(int m, float alpha, struct blasfeo_svec *sa, int ai)
2996 	{
2997 	float *pa = sa->pa + ai;
2998 	int ii;
2999 	ii = 0;
3000 	for(; ii<m-3; ii+=4)
3001 		{
3002 		pa[ii+0] *= alpha;
3003 		pa[ii+1] *= alpha;
3004 		pa[ii+2] *= alpha;
3005 		pa[ii+3] *= alpha;
3006 		}
3007 	for(; ii<m; ii++)
3008 		{
3009 		pa[ii+0] *= alpha;
3010 		}
3011 	return;
3012 	}
3013 
3014 
3015 
3016 // copy a strvec into a strvec
blasfeo_sveccp(int m,struct blasfeo_svec * sa,int ai,struct blasfeo_svec * sc,int ci)3017 void blasfeo_sveccp(int m, struct blasfeo_svec *sa, int ai, struct blasfeo_svec *sc, int ci)
3018 	{
3019 	float *pa = sa->pa + ai;
3020 	float *pc = sc->pa + ci;
3021 	int ii;
3022 	ii = 0;
3023 	for(; ii<m-3; ii+=4)
3024 		{
3025 		pc[ii+0] = pa[ii+0];
3026 		pc[ii+1] = pa[ii+1];
3027 		pc[ii+2] = pa[ii+2];
3028 		pc[ii+3] = pa[ii+3];
3029 		}
3030 	for(; ii<m; ii++)
3031 		{
3032 		pc[ii+0] = pa[ii+0];
3033 		}
3034 	return;
3035 	}
3036 
3037 
3038 
3039 // copy and scale a strvec into a strvec
blasfeo_sveccpsc(int m,float alpha,struct blasfeo_svec * sa,int ai,struct blasfeo_svec * sc,int ci)3040 void blasfeo_sveccpsc(int m, float alpha, struct blasfeo_svec *sa, int ai, struct blasfeo_svec *sc, int ci)
3041 	{
3042 	float *pa = sa->pa + ai;
3043 	float *pc = sc->pa + ci;
3044 	int ii;
3045 	ii = 0;
3046 	for(; ii<m-3; ii+=4)
3047 		{
3048 		pc[ii+0] = alpha*pa[ii+0];
3049 		pc[ii+1] = alpha*pa[ii+1];
3050 		pc[ii+2] = alpha*pa[ii+2];
3051 		pc[ii+3] = alpha*pa[ii+3];
3052 		}
3053 	for(; ii<m; ii++)
3054 		{
3055 		pc[ii+0] = alpha*pa[ii+0];
3056 		}
3057 	return;
3058 	}
3059 
3060 
3061 
3062 // copy a lower triangular strmat into a lower triangular strmat
blasfeo_strcp_l(int m,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)3063 void blasfeo_strcp_l(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
3064 	{
3065 
3066 	if(m<=0)
3067 		return;
3068 
3069 	// invalidate stored inverse diagonal
3070 	sB->use_dA = 0;
3071 
3072 	const int bs = 4;
3073 
3074 	int sda = sA->cn;
3075 	int sdb = sB->cn;
3076 	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
3077 	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
3078 	int offA = ai%bs;
3079 	int offB = bi%bs;
3080 
3081 	int ii, mna;
3082 
3083 	// same alignment
3084 	if(offA==offB)
3085 		{
3086 		ii = 0;
3087 		// clean up at the beginning
3088 		mna = (4-offB)%bs;
3089 		if(mna>0)
3090 			{
3091 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
3092 				{
3093 				if(m==1)
3094 					{
3095 					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3096 					return;
3097 					}
3098 				else //if(m==2 && mna==3)
3099 					{
3100 					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3101 					return;
3102 					}
3103 				}
3104 			if(mna==1)
3105 				{
3106 				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3107 				pA += 4*sda;
3108 				pB += 4*sdb;
3109 				ii += 1;
3110 				}
3111 			else if(mna==2)
3112 				{
3113 				kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3114 				pA += 4*sda;
3115 				pB += 4*sdb;
3116 				ii += 2;
3117 				}
3118 			else // if(mna==3)
3119 				{
3120 				kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
3121 				pA += 4*sda;
3122 				pB += 4*sdb;
3123 				ii += 3;
3124 				}
3125 			}
3126 		// main loop
3127 		for(; ii<m-3; ii+=4)
3128 			{
3129 			kernel_strcp_l_4_0_lib4(ii, pA, pB);
3130 			pA += 4*sda;
3131 			pB += 4*sdb;
3132 			}
3133 		// clean up at the end
3134 		if(ii<m)
3135 			{
3136 			if(m-ii==1)
3137 				kernel_strcp_l_1_0_lib4(ii, pA, pB);
3138 			else if(m-ii==2)
3139 				kernel_strcp_l_2_0_lib4(ii, pA, pB);
3140 			else // if(m-ii==3)
3141 				kernel_strcp_l_3_0_lib4(ii, pA, pB);
3142 			}
3143 		}
3144 	// skip one element of pA
3145 	else if(offA==(offB+1)%bs)
3146 		{
3147 		ii = 0;
3148 		// clean up at the beginning
3149 		mna = (4-offB)%bs;
3150 		if(mna>0)
3151 			{
3152 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
3153 				{
3154 				if(m==1)
3155 					{
3156 					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3157 					return;
3158 					}
3159 				else //if(m==2 && mna==3)
3160 					{
3161 					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3162 					return;
3163 					}
3164 				}
3165 			if(mna==1)
3166 				{
3167 				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3168 				//pA += 4*sda;
3169 				pB += 4*sdb;
3170 				ii += 1;
3171 				}
3172 			else if(mna==2)
3173 				{
3174 				kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+2);
3175 				pA += 4*sda;
3176 				pB += 4*sdb;
3177 				ii += 2;
3178 				}
3179 			else // if(mna==3)
3180 				{
3181 				kernel_strcp_l_3_2_lib4(ii, pA, sda, pB+1);
3182 				pA += 4*sda;
3183 				pB += 4*sdb;
3184 				ii += 3;
3185 				}
3186 			}
3187 		// main loop
3188 		for( ; ii<m-3; ii+=4)
3189 			{
3190 			kernel_strcp_l_4_1_lib4(ii, pA, sda, pB);
3191 			pA += 4*sda;
3192 			pB += 4*sdb;
3193 			}
3194 		// clean up at the end
3195 		if(ii<m)
3196 			{
3197 			if(m-ii==1)
3198 				kernel_strcp_l_1_0_lib4(ii, pA+1, pB);
3199 			else if(m-ii==2)
3200 				kernel_strcp_l_2_0_lib4(ii, pA+1, pB);
3201 			else // if(m-ii==3)
3202 				kernel_strcp_l_3_0_lib4(ii, pA+1, pB);
3203 			}
3204 		}
3205 	// skip 2 elements of pA
3206 	else if(offA==(offB+2)%bs)
3207 		{
3208 		ii = 0;
3209 		// clean up at the beginning
3210 		mna = (4-offB)%bs;
3211 		if(mna>0)
3212 			{
3213 			if(m<mna)
3214 				{
3215 				if(m==1)
3216 					{
3217 					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3218 					return;
3219 					}
3220 				else // if(m==2 && mna==3)
3221 					{
3222 					kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+1);
3223 					return;
3224 					}
3225 				}
3226 			if(mna==1)
3227 				{
3228 				kernel_strcp_l_1_0_lib4(ii, pA+1, pB+3);
3229 				// pA += 4*sda;
3230 				pB += 4*sdb;
3231 				ii += 1;
3232 				}
3233 			else if(mna==2)
3234 				{
3235 				kernel_strcp_l_2_0_lib4(ii, pA, pB+2);
3236 				// pA += 4*sda;
3237 				pB += 4*sdb;
3238 				ii += 2;
3239 				}
3240 			else // if(mna==3)
3241 				{
3242 				kernel_strcp_l_3_3_lib4(ii, pA, sda, pB+1);
3243 				pA += 4*sda;
3244 				pB += 4*sdb;
3245 				ii += 3;
3246 				}
3247 			}
3248 		// main loop
3249 		for(; ii<m-3; ii+=4)
3250 			{
3251 			kernel_strcp_l_4_2_lib4(ii, pA, sda, pB);
3252 			pA += 4*sda;
3253 			pB += 4*sdb;
3254 			}
3255 		// clean up at the end
3256 		if(ii<m)
3257 			{
3258 			if(m-ii==1)
3259 				kernel_strcp_l_1_0_lib4(ii, pA+2, pB);
3260 			else if(m-ii==2)
3261 				kernel_strcp_l_2_0_lib4(ii, pA+2, pB);
3262 			else // if(m-ii==3)
3263 				kernel_strcp_l_3_2_lib4(ii, pA, sda, pB);
3264 			}
3265 		}
3266 	// skip 3 elements of pA
3267 	else // if(offA==(offB+3)%bs)
3268 		{
3269 		ii = 0;
3270 		// clean up at the beginning
3271 		mna = (4-offB)%bs;
3272 		if(mna>0)
3273 			{
3274 			if(m<mna)
3275 				{
3276 				if(m==1)
3277 					{
3278 					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3279 					return;
3280 					}
3281 				else // if(m==2 && mna==3)
3282 					{
3283 					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3284 					return;
3285 					}
3286 				}
3287 			if(mna==1)
3288 				{
3289 				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
3290 				// pA += 4*sda;
3291 				pB += 4*sdb;
3292 				ii += 1;
3293 				}
3294 			else if(mna==2)
3295 				{
3296 				kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
3297 				// pA += 4*sda;
3298 				pB += 4*sdb;
3299 				ii += 2;
3300 				}
3301 			else // if(mna==3)
3302 				{
3303 				kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
3304 				// pA += 4*sda;
3305 				pB += 4*sdb;
3306 				ii += 3;
3307 				}
3308 			}
3309 		// main loop
3310 		for(; ii<m-3; ii+=4)
3311 			{
3312 			kernel_strcp_l_4_3_lib4(ii, pA, sda, pB);
3313 			pA += 4*sda;
3314 			pB += 4*sdb;
3315 			}
3316 		// clean up at the end
3317 		if(ii<m)
3318 			{
3319 			if(m-ii==1)
3320 				kernel_strcp_l_1_0_lib4(ii, pA+3, pB);
3321 			else if(m-ii==2)
3322 				kernel_strcp_l_2_3_lib4(ii, pA, sda, pB);
3323 			else // if(m-ii==3)
3324 				kernel_strcp_l_3_3_lib4(ii, pA, sda, pB);
3325 			}
3326 		}
3327 
3328 	return;
3329 
3330 	}
3331 
3332 
3333 
3334 // scale and add a generic strmat into a generic strmat
blasfeo_sgead(int m,int n,float alpha,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sB,int bi,int bj)3335 void blasfeo_sgead(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sB, int bi, int bj)
3336 	{
3337 
3338 	if(m<=0 || n<=0)
3339 		return;
3340 
3341 	// invalidate stored inverse diagonal
3342 	sB->use_dA = 0;
3343 
3344 	const int bs = 4;
3345 
3346 	int sda = sA->cn;
3347 	int sdb = sB->cn;
3348 	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
3349 	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
3350 	int offA = ai%bs;
3351 	int offB = bi%bs;
3352 
3353 	int ii, mna;
3354 
3355 	// same alignment
3356 	if(offA==offB)
3357 		{
3358 		ii = 0;
3359 		// clean up at the beginning
3360 		mna = (4-offB)%bs;
3361 		if(mna>0)
3362 			{
3363 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
3364 				{
3365 				if(m==1)
3366 					{
3367 					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3368 					return;
3369 					}
3370 				else //if(m==2 && mna==3)
3371 					{
3372 					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3373 					return;
3374 					}
3375 				}
3376 			if(mna==1)
3377 				{
3378 				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3379 				pA += 4*sda;
3380 				pB += 4*sdb;
3381 				ii += 1;
3382 				}
3383 			else if(mna==2)
3384 				{
3385 				kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3386 				pA += 4*sda;
3387 				pB += 4*sdb;
3388 				ii += 2;
3389 				}
3390 			else // if(mna==3)
3391 				{
3392 				kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
3393 				pA += 4*sda;
3394 				pB += 4*sdb;
3395 				ii += 3;
3396 				}
3397 			}
3398 		// main loop
3399 		for(; ii<m-3; ii+=4)
3400 			{
3401 			kernel_sgead_4_0_lib4(n, &alpha, pA, pB);
3402 			pA += 4*sda;
3403 			pB += 4*sdb;
3404 			}
3405 		// clean up at the end
3406 		if(ii<m)
3407 			{
3408 			if(m-ii==1)
3409 				kernel_sgead_1_0_lib4(n, &alpha, pA, pB);
3410 			else if(m-ii==2)
3411 				kernel_sgead_2_0_lib4(n, &alpha, pA, pB);
3412 			else // if(m-ii==3)
3413 				kernel_sgead_3_0_lib4(n, &alpha, pA, pB);
3414 			}
3415 		}
3416 	// skip one element of pA
3417 	else if(offA==(offB+1)%bs)
3418 		{
3419 		ii = 0;
3420 		// clean up at the beginning
3421 		mna = (4-offB)%bs;
3422 		if(mna>0)
3423 			{
3424 			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
3425 				{
3426 				if(m==1)
3427 					{
3428 					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3429 					return;
3430 					}
3431 				else //if(m==2 && mna==3)
3432 					{
3433 					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3434 					return;
3435 					}
3436 				}
3437 			if(mna==1)
3438 				{
3439 				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3440 				//pA += 4*sda;
3441 				pB += 4*sdb;
3442 				ii += 1;
3443 				}
3444 			else if(mna==2)
3445 				{
3446 				kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+2);
3447 				pA += 4*sda;
3448 				pB += 4*sdb;
3449 				ii += 2;
3450 				}
3451 			else // if(mna==3)
3452 				{
3453 				kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB+1);
3454 				pA += 4*sda;
3455 				pB += 4*sdb;
3456 				ii += 3;
3457 				}
3458 			}
3459 		// main loop
3460 		for( ; ii<m-3; ii+=4)
3461 			{
3462 			kernel_sgead_4_1_lib4(n, &alpha, pA, sda, pB);
3463 			pA += 4*sda;
3464 			pB += 4*sdb;
3465 			}
3466 		// clean up at the end
3467 		if(ii<m)
3468 			{
3469 			if(m-ii==1)
3470 				kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB);
3471 			else if(m-ii==2)
3472 				kernel_sgead_2_0_lib4(n, &alpha, pA+1, pB);
3473 			else // if(m-ii==3)
3474 				kernel_sgead_3_0_lib4(n, &alpha, pA+1, pB);
3475 			}
3476 		}
3477 	// skip 2 elements of pA
3478 	else if(offA==(offB+2)%bs)
3479 		{
3480 		ii = 0;
3481 		// clean up at the beginning
3482 		mna = (4-offB)%bs;
3483 		if(mna>0)
3484 			{
3485 			if(m<mna)
3486 				{
3487 				if(m==1)
3488 					{
3489 					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3490 					return;
3491 					}
3492 				else // if(m==2 && mna==3)
3493 					{
3494 					kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+1);
3495 					return;
3496 					}
3497 				}
3498 			if(mna==1)
3499 				{
3500 				kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB+3);
3501 				// pA += 4*sda;
3502 				pB += 4*sdb;
3503 				ii += 1;
3504 				}
3505 			else if(mna==2)
3506 				{
3507 				kernel_sgead_2_0_lib4(n, &alpha, pA, pB+2);
3508 				// pA += 4*sda;
3509 				pB += 4*sdb;
3510 				ii += 2;
3511 				}
3512 			else // if(mna==3)
3513 				{
3514 				kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB+1);
3515 				pA += 4*sda;
3516 				pB += 4*sdb;
3517 				ii += 3;
3518 				}
3519 			}
3520 		// main loop
3521 		for(; ii<m-3; ii+=4)
3522 			{
3523 			kernel_sgead_4_2_lib4(n, &alpha, pA, sda, pB);
3524 			pA += 4*sda;
3525 			pB += 4*sdb;
3526 			}
3527 		// clean up at the end
3528 		if(ii<m)
3529 			{
3530 			if(m-ii==1)
3531 				kernel_sgead_1_0_lib4(n, &alpha, pA+2, pB);
3532 			else if(m-ii==2)
3533 				kernel_sgead_2_0_lib4(n, &alpha, pA+2, pB);
3534 			else // if(m-ii==3)
3535 				kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB);
3536 			}
3537 		}
3538 	// skip 3 elements of pA
3539 	else // if(offA==(offB+3)%bs)
3540 		{
3541 		ii = 0;
3542 		// clean up at the beginning
3543 		mna = (4-offB)%bs;
3544 		if(mna>0)
3545 			{
3546 			if(m<mna)
3547 				{
3548 				if(m==1)
3549 					{
3550 					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3551 					return;
3552 					}
3553 				else // if(m==2 && mna==3)
3554 					{
3555 					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3556 					return;
3557 					}
3558 				}
3559 			if(mna==1)
3560 				{
3561 				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
3562 				// pA += 4*sda;
3563 				pB += 4*sdb;
3564 				ii += 1;
3565 				}
3566 			else if(mna==2)
3567 				{
3568 				kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
3569 				// pA += 4*sda;
3570 				pB += 4*sdb;
3571 				ii += 2;
3572 				}
3573 			else // if(mna==3)
3574 				{
3575 				kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
3576 				// pA += 4*sda;
3577 				pB += 4*sdb;
3578 				ii += 3;
3579 				}
3580 			}
3581 		// main loop
3582 		for(; ii<m-3; ii+=4)
3583 			{
3584 			kernel_sgead_4_3_lib4(n, &alpha, pA, sda, pB);
3585 			pA += 4*sda;
3586 			pB += 4*sdb;
3587 			}
3588 		// clean up at the end
3589 		if(ii<m)
3590 			{
3591 			if(m-ii==1)
3592 				kernel_sgead_1_0_lib4(n, &alpha, pA+3, pB);
3593 			else if(m-ii==2)
3594 				kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB);
3595 			else // if(m-ii==3)
3596 				kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB);
3597 			}
3598 		}
3599 
3600 	return;
3601 
3602 	}
3603 
3604 
3605 
3606 // copy and transpose a generic strmat into a generic strmat
blasfeo_sgetr(int m,int n,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)3607 void blasfeo_sgetr(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
3608 	{
3609 	if(m<=0 || n<=0)
3610 		return;
3611 
3612 	// invalidate stored inverse diagonal
3613 	sC->use_dA = 0;
3614 
3615 	const int bs = 4;
3616 	int sda = sA->cn;
3617 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3618 	int sdc = sC->cn;
3619 	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
3620 	sgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
3621 	return;
3622 	}
3623 
3624 
3625 
3626 // copy and transpose a lower triangular strmat into an upper triangular strmat
blasfeo_strtr_l(int m,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)3627 void blasfeo_strtr_l(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
3628 	{
3629 	if(m<=0)
3630 		return;
3631 
3632 	// invalidate stored inverse diagonal
3633 	sC->use_dA = 0;
3634 
3635 	const int bs = 4;
3636 	int sda = sA->cn;
3637 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3638 	int sdc = sC->cn;
3639 	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
3640 	strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
3641 	return;
3642 	}
3643 
3644 
3645 
3646 // copy and transpose an upper triangular strmat into a lower triangular strmat
blasfeo_strtr_u(int m,struct blasfeo_smat * sA,int ai,int aj,struct blasfeo_smat * sC,int ci,int cj)3647 void blasfeo_strtr_u(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_smat *sC, int ci, int cj)
3648 	{
3649 	if(m<=0)
3650 		return;
3651 
3652 	// invalidate stored inverse diagonal
3653 	sC->use_dA = 0;
3654 
3655 	const int bs = 4;
3656 	int sda = sA->cn;
3657 	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
3658 	int sdc = sC->cn;
3659 	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
3660 	strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
3661 	return;
3662 	}
3663 
3664 
3665 
3666 // insert a strvec to diagonal of strmat, sparse formulation
blasfeo_sdiain_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_smat * sD,int di,int dj)3667 void blasfeo_sdiain_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3668 	{
3669 	if(kmax<=0)
3670 		return;
3671 
3672 	// invalidate stored inverse diagonal
3673 	sD->use_dA = 0;
3674 
3675 	const int bs = 4;
3676 	float *x = sx->pa + xi;
3677 	int sdd = sD->cn;
3678 	float *pD = sD->pA;
3679 	int ii, jj;
3680 	for(jj=0; jj<kmax; jj++)
3681 		{
3682 		ii = idx[jj];
3683 		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
3684 		}
3685 	return;
3686 	}
3687 
3688 
3689 
3690 // extract the diagonal of a strmat to a strvec, sparse formulation
blasfeo_sdiaex_sp(int kmax,float alpha,int * idx,struct blasfeo_smat * sD,int di,int dj,struct blasfeo_svec * sx,int xi)3691 void blasfeo_sdiaex_sp(int kmax, float alpha, int *idx, struct blasfeo_smat *sD, int di, int dj, struct blasfeo_svec *sx, int xi)
3692 	{
3693 	const int bs = 4;
3694 	float *x = sx->pa + xi;
3695 	int sdd = sD->cn;
3696 	float *pD = sD->pA;
3697 	int ii, jj;
3698 	for(jj=0; jj<kmax; jj++)
3699 		{
3700 		ii = idx[jj];
3701 		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
3702 		}
3703 	return;
3704 	}
3705 
3706 
3707 
3708 // add scaled strvec to diagonal of strmat, sparse formulation
blasfeo_sdiaad_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_smat * sD,int di,int dj)3709 void blasfeo_sdiaad_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3710 	{
3711 	if(kmax<=0)
3712 		return;
3713 
3714 	// invalidate stored inverse diagonal
3715 	sD->use_dA = 0;
3716 
3717 	const int bs = 4;
3718 	float *x = sx->pa + xi;
3719 	int sdd = sD->cn;
3720 	float *pD = sD->pA;
3721 	int ii, jj;
3722 	for(jj=0; jj<kmax; jj++)
3723 		{
3724 		ii = idx[jj];
3725 		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
3726 		}
3727 	return;
3728 	}
3729 
3730 
3731 
3732 // add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
blasfeo_sdiaadin_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,struct blasfeo_svec * sy,int yi,int * idx,struct blasfeo_smat * sD,int di,int dj)3733 void blasfeo_sdiaadin_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sy, int yi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3734 	{
3735 	if(kmax<=0)
3736 		return;
3737 
3738 	// invalidate stored inverse diagonal
3739 	sD->use_dA = 0;
3740 
3741 	const int bs = 4;
3742 	float *x = sx->pa + xi;
3743 	float *y = sy->pa + yi;
3744 	int sdd = sD->cn;
3745 	float *pD = sD->pA;
3746 	int ii, jj;
3747 	for(jj=0; jj<kmax; jj++)
3748 		{
3749 		ii = idx[jj];
3750 		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
3751 		}
3752 	return;
3753 	}
3754 
3755 
3756 
3757 // add scaled strvec to row of strmat, sparse formulation
blasfeo_srowad_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_smat * sD,int di,int dj)3758 void blasfeo_srowad_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_smat *sD, int di, int dj)
3759 	{
3760 	if(kmax<=0)
3761 		return;
3762 
3763 	// invalidate stored inverse diagonal
3764 	sD->use_dA = 0;
3765 
3766 	const int bs = 4;
3767 	float *x = sx->pa + xi;
3768 	int sdd = sD->cn;
3769 	float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
3770 	srowad_libsp(kmax, idx, alpha, x, pD);
3771 	return;
3772 	}
3773 
3774 
3775 
3776 // adds strvec to strvec, sparse formulation
blasfeo_svecad_sp(int kmax,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_svec * sy,int yi)3777 void blasfeo_svecad_sp(int kmax, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_svec *sy, int yi)
3778 	{
3779 	float *x = sx->pa + xi;
3780 	float *y = sy->pa + yi;
3781 	svecad_libsp(kmax, idx, alpha, x, y);
3782 	return;
3783 	}
3784 
3785 
3786 
blasfeo_svecin_sp(int m,float alpha,struct blasfeo_svec * sx,int xi,int * idx,struct blasfeo_svec * sz,int zi)3787 void blasfeo_svecin_sp(int m, float alpha, struct blasfeo_svec *sx, int xi, int *idx, struct blasfeo_svec *sz, int zi)
3788 	{
3789 	float *x = sx->pa + xi;
3790 	float *z = sz->pa + zi;
3791 	int ii;
3792 	for(ii=0; ii<m; ii++)
3793 		z[idx[ii]] = alpha * x[ii];
3794 	return;
3795 	}
3796 
3797 
3798 
blasfeo_svecex_sp(int m,float alpha,int * idx,struct blasfeo_svec * sx,int xi,struct blasfeo_svec * sz,int zi)3799 void blasfeo_svecex_sp(int m, float alpha, int *idx, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi)
3800 	{
3801 	float *x = sx->pa + xi;
3802 	float *z = sz->pa + zi;
3803 	int ii;
3804 	for(ii=0; ii<m; ii++)
3805 		z[ii] = alpha * x[idx[ii]];
3806 	return;
3807 	}
3808 
3809 
3810 
blasfeo_svecnrm_inf(int m,struct blasfeo_svec * sx,int xi,float * ptr_norm)3811 void blasfeo_svecnrm_inf(int m, struct blasfeo_svec *sx, int xi, float *ptr_norm)
3812 	{
3813 	int ii;
3814 	float *x = sx->pa + xi;
3815 	float norm = 0.0;
3816 	float tmp;
3817 	for(ii=0; ii<m; ii++)
3818 		{
3819 #ifdef USE_C99_MATH
3820 		norm = fmaxf(norm, fabsf(x[ii]));
3821 #else
3822 		tmp = fabs(x[ii]);
3823 		norm = tmp>norm ? tmp : norm;
3824 #endif
3825 		}
3826 	*ptr_norm = norm;
3827 	return;
3828 	}
3829 
3830 
3831 // permute elements of a vector struct
blasfeo_svecpe(int kmax,int * ipiv,struct blasfeo_svec * sx,int xi)3832 void blasfeo_svecpe(int kmax, int *ipiv, struct blasfeo_svec *sx, int xi)
3833 	{
3834 	int ii;
3835 	float tmp;
3836 	float *x = sx->pa + xi;
3837 	for(ii=0; ii<kmax; ii++)
3838 		{
3839 		if(ipiv[ii]!=ii)
3840 			{
3841 			tmp = x[ipiv[ii]];
3842 			x[ipiv[ii]] = x[ii];
3843 			x[ii] = tmp;
3844 			}
3845 		}
3846 	return;
3847 	}
3848 
3849 
3850 
3851 // inverse permute elements of a vector struct
blasfeo_svecpei(int kmax,int * ipiv,struct blasfeo_svec * sx,int xi)3852 void blasfeo_svecpei(int kmax, int *ipiv, struct blasfeo_svec *sx, int xi)
3853 	{
3854 	int ii;
3855 	float tmp;
3856 	float *x = sx->pa + xi;
3857 	for(ii=kmax-1; ii>=0; ii--)
3858 		{
3859 		if(ipiv[ii]!=ii)
3860 			{
3861 			tmp = x[ipiv[ii]];
3862 			x[ipiv[ii]] = x[ii];
3863 			x[ii] = tmp;
3864 			}
3865 		}
3866 	return;
3867 	}
3868 
3869 
3870 
3871 #else
3872 
3873 #error : wrong LA choice
3874 
3875 #endif
3876 
3877 
3878 
3879