1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS for embedded optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 #include <stdlib.h>
37 #include <stdio.h>
38 
39 #include "../include/blasfeo_common.h"
40 #include "../include/blasfeo_d_kernel.h"
41 #include "../include/blasfeo_d_blas.h"
42 #include "../include/blasfeo_d_aux.h"
43 
44 
45 
dtrsv_ln_inv_lib(int m,int n,double * pA,int sda,double * inv_diag_A,double * x,double * y)46 void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
47 	{
48 
49 	if(m<=0 || n<=0)
50 		return;
51 
52 	// suppose m>=n
53 	if(m<n)
54 		m = n;
55 
56 	const int bs = 4;
57 
58 	double alpha = -1.0;
59 	double beta = 1.0;
60 
61 	int i;
62 
63 	if(x!=y)
64 		{
65 		for(i=0; i<m; i++)
66 			y[i] = x[i];
67 		}
68 
69 	i = 0;
70 	for( ; i<n-3; i+=4)
71 		{
72 		kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
73 		}
74 	if(i<n)
75 		{
76 		kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
77 		i+=4;
78 		}
79 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
80 	for( ; i<m-7; i+=8)
81 		{
82 		kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
83 		}
84 	if(i<m-3)
85 		{
86 		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
87 		i+=4;
88 		}
89 #else
90 	for( ; i<m-3; i+=4)
91 		{
92 		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
93 		}
94 #endif
95 	if(i<m)
96 		{
97 		kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], m-i);
98 		i+=4;
99 		}
100 
101 	}
102 
103 
104 
dtrsv_lt_inv_lib(int m,int n,double * pA,int sda,double * inv_diag_A,double * x,double * y)105 void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
106 	{
107 
108 	if(m<=0 || n<=0)
109 		return;
110 
111 	if(n>m)
112 		n = m;
113 
114 	const int bs = 4;
115 
116 	int i;
117 
118 	if(x!=y)
119 		for(i=0; i<m; i++)
120 			y[i] = x[i];
121 
122 	i=0;
123 	if(n%4==1)
124 		{
125 		kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
126 		i++;
127 		}
128 	else if(n%4==2)
129 		{
130 		kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
131 		i+=2;
132 		}
133 	else if(n%4==3)
134 		{
135 		kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
136 		i+=3;
137 		}
138 	for(; i<n-3; i+=4)
139 		{
140 		kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
141 		}
142 
143 	}
144 
145 
146 
dgemv_nt_lib(int m,int n,double alpha_n,double alpha_t,double * pA,int sda,double * x_n,double * x_t,double beta_n,double beta_t,double * y_n,double * y_t,double * z_n,double * z_t)147 void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
148 	{
149 
150 	if(m<=0 | n<=0)
151 		return;
152 
153 	const int bs = 4;
154 
155 	int ii;
156 
157 	// copy and scale y_n int z_n
158 	ii = 0;
159 	for(; ii<m-3; ii+=4)
160 		{
161 		z_n[ii+0] = beta_n*y_n[ii+0];
162 		z_n[ii+1] = beta_n*y_n[ii+1];
163 		z_n[ii+2] = beta_n*y_n[ii+2];
164 		z_n[ii+3] = beta_n*y_n[ii+3];
165 		}
166 	for(; ii<m; ii++)
167 		{
168 		z_n[ii+0] = beta_n*y_n[ii+0];
169 		}
170 
171 	ii = 0;
172 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
173 	for(; ii<n-5; ii+=6)
174 		{
175 		kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
176 		}
177 #endif
178 	for(; ii<n-3; ii+=4)
179 		{
180 		kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
181 		}
182 	if(ii<n)
183 		{
184 		kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
185 		}
186 
187 	return;
188 
189 	}
190 
191 
192 
193 #if defined(LA_HIGH_PERFORMANCE)
194 
195 
196 
blasfeo_dgemv_n(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,double beta,struct blasfeo_dvec * sy,int yi,struct blasfeo_dvec * sz,int zi)197 void blasfeo_dgemv_n(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi)
198 	{
199 
200 	if(m<0)
201 		return;
202 
203 	const int bs = 4;
204 
205 	int i;
206 
207 	int sda = sA->cn;
208 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
209 	double *x = sx->pa + xi;
210 	double *y = sy->pa + yi;
211 	double *z = sz->pa + zi;
212 
213 	i = 0;
214 	// clean up at the beginning
215 	if(ai%bs!=0)
216 		{
217 		kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
218 		pA += bs*sda;
219 		y += 4 - ai%bs;
220 		z += 4 - ai%bs;
221 		m -= 4 - ai%bs;
222 		}
223 	// main loop
224 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
225 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
226 	for( ; i<m-11; i+=12)
227 		{
228 		kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
229 		}
230 #endif
231 	for( ; i<m-7; i+=8)
232 		{
233 		kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
234 		}
235 	if(i<m-3)
236 		{
237 		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
238 		i+=4;
239 		}
240 #else
241 	for( ; i<m-3; i+=4)
242 		{
243 		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
244 		}
245 #endif
246 	if(i<m)
247 		{
248 		kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
249 		}
250 
251 	return;
252 
253 	}
254 
255 
256 
blasfeo_dgemv_t(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,double beta,struct blasfeo_dvec * sy,int yi,struct blasfeo_dvec * sz,int zi)257 void blasfeo_dgemv_t(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi)
258 	{
259 
260 	if(n<=0)
261 		return;
262 
263 	const int bs = 4;
264 
265 	int i;
266 
267 	int sda = sA->cn;
268 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
269 	int offsetA = ai%bs;
270 	double *x = sx->pa + xi;
271 	double *y = sy->pa + yi;
272 	double *z = sz->pa + zi;
273 
274 	i = 0;
275 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
276 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
277 	for( ; i<n-11; i+=12)
278 		{
279 		kernel_dgemv_t_12_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
280 		}
281 #endif
282 	for( ; i<n-7; i+=8)
283 		{
284 		kernel_dgemv_t_8_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
285 		}
286 	if(i<n-3)
287 		{
288 		kernel_dgemv_t_4_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
289 		i+=4;
290 		}
291 #else
292 	for( ; i<n-3; i+=4)
293 		{
294 		kernel_dgemv_t_4_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
295 		}
296 #endif
297 	if(i<n)
298 		{
299 		kernel_dgemv_t_4_vs_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
300 		}
301 
302 	return;
303 
304 	}
305 
306 
307 
blasfeo_dgemv_nt(int m,int n,double alpha_n,double alpha_t,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx_n,int xi_n,struct blasfeo_dvec * sx_t,int xi_t,double beta_n,double beta_t,struct blasfeo_dvec * sy_n,int yi_n,struct blasfeo_dvec * sy_t,int yi_t,struct blasfeo_dvec * sz_n,int zi_n,struct blasfeo_dvec * sz_t,int zi_t)308 void blasfeo_dgemv_nt(int m, int n, double alpha_n, double alpha_t, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx_n, int xi_n, struct blasfeo_dvec *sx_t, int xi_t, double beta_n, double beta_t, struct blasfeo_dvec *sy_n, int yi_n, struct blasfeo_dvec *sy_t, int yi_t, struct blasfeo_dvec *sz_n, int zi_n, struct blasfeo_dvec *sz_t, int zi_t)
309 	{
310 	if(ai!=0)
311 		{
312 		printf("\nblasfeo_dgemv_nt: feature not implemented yet: ai=%d\n", ai);
313 		exit(1);
314 		}
315 	const int bs = 4;
316 #if defined(TARGET_X86_AMD_BARCELONA) | defined(TARGET_X86_AMD_JAGUAR)
317 	blasfeo_dgemv_n(m, n, alpha_n, sA, ai, aj, sx_n, xi_n, beta_n, sy_n, yi_n, sz_n, zi_n);
318 	blasfeo_dgemv_t(m, n, alpha_t, sA, ai, aj, sx_t, xi_t, beta_t, sy_t, yi_t, sz_t, zi_t);
319 	return;
320 #endif
321 	int sda = sA->cn;
322 	double *pA = sA->pA + aj*bs; // TODO ai
323 	double *x_n = sx_n->pa + xi_n;
324 	double *x_t = sx_t->pa + xi_t;
325 	double *y_n = sy_n->pa + yi_n;
326 	double *y_t = sy_t->pa + yi_t;
327 	double *z_n = sz_n->pa + zi_n;
328 	double *z_t = sz_t->pa + zi_t;
329 	dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
330 	return;
331 	}
332 
333 
334 
335 // m >= n
blasfeo_dsymv_l(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,double beta,struct blasfeo_dvec * sy,int yi,struct blasfeo_dvec * sz,int zi)336 void blasfeo_dsymv_l(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi)
337 	{
338 
339 	if(m<n)
340 		n = m;
341 
342 	if(m<=0 | n<=0)
343 		return;
344 
345 	const int bs = 4;
346 
347 	int ii, n1;
348 
349 	int sda = sA->cn;
350 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
351 	double *x = sx->pa + xi;
352 	double *y = sy->pa + yi;
353 	double *z = sz->pa + zi;
354 
355 	// copy and scale y int z
356 	ii = 0;
357 	for(; ii<m-3; ii+=4)
358 		{
359 		z[ii+0] = beta*y[ii+0];
360 		z[ii+1] = beta*y[ii+1];
361 		z[ii+2] = beta*y[ii+2];
362 		z[ii+3] = beta*y[ii+3];
363 		}
364 	for(; ii<m; ii++)
365 		{
366 		z[ii+0] = beta*y[ii+0];
367 		}
368 
369 	// clean up at the beginning
370 	if(ai%bs!=0) // 1, 2, 3
371 		{
372 		n1 = 4-ai%bs;
373 		kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
374 		pA += n1 + n1*bs + (sda-1)*bs;
375 		x += n1;
376 		z += n1;
377 		m -= n1;
378 		n -= n1;
379 		}
380 
381 #if defined(TARGET_X86_AMD_BARCELONA) | defined(TARGET_X86_AMD_JAGUAR)
382 	// using dgemv_n and dgemv_t kernels
383 	double beta1 = 1.0;
384 	double xx[4];
385 	for(ii=0; ii<n-3; ii+=4)
386 		{
387 		// gemv_n
388 		kernel_dgemv_n_4_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii);
389 		// 4x4
390 		z[ii+0] += alpha*(pA[ii*sda+0+bs*(ii+0)]*x[ii+0] + pA[ii*sda+1+bs*(ii+0)]*x[ii+1] + pA[ii*sda+2+bs*(ii+0)]*x[ii+2] + pA[ii*sda+3+bs*(ii+0)]*x[ii+3]);
391 		z[ii+1] += alpha*(pA[ii*sda+1+bs*(ii+0)]*x[ii+0] + pA[ii*sda+1+bs*(ii+1)]*x[ii+1] + pA[ii*sda+2+bs*(ii+1)]*x[ii+2] + pA[ii*sda+3+bs*(ii+1)]*x[ii+3]);
392 		z[ii+2] += alpha*(pA[ii*sda+2+bs*(ii+0)]*x[ii+0] + pA[ii*sda+2+bs*(ii+1)]*x[ii+1] + pA[ii*sda+2+bs*(ii+2)]*x[ii+2] + pA[ii*sda+3+bs*(ii+2)]*x[ii+3]);
393 		z[ii+3] += alpha*(pA[ii*sda+3+bs*(ii+0)]*x[ii+0] + pA[ii*sda+3+bs*(ii+1)]*x[ii+1] + pA[ii*sda+3+bs*(ii+2)]*x[ii+2] + pA[ii*sda+3+bs*(ii+3)]*x[ii+3]);
394 		// gemv_t
395 		kernel_dgemv_t_4_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii);
396 		}
397 	if(ii<n)
398 		{
399 		if(n-ii==1)
400 			{
401 			xx[0] = alpha*x[ii+0];
402 			xx[1] = alpha*x[ii+1];
403 			xx[2] = alpha*x[ii+2];
404 			xx[3] = alpha*x[ii+3];
405 			// gemv_n
406 			kernel_dgemv_n_4_vs_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
407 			// 4x4
408 			z[ii+0] += pA[ii*sda+0+bs*(ii+0)]*xx[0];
409 			if(m-ii>1)
410 				{
411 				z[ii+0] += pA[ii*sda+1+bs*(ii+0)]*xx[1];
412 				z[ii+1] += pA[ii*sda+1+bs*(ii+0)]*xx[0];
413 				}
414 			if(m-ii>2)
415 				{
416 				z[ii+0] += pA[ii*sda+2+bs*(ii+0)]*xx[2];
417 				z[ii+2] += pA[ii*sda+2+bs*(ii+0)]*xx[0];
418 				}
419 			if(m-ii>3)
420 				{
421 				z[ii+0] += pA[ii*sda+3+bs*(ii+0)]*xx[3];
422 				z[ii+3] += pA[ii*sda+3+bs*(ii+0)]*xx[0];
423 				}
424 			// gemv_t
425 			kernel_dgemv_t_4_vs_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii, n-ii);
426 			ii += 4;
427 			}
428 		else if(n-ii==2)
429 			{
430 			xx[0] = alpha*x[ii+0];
431 			xx[1] = alpha*x[ii+1];
432 			xx[2] = alpha*x[ii+2];
433 			xx[3] = alpha*x[ii+3];
434 			// gemv_n
435 			kernel_dgemv_n_4_vs_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
436 			// 4x4
437 			z[ii+0] += pA[ii*sda+0+bs*(ii+0)]*xx[0];
438 			if(m-ii>1)
439 				{
440 				z[ii+0] += pA[ii*sda+1+bs*(ii+0)]*xx[1];
441 				z[ii+1] += pA[ii*sda+1+bs*(ii+0)]*xx[0] + pA[ii*sda+1+bs*(ii+1)]*xx[1];
442 				}
443 			if(m-ii>2)
444 				{
445 				z[ii+0] += pA[ii*sda+2+bs*(ii+0)]*xx[2];
446 				z[ii+1] += pA[ii*sda+2+bs*(ii+1)]*xx[2];
447 				z[ii+2] += pA[ii*sda+2+bs*(ii+0)]*xx[0] + pA[ii*sda+2+bs*(ii+1)]*xx[1];
448 				}
449 			if(m-ii>3)
450 				{
451 				z[ii+0] += pA[ii*sda+3+bs*(ii+0)]*xx[3];
452 				z[ii+1] += pA[ii*sda+3+bs*(ii+1)]*xx[3];
453 				z[ii+3] += pA[ii*sda+3+bs*(ii+0)]*xx[0] + pA[ii*sda+3+bs*(ii+1)]*xx[1];
454 				}
455 			// gemv_t
456 			kernel_dgemv_t_4_vs_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii, n-ii);
457 			ii += 4;
458 			}
459 		else // if(n-ii==3)
460 			{
461 			xx[0] = alpha*x[ii+0];
462 			xx[1] = alpha*x[ii+1];
463 			xx[2] = alpha*x[ii+2];
464 			xx[3] = alpha*x[ii+3];
465 			// gemv_n
466 			kernel_dgemv_n_4_vs_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
467 			// 4x4
468 			z[ii+0] += pA[ii*sda+0+bs*(ii+0)]*xx[0];
469 			if(m-ii>1)
470 				{
471 				z[ii+0] += pA[ii*sda+1+bs*(ii+0)]*xx[1];
472 				z[ii+1] += pA[ii*sda+1+bs*(ii+0)]*xx[0] + pA[ii*sda+1+bs*(ii+1)]*xx[1];
473 				}
474 			if(m-ii>2)
475 				{
476 				z[ii+0] += pA[ii*sda+2+bs*(ii+0)]*xx[2];
477 				z[ii+1] += pA[ii*sda+2+bs*(ii+1)]*xx[2];
478 				z[ii+2] += pA[ii*sda+2+bs*(ii+0)]*xx[0] + pA[ii*sda+2+bs*(ii+1)]*xx[1] + pA[ii*sda+2+bs*(ii+2)]*xx[2];
479 				}
480 			if(m-ii>3)
481 				{
482 				z[ii+0] += pA[ii*sda+3+bs*(ii+0)]*xx[3];
483 				z[ii+1] += pA[ii*sda+3+bs*(ii+1)]*xx[3];
484 				z[ii+2] += pA[ii*sda+3+bs*(ii+2)]*xx[3];
485 				z[ii+3] += pA[ii*sda+3+bs*(ii+0)]*xx[0] + pA[ii*sda+3+bs*(ii+1)]*xx[1] + pA[ii*sda+3+bs*(ii+2)]*xx[2];
486 				}
487 			// gemv_t
488 			kernel_dgemv_t_4_vs_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii, n-ii);
489 			ii += 4;
490 			}
491 		for( ; ii<m-3; ii+=4)
492 			{
493 			// gemv_n
494 			kernel_dgemv_n_4_lib4(n, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii);
495 			}
496 		if(ii<m)
497 			{
498 			// gemv_n
499 			kernel_dgemv_n_4_vs_lib4(n, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
500 			}
501 		}
502 	return;
503 #endif
504 
505 	// main loop
506 	ii = 0;
507 	for(; ii<n-3; ii+=4)
508 		{
509 		kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
510 		}
511 	// clean up at the end
512 	if(ii<n)
513 		{
514 		kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
515 		}
516 
517 	return;
518 	}
519 
520 
521 
522 // m >= n
blasfeo_dtrmv_lnn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)523 void blasfeo_dtrmv_lnn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
524 	{
525 
526 	if(m<=0)
527 		return;
528 
529 	const int bs = 4;
530 
531 	int sda = sA->cn;
532 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
533 	double *x = sx->pa + xi;
534 	double *z = sz->pa + zi;
535 
536 	if(m-n>0)
537 		blasfeo_dgemv_n(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
538 
539 	double *pA2 = pA;
540 	double *z2 = z;
541 	int m2 = n;
542 	int n2 = 0;
543 	double *pA3, *x3;
544 
545 	double alpha = 1.0;
546 	double beta = 1.0;
547 
548 	double zt[4] = {0, 0, 0, 0};
549 	double xt[4] = {0, 0, 0, 0};
550 
551 	int ii, jj, jj_end;
552 
553 	ii = 0;
554 
555 	if(ai%4!=0)
556 		{
557 		pA2 += sda*bs - ai%bs;
558 		z2 += bs-ai%bs;
559 		m2 -= bs-ai%bs;
560 		n2 += bs-ai%bs;
561 		}
562 
563 	pA2 += m2/bs*bs*sda;
564 	z2 += m2/bs*bs;
565 	n2 += m2/bs*bs;
566 
567 	if(m2%bs != 0)
568 		{
569 		//
570 		pA3 = pA2 + bs*n2;
571 		x3 = x + n2;
572 
573 		// access only valid memory
574 		for(jj=0; jj<m2%bs; jj++)
575 			xt[jj] = x3[jj];
576 
577 		zt[2] = pA3[2+bs*0]*xt[0] + pA3[2+bs*1]*xt[1] + pA3[2+bs*2]*xt[2];
578 		zt[1] = pA3[1+bs*0]*xt[0] + pA3[1+bs*1]*xt[1];
579 		zt[0] = pA3[0+bs*0]*xt[0];
580 		kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
581 		for(jj=0; jj<m2%bs; jj++)
582 			z2[jj] = zt[jj];
583 		}
584 	for(; ii<m2-3; ii+=4)
585 		{
586 		pA2 -= bs*sda;
587 		z2 -= 4;
588 		n2 -= 4;
589 		pA3 = pA2 + bs*n2;
590 		x3 = x + n2;
591 		z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
592 		z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
593 		z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
594 		z2[0] = pA3[0+bs*0]*x3[0];
595 		kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
596 		}
597 	if(ai%4!=0)
598 		{
599 		if(ai%bs==1)
600 			{
601 			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
602 			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
603 			zt[0] = pA[0+bs*0]*x[0];
604 			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
605 			for(jj=0; jj<jj_end; jj++)
606 				z[jj] = zt[jj];
607 			}
608 		else if(ai%bs==2)
609 			{
610 			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
611 			zt[0] = pA[0+bs*0]*x[0];
612 			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
613 			for(jj=0; jj<jj_end; jj++)
614 				z[jj] = zt[jj];
615 			}
616 		else // if (ai%bs==3)
617 			{
618 			z[0] = pA[0+bs*0]*x[0];
619 			}
620 		}
621 
622 	return;
623 
624 	}
625 
626 
627 
628 // m >= n
blasfeo_dtrmv_lnu(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)629 void blasfeo_dtrmv_lnu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
630 	{
631 
632 	if(m<=0)
633 		return;
634 
635 	const int bs = 4;
636 
637 	int sda = sA->cn;
638 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
639 	double *x = sx->pa + xi;
640 	double *z = sz->pa + zi;
641 
642 	if(m-n>0)
643 		blasfeo_dgemv_n(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
644 
645 	double *pA2 = pA;
646 	double *z2 = z;
647 	int m2 = n;
648 	int n2 = 0;
649 	double *pA3, *x3;
650 
651 	double alpha = 1.0;
652 	double beta = 1.0;
653 
654 	double zt[4];
655 
656 	int ii, jj, jj_end;
657 
658 	ii = 0;
659 
660 	if(ai%4!=0)
661 		{
662 		pA2 += sda*bs - ai%bs;
663 		z2 += bs-ai%bs;
664 		m2 -= bs-ai%bs;
665 		n2 += bs-ai%bs;
666 		}
667 
668 	pA2 += m2/bs*bs*sda;
669 	z2 += m2/bs*bs;
670 	n2 += m2/bs*bs;
671 
672 	if(m2%bs!=0)
673 		{
674 		//
675 		pA3 = pA2 + bs*n2;
676 		x3 = x + n2;
677 		zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + 1.0*x3[3];
678 		zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + 1.0*x3[2];
679 		zt[1] = pA3[1+bs*0]*x3[0] + 1.0*x3[1];
680 		zt[0] = 1.0*x3[0];
681 		kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
682 		for(jj=0; jj<m2%bs; jj++)
683 			z2[jj] = zt[jj];
684 		}
685 	for(; ii<m2-3; ii+=4)
686 		{
687 		pA2 -= bs*sda;
688 		z2 -= 4;
689 		n2 -= 4;
690 		pA3 = pA2 + bs*n2;
691 		x3 = x + n2;
692 		z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + 1.0*x3[3];
693 		z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + 1.0*x3[2];
694 		z2[1] = pA3[1+bs*0]*x3[0] + 1.0*x3[1];
695 		z2[0] = 1.0*x3[0];
696 		kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
697 		}
698 	if(ai%4!=0)
699 		{
700 		if(ai%bs==1)
701 			{
702 			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + 1.0*x[2];
703 			zt[1] = pA[1+bs*0]*x[0] + 1.0*x[1];
704 			zt[0] = 1.0*x[0];
705 			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
706 			for(jj=0; jj<jj_end; jj++)
707 				z[jj] = zt[jj];
708 			}
709 		else if(ai%bs==2)
710 			{
711 			zt[1] = pA[1+bs*0]*x[0] + 1.0*x[1];
712 			zt[0] = 1.0*x[0];
713 			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
714 			for(jj=0; jj<jj_end; jj++)
715 				z[jj] = zt[jj];
716 			}
717 		else // if (ai%bs==3)
718 			{
719 			z[0] = 1.0*x[0];
720 			}
721 		}
722 
723 	return;
724 
725 	}
726 
727 
728 
729 // m >= n
blasfeo_dtrmv_ltn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)730 void blasfeo_dtrmv_ltn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
731 	{
732 
733 	if(m<=0)
734 		return;
735 
736 	const int bs = 4;
737 
738 	int sda = sA->cn;
739 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
740 	double *x = sx->pa + xi;
741 	double *z = sz->pa + zi;
742 
743 	double xt[4];
744 	double zt[4];
745 
746 	double alpha = 1.0;
747 	double beta = 1.0;
748 
749 	int ii, jj, ll, ll_max;
750 
751 	jj = 0;
752 
753 	if(ai%bs!=0)
754 		{
755 
756 		if(ai%bs==1)
757 			{
758 			ll_max = m-jj<3 ? m-jj : 3;
759 			for(ll=0; ll<ll_max; ll++)
760 				xt[ll] = x[ll];
761 			for(; ll<3; ll++)
762 				xt[ll] = 0.0;
763 			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
764 			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
765 			zt[2] = pA[2+bs*2]*xt[2];
766 			pA += bs*sda - 1;
767 			x += 3;
768 			kernel_dgemv_t_4_lib4(m-3-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
769 			ll_max = n-jj<3 ? n-jj : 3;
770 			for(ll=0; ll<ll_max; ll++)
771 				z[ll] = zt[ll];
772 			pA += bs*3;
773 			z += 3;
774 			jj += 3;
775 			}
776 		else if(ai%bs==2)
777 			{
778 			ll_max = m-jj<2 ? m-jj : 2;
779 			for(ll=0; ll<ll_max; ll++)
780 				xt[ll] = x[ll];
781 			for(; ll<2; ll++)
782 				xt[ll] = 0.0;
783 			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
784 			zt[1] = pA[1+bs*1]*xt[1];
785 			pA += bs*sda - 2;
786 			x += 2;
787 			kernel_dgemv_t_4_lib4(m-2-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
788 			ll_max = n-jj<2 ? n-jj : 2;
789 			for(ll=0; ll<ll_max; ll++)
790 				z[ll] = zt[ll];
791 			pA += bs*2;
792 			z += 2;
793 			jj += 2;
794 			}
795 		else // if(ai%bs==3)
796 			{
797 			ll_max = m-jj<1 ? m-jj : 1;
798 			for(ll=0; ll<ll_max; ll++)
799 				xt[ll] = x[ll];
800 			for(; ll<1; ll++)
801 				xt[ll] = 0.0;
802 			zt[0] = pA[0+bs*0]*xt[0];
803 			pA += bs*sda - 3;
804 			x += 1;
805 			kernel_dgemv_t_4_lib4(m-1-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
806 			ll_max = n-jj<1 ? n-jj : 1;
807 			for(ll=0; ll<ll_max; ll++)
808 				z[ll] = zt[ll];
809 			pA += bs*1;
810 			z += 1;
811 			jj += 1;
812 			}
813 
814 		}
815 
816 	for(; jj<n-3; jj+=4)
817 		{
818 		zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
819 		zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
820 		zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
821 		zt[3] = pA[3+bs*3]*x[3];
822 		pA += bs*sda;
823 		x += 4;
824 		kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, z);
825 		pA += bs*4;
826 		z += 4;
827 		}
828 	if(jj<n)
829 		{
830 		ll_max = m-jj<4 ? m-jj : 4;
831 		for(ll=0; ll<ll_max; ll++)
832 			xt[ll] = x[ll];
833 		for(; ll<4; ll++)
834 			xt[ll] = 0.0;
835 		zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
836 		zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
837 		zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
838 		zt[3] = pA[3+bs*3]*xt[3];
839 		pA += bs*sda;
840 		x += 4;
841 		kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
842 		for(ll=0; ll<n-jj; ll++)
843 			z[ll] = zt[ll];
844 //		pA += bs*4;
845 //		z += 4;
846 		}
847 
848 	return;
849 
850 	}
851 
852 
853 
854 // m >= n
blasfeo_dtrmv_ltu(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)855 void blasfeo_dtrmv_ltu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
856 	{
857 
858 	if(m<=0)
859 		return;
860 
861 	const int bs = 4;
862 
863 	int sda = sA->cn;
864 	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
865 	double *x = sx->pa + xi;
866 	double *z = sz->pa + zi;
867 
868 	double xt[4];
869 	double zt[4];
870 
871 	double alpha = 1.0;
872 	double beta = 1.0;
873 
874 	int ii, jj, ll, ll_max;
875 
876 	jj = 0;
877 
878 	if(ai%bs!=0)
879 		{
880 
881 		if(ai%bs==1)
882 			{
883 			ll_max = m-jj<3 ? m-jj : 3;
884 			for(ll=0; ll<ll_max; ll++)
885 				xt[ll] = x[ll];
886 			for(; ll<3; ll++)
887 				xt[ll] = 0.0;
888 			zt[0] = 1.0*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
889 			zt[1] = 1.0*xt[1] + pA[2+bs*1]*xt[2];
890 			zt[2] = 1.0*xt[2];
891 			pA += bs*sda - 1;
892 			x += 3;
893 			kernel_dgemv_t_4_lib4(m-3-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
894 			ll_max = n-jj<3 ? n-jj : 3;
895 			for(ll=0; ll<ll_max; ll++)
896 				z[ll] = zt[ll];
897 			pA += bs*3;
898 			z += 3;
899 			jj += 3;
900 			}
901 		else if(ai%bs==2)
902 			{
903 			ll_max = m-jj<2 ? m-jj : 2;
904 			for(ll=0; ll<ll_max; ll++)
905 				xt[ll] = x[ll];
906 			for(; ll<2; ll++)
907 				xt[ll] = 0.0;
908 			zt[0] = 1.0*xt[0] + pA[1+bs*0]*xt[1];
909 			zt[1] = 1.0*xt[1];
910 			pA += bs*sda - 2;
911 			x += 2;
912 			kernel_dgemv_t_4_lib4(m-2-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
913 			ll_max = n-jj<2 ? n-jj : 2;
914 			for(ll=0; ll<ll_max; ll++)
915 				z[ll] = zt[ll];
916 			pA += bs*2;
917 			z += 2;
918 			jj += 2;
919 			}
920 		else // if(ai%bs==3)
921 			{
922 			ll_max = m-jj<1 ? m-jj : 1;
923 			for(ll=0; ll<ll_max; ll++)
924 				xt[ll] = x[ll];
925 			for(; ll<1; ll++)
926 				xt[ll] = 0.0;
927 			zt[0] = 1.0*xt[0];
928 			pA += bs*sda - 3;
929 			x += 1;
930 			kernel_dgemv_t_4_lib4(m-1-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
931 			ll_max = n-jj<1 ? n-jj : 1;
932 			for(ll=0; ll<ll_max; ll++)
933 				z[ll] = zt[ll];
934 			pA += bs*1;
935 			z += 1;
936 			jj += 1;
937 			}
938 
939 		}
940 
941 	for(; jj<n-3; jj+=4)
942 		{
943 		zt[0] = 1.0*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
944 		zt[1] = 1.0*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
945 		zt[2] = 1.0*x[2] + pA[3+bs*2]*x[3];
946 		zt[3] = 1.0*x[3];
947 		pA += bs*sda;
948 		x += 4;
949 		kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, z);
950 		pA += bs*4;
951 		z += 4;
952 		}
953 	if(jj<n)
954 		{
955 		ll_max = m-jj<4 ? m-jj : 4;
956 		for(ll=0; ll<ll_max; ll++)
957 			xt[ll] = x[ll];
958 		for(; ll<4; ll++)
959 			xt[ll] = 0.0;
960 		zt[0] = 1.0*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
961 		zt[1] = 1.0*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
962 		zt[2] = 1.0*xt[2] + pA[3+bs*2]*xt[3];
963 		zt[3] = 1.0*xt[3];
964 		pA += bs*sda;
965 		x += 4;
966 		kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
967 		for(ll=0; ll<n-jj; ll++)
968 			z[ll] = zt[ll];
969 //		pA += bs*4;
970 //		z += 4;
971 		}
972 
973 	return;
974 
975 	}
976 
977 
978 
blasfeo_dtrmv_unn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)979 void blasfeo_dtrmv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
980 	{
981 
982 	if(m<=0)
983 		return;
984 
985 	if(ai!=0)
986 		{
987 		printf("\nblasfeo_dtrmv_unn: feature not implemented yet: ai=%d\n", ai);
988 		exit(1);
989 		}
990 
991 	const int bs = 4;
992 
993 	int sda = sA->cn;
994 	double *pA = sA->pA + aj*bs; // TODO ai
995 	double *x = sx->pa + xi;
996 	double *z = sz->pa + zi;
997 
998 	int i;
999 
1000 	i=0;
1001 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1002 	for(; i<m-7; i+=8)
1003 		{
1004 		kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
1005 		pA += 8*sda+8*bs;
1006 		x  += 8;
1007 		z  += 8;
1008 		}
1009 #endif
1010 	for(; i<m-3; i+=4)
1011 		{
1012 		kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
1013 		pA += 4*sda+4*bs;
1014 		x  += 4;
1015 		z  += 4;
1016 		}
1017 	if(m>i)
1018 		{
1019 		if(m-i==1)
1020 			{
1021 			z[0] = pA[0+bs*0]*x[0];
1022 			}
1023 		else if(m-i==2)
1024 			{
1025 			z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
1026 			z[1] = pA[1+bs*1]*x[1];
1027 			}
1028 		else // if(m-i==3)
1029 			{
1030 			z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
1031 			z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
1032 			z[2] = pA[2+bs*2]*x[2];
1033 			}
1034 		}
1035 
1036 	return;
1037 
1038 	}
1039 
1040 
1041 
blasfeo_dtrmv_utn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1042 void blasfeo_dtrmv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1043 	{
1044 
1045 	if(m<=0)
1046 		return;
1047 
1048 	if(ai!=0)
1049 		{
1050 		printf("\nblasfeo_dtrmv_utn: feature not implemented yet: ai=%d\n", ai);
1051 		exit(1);
1052 		}
1053 
1054 	const int bs = 4;
1055 
1056 	int sda = sA->cn;
1057 	double *pA = sA->pA + aj*bs; // TODO ai
1058 	double *x = sx->pa + xi;
1059 	double *z = sz->pa + zi;
1060 
1061 	int ii, idx;
1062 
1063 	double *ptrA;
1064 
1065 	ii=0;
1066 	idx = m/bs*bs;
1067 	if(m%bs!=0)
1068 		{
1069 		kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
1070 		ii += m%bs;
1071 		}
1072 	idx -= 4;
1073 	for(; ii<m; ii+=4)
1074 		{
1075 		kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
1076 		idx -= 4;
1077 		}
1078 
1079 	return;
1080 
1081 	}
1082 
1083 
1084 
blasfeo_dtrsv_lnn_mn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1085 void blasfeo_dtrsv_lnn_mn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1086 	{
1087 	if(m==0 | n==0)
1088 		return;
1089 #if defined(DIM_CHECK)
1090 	// non-negative size
1091 	if(m<0) printf("\n****** blasfeo_dtrsv_lnn_mn : m<0 : %d<0 *****\n", m);
1092 	if(n<0) printf("\n****** blasfeo_dtrsv_lnn_mn : n<0 : %d<0 *****\n", n);
1093 	// non-negative offset
1094 	if(ai<0) printf("\n****** blasfeo_dtrsv_lnn_mn : ai<0 : %d<0 *****\n", ai);
1095 	if(aj<0) printf("\n****** blasfeo_dtrsv_lnn_mn : aj<0 : %d<0 *****\n", aj);
1096 	if(xi<0) printf("\n****** blasfeo_dtrsv_lnn_mn : xi<0 : %d<0 *****\n", xi);
1097 	if(zi<0) printf("\n****** blasfeo_dtrsv_lnn_mn : zi<0 : %d<0 *****\n", zi);
1098 	// inside matrix
1099 	// A: m x k
1100 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_lnn_mn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1101 	if(aj+n > sA->n) printf("\n***** blasfeo_dtrsv_lnn_mn : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
1102 	// x: m
1103 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_lnn_mn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1104 	// z: m
1105 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_lnn_mn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1106 #endif
1107 	if(ai!=0)
1108 		{
1109 		printf("\nblasfeo_dtrsv_lnn_mn: feature not implemented yet: ai=%d\n", ai);
1110 		exit(1);
1111 		}
1112 	const int bs = 4;
1113 	int sda = sA->cn;
1114 	double *pA = sA->pA + aj*bs; // TODO ai
1115 	double *dA = sA->dA;
1116 	double *x = sx->pa + xi;
1117 	double *z = sz->pa + zi;
1118 	int ii;
1119 	if(ai==0 & aj==0)
1120 		{
1121 		if(sA->use_dA!=1)
1122 			{
1123 			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1124 			for(ii=0; ii<n; ii++)
1125 				dA[ii] = 1.0 / dA[ii];
1126 			sA->use_dA = 1;
1127 			}
1128 		}
1129 	else
1130 		{
1131 		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1132 		for(ii=0; ii<n; ii++)
1133 			dA[ii] = 1.0 / dA[ii];
1134 		sA->use_dA = 0;
1135 		}
1136 	dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
1137 	return;
1138 	}
1139 
1140 
1141 
blasfeo_dtrsv_ltn_mn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1142 void blasfeo_dtrsv_ltn_mn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1143 	{
1144 	if(m==0)
1145 		return;
1146 #if defined(DIM_CHECK)
1147 	// non-negative size
1148 	if(m<0) printf("\n****** blasfeo_dtrsv_ltn_mn : m<0 : %d<0 *****\n", m);
1149 	if(n<0) printf("\n****** blasfeo_dtrsv_ltn_mn : n<0 : %d<0 *****\n", n);
1150 	// non-negative offset
1151 	if(ai<0) printf("\n****** blasfeo_dtrsv_ltn_mn : ai<0 : %d<0 *****\n", ai);
1152 	if(aj<0) printf("\n****** blasfeo_dtrsv_ltn_mn : aj<0 : %d<0 *****\n", aj);
1153 	if(xi<0) printf("\n****** blasfeo_dtrsv_ltn_mn : xi<0 : %d<0 *****\n", xi);
1154 	if(zi<0) printf("\n****** blasfeo_dtrsv_ltn_mn : zi<0 : %d<0 *****\n", zi);
1155 	// inside matrix
1156 	// A: m x k
1157 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_ltn_mn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1158 	if(aj+n > sA->n) printf("\n***** blasfeo_dtrsv_ltn_mn : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
1159 	// x: m
1160 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_ltn_mn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1161 	// z: m
1162 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_ltn_mn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1163 #endif
1164 	if(ai!=0)
1165 		{
1166 		printf("\nblasfeo_dtrsv_ltn_mn: feature not implemented yet: ai=%d\n", ai);
1167 		exit(1);
1168 		}
1169 	const int bs = 4;
1170 	int sda = sA->cn;
1171 	double *pA = sA->pA + aj*bs; // TODO ai
1172 	double *dA = sA->dA;
1173 	double *x = sx->pa + xi;
1174 	double *z = sz->pa + zi;
1175 	int ii;
1176 	if(ai==0 & aj==0)
1177 		{
1178 		if(sA->use_dA!=1)
1179 			{
1180 			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1181 			for(ii=0; ii<n; ii++)
1182 				dA[ii] = 1.0 / dA[ii];
1183 			sA->use_dA = 1;
1184 			}
1185 		}
1186 	else
1187 		{
1188 		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1189 		for(ii=0; ii<n; ii++)
1190 			dA[ii] = 1.0 / dA[ii];
1191 		sA->use_dA = 0;
1192 		}
1193 	dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
1194 	return;
1195 	}
1196 
1197 
1198 
blasfeo_dtrsv_lnn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1199 void blasfeo_dtrsv_lnn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1200 	{
1201 	if(m==0)
1202 		return;
1203 #if defined(DIM_CHECK)
1204 	// non-negative size
1205 	if(m<0) printf("\n****** blasfeo_dtrsv_lnn : m<0 : %d<0 *****\n", m);
1206 	// non-negative offset
1207 	if(ai<0) printf("\n****** blasfeo_dtrsv_lnn : ai<0 : %d<0 *****\n", ai);
1208 	if(aj<0) printf("\n****** blasfeo_dtrsv_lnn : aj<0 : %d<0 *****\n", aj);
1209 	if(xi<0) printf("\n****** blasfeo_dtrsv_lnn : xi<0 : %d<0 *****\n", xi);
1210 	if(zi<0) printf("\n****** blasfeo_dtrsv_lnn : zi<0 : %d<0 *****\n", zi);
1211 	// inside matrix
1212 	// A: m x k
1213 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_lnn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1214 	if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_lnn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1215 	// x: m
1216 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_lnn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1217 	// z: m
1218 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_lnn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1219 #endif
1220 	if(ai!=0)
1221 		{
1222 		printf("\nblasfeo_dtrsv_lnn: feature not implemented yet: ai=%d\n", ai);
1223 		exit(1);
1224 		}
1225 	const int bs = 4;
1226 	int sda = sA->cn;
1227 	double *pA = sA->pA + aj*bs; // TODO ai
1228 	double *dA = sA->dA;
1229 	double *x = sx->pa + xi;
1230 	double *z = sz->pa + zi;
1231 	int ii;
1232 	if(ai==0 & aj==0)
1233 		{
1234 		if(sA->use_dA!=1)
1235 			{
1236 			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1237 			for(ii=0; ii<m; ii++)
1238 				dA[ii] = 1.0 / dA[ii];
1239 			sA->use_dA = 1;
1240 			}
1241 		}
1242 	else
1243 		{
1244 		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1245 		for(ii=0; ii<m; ii++)
1246 			dA[ii] = 1.0 / dA[ii];
1247 		sA->use_dA = 0;
1248 		}
1249 	dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
1250 	return;
1251 	}
1252 
1253 
1254 
blasfeo_dtrsv_lnu(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1255 void blasfeo_dtrsv_lnu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1256 	{
1257 	if(m==0)
1258 		return;
1259 #if defined(DIM_CHECK)
1260 	// non-negative size
1261 	if(m<0) printf("\n****** blasfeo_dtrsv_lnu : m<0 : %d<0 *****\n", m);
1262 	// non-negative offset
1263 	if(ai<0) printf("\n****** blasfeo_dtrsv_lnu : ai<0 : %d<0 *****\n", ai);
1264 	if(aj<0) printf("\n****** blasfeo_dtrsv_lnu : aj<0 : %d<0 *****\n", aj);
1265 	if(xi<0) printf("\n****** blasfeo_dtrsv_lnu : xi<0 : %d<0 *****\n", xi);
1266 	if(zi<0) printf("\n****** blasfeo_dtrsv_lnu : zi<0 : %d<0 *****\n", zi);
1267 	// inside matrix
1268 	// A: m x k
1269 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_lnu : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1270 	if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_lnu : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1271 	// x: m
1272 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_lnu : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1273 	// z: m
1274 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_lnu : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1275 #endif
1276 //	printf("\n***** blasfeo_dtrsv_lnu : feature not implemented yet *****\n");
1277 //	exit(1);
1278 	if(ai!=0)
1279 		{
1280 		printf("\nblasfeo_dtrsv_lnn: feature not implemented yet: ai=%d\n", ai);
1281 		exit(1);
1282 		}
1283 	const int bs = 4;
1284 	int sda = sA->cn;
1285 	double *pA = sA->pA + aj*bs; // TODO ai
1286 	double *dA = sA->dA;
1287 	double *x = sx->pa + xi;
1288 	double *z = sz->pa + zi;
1289 	int ii;
1290 	if(x!=z)
1291 		{
1292 		for(ii=0; ii<m; ii++)
1293 			z[ii] = x[ii];
1294 		}
1295 	ii = 0;
1296 	for( ; ii<m-3; ii+=4)
1297 		{
1298 		kernel_dtrsv_ln_one_4_lib4(ii, &pA[ii*sda], z, &z[ii], &z[ii]);
1299 		}
1300 	if(ii<m)
1301 		{
1302 		kernel_dtrsv_ln_one_4_vs_lib4(ii, &pA[ii*sda], z, &z[ii], &z[ii], m-ii, m-ii);
1303 		ii+=4;
1304 		}
1305 	return;
1306 	}
1307 
1308 
1309 
blasfeo_dtrsv_ltn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1310 void blasfeo_dtrsv_ltn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1311 	{
1312 	if(m==0)
1313 		return;
1314 #if defined(DIM_CHECK)
1315 	// non-negative size
1316 	if(m<0) printf("\n****** blasfeo_dtrsv_ltn : m<0 : %d<0 *****\n", m);
1317 	// non-negative offset
1318 	if(ai<0) printf("\n****** blasfeo_dtrsv_ltn : ai<0 : %d<0 *****\n", ai);
1319 	if(aj<0) printf("\n****** blasfeo_dtrsv_ltn : aj<0 : %d<0 *****\n", aj);
1320 	if(xi<0) printf("\n****** blasfeo_dtrsv_ltn : xi<0 : %d<0 *****\n", xi);
1321 	if(zi<0) printf("\n****** blasfeo_dtrsv_ltn : zi<0 : %d<0 *****\n", zi);
1322 	// inside matrix
1323 	// A: m x k
1324 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_ltn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1325 	if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_ltn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1326 	// x: m
1327 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_ltn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1328 	// z: m
1329 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_ltn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1330 #endif
1331 	if(ai!=0)
1332 		{
1333 		printf("\nblasfeo_dtrsv_ltn: feature not implemented yet: ai=%d\n", ai);
1334 		exit(1);
1335 		}
1336 	const int bs = 4;
1337 	int sda = sA->cn;
1338 	double *pA = sA->pA + aj*bs; // TODO ai
1339 	double *dA = sA->dA;
1340 	double *x = sx->pa + xi;
1341 	double *z = sz->pa + zi;
1342 	int ii;
1343 	if(ai==0 & aj==0)
1344 		{
1345 		if(sA->use_dA!=1)
1346 			{
1347 			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1348 			for(ii=0; ii<m; ii++)
1349 				dA[ii] = 1.0 / dA[ii];
1350 			sA->use_dA = 1;
1351 			}
1352 		}
1353 	else
1354 		{
1355 		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1356 		for(ii=0; ii<m; ii++)
1357 			dA[ii] = 1.0 / dA[ii];
1358 		sA->use_dA = 0;
1359 		}
1360 	dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
1361 	return;
1362 	}
1363 
1364 
1365 
blasfeo_dtrsv_ltu(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1366 void blasfeo_dtrsv_ltu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1367 	{
1368 	if(m==0)
1369 		return;
1370 #if defined(DIM_CHECK)
1371 	// non-negative size
1372 	if(m<0) printf("\n****** blasfeo_dtrsv_ltu : m<0 : %d<0 *****\n", m);
1373 	// non-negative offset
1374 	if(ai<0) printf("\n****** blasfeo_dtrsv_ltu : ai<0 : %d<0 *****\n", ai);
1375 	if(aj<0) printf("\n****** blasfeo_dtrsv_ltu : aj<0 : %d<0 *****\n", aj);
1376 	if(xi<0) printf("\n****** blasfeo_dtrsv_ltu : xi<0 : %d<0 *****\n", xi);
1377 	if(zi<0) printf("\n****** blasfeo_dtrsv_ltu : zi<0 : %d<0 *****\n", zi);
1378 	// inside matrix
1379 	// A: m x k
1380 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_ltu : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1381 	if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_ltu : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1382 	// x: m
1383 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_ltu : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1384 	// z: m
1385 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_ltu : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1386 #endif
1387 //	printf("\n***** blasfeo_dtrsv_ltu : feature not implemented yet *****\n");
1388 //	exit(1);
1389 	const int bs = 4;
1390 	int sda = sA->cn;
1391 	double *pA = sA->pA + aj*bs; // TODO ai
1392 	double *dA = sA->dA;
1393 	double *x = sx->pa + xi;
1394 	double *z = sz->pa + zi;
1395 	int ii;
1396 	if(x!=z)
1397 		for(ii=0; ii<m; ii++)
1398 			z[ii] = x[ii];
1399 	ii=0;
1400 	if(m%4==1)
1401 		{
1402 		kernel_dtrsv_lt_one_1_lib4(1, &pA[m/bs*bs*sda+(m-1)*bs], sda, &z[m-1], &z[m-1], &z[m-1]);
1403 		ii++;
1404 		}
1405 	else if(m%4==2)
1406 		{
1407 		kernel_dtrsv_lt_one_2_lib4(2, &pA[m/bs*bs*sda+(m-2)*bs], sda, &z[m-2], &z[m-2], &z[m-2]);
1408 		ii+=2;
1409 		}
1410 	else if(m%4==3)
1411 		{
1412 		kernel_dtrsv_lt_one_3_lib4(3, &pA[m/bs*bs*sda+(m-3)*bs], sda, &z[m-3], &z[m-3], &z[m-3]);
1413 		ii+=3;
1414 		}
1415 	for(; ii<m-3; ii+=4)
1416 		{
1417 		kernel_dtrsv_lt_one_4_lib4(ii+4, &pA[(m-ii-4)/bs*bs*sda+(m-ii-4)*bs], sda, &z[m-ii-4], &z[m-ii-4], &z[m-ii-4]);
1418 		}
1419 	return;
1420 	}
1421 
1422 
1423 
blasfeo_dtrsv_unn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1424 void blasfeo_dtrsv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1425 	{
1426 	if(m==0)
1427 		return;
1428 #if defined(DIM_CHECK)
1429 	// non-negative size
1430 	if(m<0) printf("\n****** blasfeo_dtrsv_unn : m<0 : %d<0 *****\n", m);
1431 	// non-negative offset
1432 	if(ai<0) printf("\n****** blasfeo_dtrsv_unn : ai<0 : %d<0 *****\n", ai);
1433 	if(aj<0) printf("\n****** blasfeo_dtrsv_unn : aj<0 : %d<0 *****\n", aj);
1434 	if(xi<0) printf("\n****** blasfeo_dtrsv_unn : xi<0 : %d<0 *****\n", xi);
1435 	if(zi<0) printf("\n****** blasfeo_dtrsv_unn : zi<0 : %d<0 *****\n", zi);
1436 	// inside matrix
1437 	// A: m x k
1438 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_unn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1439 	if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_unn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1440 	// x: m
1441 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_unn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1442 	// z: m
1443 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_unn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1444 #endif
1445 //	printf("\n***** blasfeo_dtrsv_unn : feature not implemented yet *****\n");
1446 //	exit(1);
1447 	const int bs = 4;
1448 	int sda = sA->cn;
1449 	double *pA = sA->pA + aj*bs; // TODO ai
1450 	double *dA = sA->dA;
1451 	double *x = sx->pa + xi;
1452 	double *z = sz->pa + zi;
1453 	int ii;
1454 	if(ai==0 & aj==0)
1455 		{
1456 		if(sA->use_dA!=1)
1457 			{
1458 			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1459 			for(ii=0; ii<m; ii++)
1460 				dA[ii] = 1.0 / dA[ii];
1461 			sA->use_dA = 1;
1462 			}
1463 		}
1464 	else
1465 		{
1466 		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1467 		for(ii=0; ii<m; ii++)
1468 			dA[ii] = 1.0 / dA[ii];
1469 		sA->use_dA = 0;
1470 		}
1471 	if(x!=z)
1472 		{
1473 		for(ii=0; ii<m; ii++)
1474 			z[ii] = x[ii];
1475 		}
1476 	ii = 0;
1477 	if(m%4==1)
1478 		{
1479 		z[m-ii-1] *= dA[m-ii-1];
1480 		ii+=1;
1481 		}
1482 	else if(m%4==2)
1483 		{
1484 		z[m-ii-1] *= dA[m-ii-1];
1485 		z[m-ii-2] -= pA[m/bs*bs*sda+(m-ii-1)*bs]*z[m-ii-1];
1486 		z[m-ii-2] *= dA[m-ii-2];
1487 		ii+=2;
1488 		}
1489 	else if(m%4==3)
1490 		{
1491 		z[m-ii-1] *= dA[m-ii-1];
1492 		z[m-ii-2] -= pA[1+m/bs*bs*sda+(m-ii-1)*bs]*z[m-ii-1];
1493 		z[m-ii-2] *= dA[m-ii-2];
1494 		z[m-ii-3] -= pA[m/bs*bs*sda+(m-ii-2)*bs]*z[m-ii-2];
1495 		z[m-ii-3] -= pA[m/bs*bs*sda+(m-ii-1)*bs]*z[m-ii-1];
1496 		z[m-ii-3] *= dA[m-ii-3];
1497 		ii+=3;
1498 		}
1499 	for(; ii<m-3; ii+=4)
1500 		{
1501 		// TODO
1502 		kernel_dtrsv_un_inv_4_lib4(ii+4, &pA[(m-ii-4)/bs*bs*sda+(m-ii-4)*bs], &dA[m-ii-4], &z[m-ii-4], &z[m-ii-4], &z[m-ii-4]);
1503 		}
1504 	return;
1505 	}
1506 
1507 
1508 
blasfeo_dtrsv_utn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1509 void blasfeo_dtrsv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1510 	{
1511 	if(m==0)
1512 		return;
1513 #if defined(DIM_CHECK)
1514 	// non-negative size
1515 	if(m<0) printf("\n****** blasfeo_dtrsv_utn : m<0 : %d<0 *****\n", m);
1516 	// non-negative offset
1517 	if(ai<0) printf("\n****** blasfeo_dtrsv_utn : ai<0 : %d<0 *****\n", ai);
1518 	if(aj<0) printf("\n****** blasfeo_dtrsv_utn : aj<0 : %d<0 *****\n", aj);
1519 	if(xi<0) printf("\n****** blasfeo_dtrsv_utn : xi<0 : %d<0 *****\n", xi);
1520 	if(zi<0) printf("\n****** blasfeo_dtrsv_utn : zi<0 : %d<0 *****\n", zi);
1521 	// inside matrix
1522 	// A: m x k
1523 	if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_utn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1524 	if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_utn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1525 	// x: m
1526 	if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_utn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1527 	// z: m
1528 	if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_utn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1529 #endif
1530 //	printf("\n***** blasfeo_dtrsv_utn : feature not implemented yet *****\n");
1531 //	exit(1);
1532 	const int bs = 4;
1533 	int sda = sA->cn;
1534 	double *pA = sA->pA + aj*bs; // TODO ai
1535 	double *dA = sA->dA;
1536 	double *x = sx->pa + xi;
1537 	double *z = sz->pa + zi;
1538 	int ii;
1539 	if(ai==0 & aj==0)
1540 		{
1541 		if(sA->use_dA!=1)
1542 			{
1543 			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1544 			for(ii=0; ii<m; ii++)
1545 				dA[ii] = 1.0 / dA[ii];
1546 			sA->use_dA = 1;
1547 			}
1548 		}
1549 	else
1550 		{
1551 		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1552 		for(ii=0; ii<m; ii++)
1553 			dA[ii] = 1.0 / dA[ii];
1554 		sA->use_dA = 0;
1555 		}
1556 	if(x!=z)
1557 		{
1558 		for(ii=0; ii<m; ii++)
1559 			z[ii] = x[ii];
1560 		}
1561 	ii = 0;
1562 	for( ; ii<m-3; ii+=4)
1563 		{
1564 		kernel_dtrsv_ut_inv_4_lib4(ii, &pA[ii*bs], sda, &dA[ii], z, &z[ii], &z[ii]);
1565 		}
1566 	if(ii<m)
1567 		{
1568 		kernel_dtrsv_ut_inv_4_vs_lib4(ii, &pA[ii*bs], sda, &dA[ii], z, &z[ii], &z[ii], m-ii, m-ii);
1569 		ii+=4;
1570 		}
1571 	return;
1572 	}
1573 
1574 
1575 
1576 #else
1577 
1578 #error : wrong LA choice
1579 
1580 #endif
1581