1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS for embedded optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36 #include <stdlib.h>
37 #include <stdio.h>
38
39 #include "../include/blasfeo_common.h"
40 #include "../include/blasfeo_d_kernel.h"
41 #include "../include/blasfeo_d_blas.h"
42 #include "../include/blasfeo_d_aux.h"
43
44
45
dtrsv_ln_inv_lib(int m,int n,double * pA,int sda,double * inv_diag_A,double * x,double * y)46 void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
47 {
48
49 if(m<=0 || n<=0)
50 return;
51
52 // suppose m>=n
53 if(m<n)
54 m = n;
55
56 const int bs = 4;
57
58 double alpha = -1.0;
59 double beta = 1.0;
60
61 int i;
62
63 if(x!=y)
64 {
65 for(i=0; i<m; i++)
66 y[i] = x[i];
67 }
68
69 i = 0;
70 for( ; i<n-3; i+=4)
71 {
72 kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
73 }
74 if(i<n)
75 {
76 kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
77 i+=4;
78 }
79 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
80 for( ; i<m-7; i+=8)
81 {
82 kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
83 }
84 if(i<m-3)
85 {
86 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
87 i+=4;
88 }
89 #else
90 for( ; i<m-3; i+=4)
91 {
92 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
93 }
94 #endif
95 if(i<m)
96 {
97 kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], m-i);
98 i+=4;
99 }
100
101 }
102
103
104
dtrsv_lt_inv_lib(int m,int n,double * pA,int sda,double * inv_diag_A,double * x,double * y)105 void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
106 {
107
108 if(m<=0 || n<=0)
109 return;
110
111 if(n>m)
112 n = m;
113
114 const int bs = 4;
115
116 int i;
117
118 if(x!=y)
119 for(i=0; i<m; i++)
120 y[i] = x[i];
121
122 i=0;
123 if(n%4==1)
124 {
125 kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
126 i++;
127 }
128 else if(n%4==2)
129 {
130 kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
131 i+=2;
132 }
133 else if(n%4==3)
134 {
135 kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
136 i+=3;
137 }
138 for(; i<n-3; i+=4)
139 {
140 kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
141 }
142
143 }
144
145
146
dgemv_nt_lib(int m,int n,double alpha_n,double alpha_t,double * pA,int sda,double * x_n,double * x_t,double beta_n,double beta_t,double * y_n,double * y_t,double * z_n,double * z_t)147 void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
148 {
149
150 if(m<=0 | n<=0)
151 return;
152
153 const int bs = 4;
154
155 int ii;
156
157 // copy and scale y_n int z_n
158 ii = 0;
159 for(; ii<m-3; ii+=4)
160 {
161 z_n[ii+0] = beta_n*y_n[ii+0];
162 z_n[ii+1] = beta_n*y_n[ii+1];
163 z_n[ii+2] = beta_n*y_n[ii+2];
164 z_n[ii+3] = beta_n*y_n[ii+3];
165 }
166 for(; ii<m; ii++)
167 {
168 z_n[ii+0] = beta_n*y_n[ii+0];
169 }
170
171 ii = 0;
172 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
173 for(; ii<n-5; ii+=6)
174 {
175 kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
176 }
177 #endif
178 for(; ii<n-3; ii+=4)
179 {
180 kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
181 }
182 if(ii<n)
183 {
184 kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
185 }
186
187 return;
188
189 }
190
191
192
193 #if defined(LA_HIGH_PERFORMANCE)
194
195
196
blasfeo_dgemv_n(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,double beta,struct blasfeo_dvec * sy,int yi,struct blasfeo_dvec * sz,int zi)197 void blasfeo_dgemv_n(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi)
198 {
199
200 if(m<0)
201 return;
202
203 const int bs = 4;
204
205 int i;
206
207 int sda = sA->cn;
208 double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
209 double *x = sx->pa + xi;
210 double *y = sy->pa + yi;
211 double *z = sz->pa + zi;
212
213 i = 0;
214 // clean up at the beginning
215 if(ai%bs!=0)
216 {
217 kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
218 pA += bs*sda;
219 y += 4 - ai%bs;
220 z += 4 - ai%bs;
221 m -= 4 - ai%bs;
222 }
223 // main loop
224 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
225 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
226 for( ; i<m-11; i+=12)
227 {
228 kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
229 }
230 #endif
231 for( ; i<m-7; i+=8)
232 {
233 kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
234 }
235 if(i<m-3)
236 {
237 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
238 i+=4;
239 }
240 #else
241 for( ; i<m-3; i+=4)
242 {
243 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
244 }
245 #endif
246 if(i<m)
247 {
248 kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
249 }
250
251 return;
252
253 }
254
255
256
blasfeo_dgemv_t(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,double beta,struct blasfeo_dvec * sy,int yi,struct blasfeo_dvec * sz,int zi)257 void blasfeo_dgemv_t(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi)
258 {
259
260 if(n<=0)
261 return;
262
263 const int bs = 4;
264
265 int i;
266
267 int sda = sA->cn;
268 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
269 int offsetA = ai%bs;
270 double *x = sx->pa + xi;
271 double *y = sy->pa + yi;
272 double *z = sz->pa + zi;
273
274 i = 0;
275 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
276 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
277 for( ; i<n-11; i+=12)
278 {
279 kernel_dgemv_t_12_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
280 }
281 #endif
282 for( ; i<n-7; i+=8)
283 {
284 kernel_dgemv_t_8_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
285 }
286 if(i<n-3)
287 {
288 kernel_dgemv_t_4_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
289 i+=4;
290 }
291 #else
292 for( ; i<n-3; i+=4)
293 {
294 kernel_dgemv_t_4_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
295 }
296 #endif
297 if(i<n)
298 {
299 kernel_dgemv_t_4_vs_lib4(m, &alpha, offsetA, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
300 }
301
302 return;
303
304 }
305
306
307
blasfeo_dgemv_nt(int m,int n,double alpha_n,double alpha_t,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx_n,int xi_n,struct blasfeo_dvec * sx_t,int xi_t,double beta_n,double beta_t,struct blasfeo_dvec * sy_n,int yi_n,struct blasfeo_dvec * sy_t,int yi_t,struct blasfeo_dvec * sz_n,int zi_n,struct blasfeo_dvec * sz_t,int zi_t)308 void blasfeo_dgemv_nt(int m, int n, double alpha_n, double alpha_t, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx_n, int xi_n, struct blasfeo_dvec *sx_t, int xi_t, double beta_n, double beta_t, struct blasfeo_dvec *sy_n, int yi_n, struct blasfeo_dvec *sy_t, int yi_t, struct blasfeo_dvec *sz_n, int zi_n, struct blasfeo_dvec *sz_t, int zi_t)
309 {
310 if(ai!=0)
311 {
312 printf("\nblasfeo_dgemv_nt: feature not implemented yet: ai=%d\n", ai);
313 exit(1);
314 }
315 const int bs = 4;
316 #if defined(TARGET_X86_AMD_BARCELONA) | defined(TARGET_X86_AMD_JAGUAR)
317 blasfeo_dgemv_n(m, n, alpha_n, sA, ai, aj, sx_n, xi_n, beta_n, sy_n, yi_n, sz_n, zi_n);
318 blasfeo_dgemv_t(m, n, alpha_t, sA, ai, aj, sx_t, xi_t, beta_t, sy_t, yi_t, sz_t, zi_t);
319 return;
320 #endif
321 int sda = sA->cn;
322 double *pA = sA->pA + aj*bs; // TODO ai
323 double *x_n = sx_n->pa + xi_n;
324 double *x_t = sx_t->pa + xi_t;
325 double *y_n = sy_n->pa + yi_n;
326 double *y_t = sy_t->pa + yi_t;
327 double *z_n = sz_n->pa + zi_n;
328 double *z_t = sz_t->pa + zi_t;
329 dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
330 return;
331 }
332
333
334
335 // m >= n
blasfeo_dsymv_l(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,double beta,struct blasfeo_dvec * sy,int yi,struct blasfeo_dvec * sz,int zi)336 void blasfeo_dsymv_l(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi)
337 {
338
339 if(m<n)
340 n = m;
341
342 if(m<=0 | n<=0)
343 return;
344
345 const int bs = 4;
346
347 int ii, n1;
348
349 int sda = sA->cn;
350 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
351 double *x = sx->pa + xi;
352 double *y = sy->pa + yi;
353 double *z = sz->pa + zi;
354
355 // copy and scale y int z
356 ii = 0;
357 for(; ii<m-3; ii+=4)
358 {
359 z[ii+0] = beta*y[ii+0];
360 z[ii+1] = beta*y[ii+1];
361 z[ii+2] = beta*y[ii+2];
362 z[ii+3] = beta*y[ii+3];
363 }
364 for(; ii<m; ii++)
365 {
366 z[ii+0] = beta*y[ii+0];
367 }
368
369 // clean up at the beginning
370 if(ai%bs!=0) // 1, 2, 3
371 {
372 n1 = 4-ai%bs;
373 kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
374 pA += n1 + n1*bs + (sda-1)*bs;
375 x += n1;
376 z += n1;
377 m -= n1;
378 n -= n1;
379 }
380
381 #if defined(TARGET_X86_AMD_BARCELONA) | defined(TARGET_X86_AMD_JAGUAR)
382 // using dgemv_n and dgemv_t kernels
383 double beta1 = 1.0;
384 double xx[4];
385 for(ii=0; ii<n-3; ii+=4)
386 {
387 // gemv_n
388 kernel_dgemv_n_4_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii);
389 // 4x4
390 z[ii+0] += alpha*(pA[ii*sda+0+bs*(ii+0)]*x[ii+0] + pA[ii*sda+1+bs*(ii+0)]*x[ii+1] + pA[ii*sda+2+bs*(ii+0)]*x[ii+2] + pA[ii*sda+3+bs*(ii+0)]*x[ii+3]);
391 z[ii+1] += alpha*(pA[ii*sda+1+bs*(ii+0)]*x[ii+0] + pA[ii*sda+1+bs*(ii+1)]*x[ii+1] + pA[ii*sda+2+bs*(ii+1)]*x[ii+2] + pA[ii*sda+3+bs*(ii+1)]*x[ii+3]);
392 z[ii+2] += alpha*(pA[ii*sda+2+bs*(ii+0)]*x[ii+0] + pA[ii*sda+2+bs*(ii+1)]*x[ii+1] + pA[ii*sda+2+bs*(ii+2)]*x[ii+2] + pA[ii*sda+3+bs*(ii+2)]*x[ii+3]);
393 z[ii+3] += alpha*(pA[ii*sda+3+bs*(ii+0)]*x[ii+0] + pA[ii*sda+3+bs*(ii+1)]*x[ii+1] + pA[ii*sda+3+bs*(ii+2)]*x[ii+2] + pA[ii*sda+3+bs*(ii+3)]*x[ii+3]);
394 // gemv_t
395 kernel_dgemv_t_4_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii);
396 }
397 if(ii<n)
398 {
399 if(n-ii==1)
400 {
401 xx[0] = alpha*x[ii+0];
402 xx[1] = alpha*x[ii+1];
403 xx[2] = alpha*x[ii+2];
404 xx[3] = alpha*x[ii+3];
405 // gemv_n
406 kernel_dgemv_n_4_vs_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
407 // 4x4
408 z[ii+0] += pA[ii*sda+0+bs*(ii+0)]*xx[0];
409 if(m-ii>1)
410 {
411 z[ii+0] += pA[ii*sda+1+bs*(ii+0)]*xx[1];
412 z[ii+1] += pA[ii*sda+1+bs*(ii+0)]*xx[0];
413 }
414 if(m-ii>2)
415 {
416 z[ii+0] += pA[ii*sda+2+bs*(ii+0)]*xx[2];
417 z[ii+2] += pA[ii*sda+2+bs*(ii+0)]*xx[0];
418 }
419 if(m-ii>3)
420 {
421 z[ii+0] += pA[ii*sda+3+bs*(ii+0)]*xx[3];
422 z[ii+3] += pA[ii*sda+3+bs*(ii+0)]*xx[0];
423 }
424 // gemv_t
425 kernel_dgemv_t_4_vs_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii, n-ii);
426 ii += 4;
427 }
428 else if(n-ii==2)
429 {
430 xx[0] = alpha*x[ii+0];
431 xx[1] = alpha*x[ii+1];
432 xx[2] = alpha*x[ii+2];
433 xx[3] = alpha*x[ii+3];
434 // gemv_n
435 kernel_dgemv_n_4_vs_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
436 // 4x4
437 z[ii+0] += pA[ii*sda+0+bs*(ii+0)]*xx[0];
438 if(m-ii>1)
439 {
440 z[ii+0] += pA[ii*sda+1+bs*(ii+0)]*xx[1];
441 z[ii+1] += pA[ii*sda+1+bs*(ii+0)]*xx[0] + pA[ii*sda+1+bs*(ii+1)]*xx[1];
442 }
443 if(m-ii>2)
444 {
445 z[ii+0] += pA[ii*sda+2+bs*(ii+0)]*xx[2];
446 z[ii+1] += pA[ii*sda+2+bs*(ii+1)]*xx[2];
447 z[ii+2] += pA[ii*sda+2+bs*(ii+0)]*xx[0] + pA[ii*sda+2+bs*(ii+1)]*xx[1];
448 }
449 if(m-ii>3)
450 {
451 z[ii+0] += pA[ii*sda+3+bs*(ii+0)]*xx[3];
452 z[ii+1] += pA[ii*sda+3+bs*(ii+1)]*xx[3];
453 z[ii+3] += pA[ii*sda+3+bs*(ii+0)]*xx[0] + pA[ii*sda+3+bs*(ii+1)]*xx[1];
454 }
455 // gemv_t
456 kernel_dgemv_t_4_vs_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii, n-ii);
457 ii += 4;
458 }
459 else // if(n-ii==3)
460 {
461 xx[0] = alpha*x[ii+0];
462 xx[1] = alpha*x[ii+1];
463 xx[2] = alpha*x[ii+2];
464 xx[3] = alpha*x[ii+3];
465 // gemv_n
466 kernel_dgemv_n_4_vs_lib4(ii, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
467 // 4x4
468 z[ii+0] += pA[ii*sda+0+bs*(ii+0)]*xx[0];
469 if(m-ii>1)
470 {
471 z[ii+0] += pA[ii*sda+1+bs*(ii+0)]*xx[1];
472 z[ii+1] += pA[ii*sda+1+bs*(ii+0)]*xx[0] + pA[ii*sda+1+bs*(ii+1)]*xx[1];
473 }
474 if(m-ii>2)
475 {
476 z[ii+0] += pA[ii*sda+2+bs*(ii+0)]*xx[2];
477 z[ii+1] += pA[ii*sda+2+bs*(ii+1)]*xx[2];
478 z[ii+2] += pA[ii*sda+2+bs*(ii+0)]*xx[0] + pA[ii*sda+2+bs*(ii+1)]*xx[1] + pA[ii*sda+2+bs*(ii+2)]*xx[2];
479 }
480 if(m-ii>3)
481 {
482 z[ii+0] += pA[ii*sda+3+bs*(ii+0)]*xx[3];
483 z[ii+1] += pA[ii*sda+3+bs*(ii+1)]*xx[3];
484 z[ii+2] += pA[ii*sda+3+bs*(ii+2)]*xx[3];
485 z[ii+3] += pA[ii*sda+3+bs*(ii+0)]*xx[0] + pA[ii*sda+3+bs*(ii+1)]*xx[1] + pA[ii*sda+3+bs*(ii+2)]*xx[2];
486 }
487 // gemv_t
488 kernel_dgemv_t_4_vs_lib4(m-ii-4, &alpha, 0, pA+(ii+4)*sda+ii*bs, sda, x+ii+4, &beta1, z+ii, z+ii, n-ii);
489 ii += 4;
490 }
491 for( ; ii<m-3; ii+=4)
492 {
493 // gemv_n
494 kernel_dgemv_n_4_lib4(n, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii);
495 }
496 if(ii<m)
497 {
498 // gemv_n
499 kernel_dgemv_n_4_vs_lib4(n, &alpha, pA+ii*sda, x, &beta1, z+ii, z+ii, m-ii);
500 }
501 }
502 return;
503 #endif
504
505 // main loop
506 ii = 0;
507 for(; ii<n-3; ii+=4)
508 {
509 kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
510 }
511 // clean up at the end
512 if(ii<n)
513 {
514 kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
515 }
516
517 return;
518 }
519
520
521
522 // m >= n
blasfeo_dtrmv_lnn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)523 void blasfeo_dtrmv_lnn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
524 {
525
526 if(m<=0)
527 return;
528
529 const int bs = 4;
530
531 int sda = sA->cn;
532 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
533 double *x = sx->pa + xi;
534 double *z = sz->pa + zi;
535
536 if(m-n>0)
537 blasfeo_dgemv_n(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
538
539 double *pA2 = pA;
540 double *z2 = z;
541 int m2 = n;
542 int n2 = 0;
543 double *pA3, *x3;
544
545 double alpha = 1.0;
546 double beta = 1.0;
547
548 double zt[4] = {0, 0, 0, 0};
549 double xt[4] = {0, 0, 0, 0};
550
551 int ii, jj, jj_end;
552
553 ii = 0;
554
555 if(ai%4!=0)
556 {
557 pA2 += sda*bs - ai%bs;
558 z2 += bs-ai%bs;
559 m2 -= bs-ai%bs;
560 n2 += bs-ai%bs;
561 }
562
563 pA2 += m2/bs*bs*sda;
564 z2 += m2/bs*bs;
565 n2 += m2/bs*bs;
566
567 if(m2%bs != 0)
568 {
569 //
570 pA3 = pA2 + bs*n2;
571 x3 = x + n2;
572
573 // access only valid memory
574 for(jj=0; jj<m2%bs; jj++)
575 xt[jj] = x3[jj];
576
577 zt[2] = pA3[2+bs*0]*xt[0] + pA3[2+bs*1]*xt[1] + pA3[2+bs*2]*xt[2];
578 zt[1] = pA3[1+bs*0]*xt[0] + pA3[1+bs*1]*xt[1];
579 zt[0] = pA3[0+bs*0]*xt[0];
580 kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
581 for(jj=0; jj<m2%bs; jj++)
582 z2[jj] = zt[jj];
583 }
584 for(; ii<m2-3; ii+=4)
585 {
586 pA2 -= bs*sda;
587 z2 -= 4;
588 n2 -= 4;
589 pA3 = pA2 + bs*n2;
590 x3 = x + n2;
591 z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
592 z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
593 z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
594 z2[0] = pA3[0+bs*0]*x3[0];
595 kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
596 }
597 if(ai%4!=0)
598 {
599 if(ai%bs==1)
600 {
601 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
602 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
603 zt[0] = pA[0+bs*0]*x[0];
604 jj_end = 4-ai%bs<n ? 4-ai%bs : n;
605 for(jj=0; jj<jj_end; jj++)
606 z[jj] = zt[jj];
607 }
608 else if(ai%bs==2)
609 {
610 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
611 zt[0] = pA[0+bs*0]*x[0];
612 jj_end = 4-ai%bs<n ? 4-ai%bs : n;
613 for(jj=0; jj<jj_end; jj++)
614 z[jj] = zt[jj];
615 }
616 else // if (ai%bs==3)
617 {
618 z[0] = pA[0+bs*0]*x[0];
619 }
620 }
621
622 return;
623
624 }
625
626
627
628 // m >= n
blasfeo_dtrmv_lnu(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)629 void blasfeo_dtrmv_lnu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
630 {
631
632 if(m<=0)
633 return;
634
635 const int bs = 4;
636
637 int sda = sA->cn;
638 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
639 double *x = sx->pa + xi;
640 double *z = sz->pa + zi;
641
642 if(m-n>0)
643 blasfeo_dgemv_n(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
644
645 double *pA2 = pA;
646 double *z2 = z;
647 int m2 = n;
648 int n2 = 0;
649 double *pA3, *x3;
650
651 double alpha = 1.0;
652 double beta = 1.0;
653
654 double zt[4];
655
656 int ii, jj, jj_end;
657
658 ii = 0;
659
660 if(ai%4!=0)
661 {
662 pA2 += sda*bs - ai%bs;
663 z2 += bs-ai%bs;
664 m2 -= bs-ai%bs;
665 n2 += bs-ai%bs;
666 }
667
668 pA2 += m2/bs*bs*sda;
669 z2 += m2/bs*bs;
670 n2 += m2/bs*bs;
671
672 if(m2%bs!=0)
673 {
674 //
675 pA3 = pA2 + bs*n2;
676 x3 = x + n2;
677 zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + 1.0*x3[3];
678 zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + 1.0*x3[2];
679 zt[1] = pA3[1+bs*0]*x3[0] + 1.0*x3[1];
680 zt[0] = 1.0*x3[0];
681 kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
682 for(jj=0; jj<m2%bs; jj++)
683 z2[jj] = zt[jj];
684 }
685 for(; ii<m2-3; ii+=4)
686 {
687 pA2 -= bs*sda;
688 z2 -= 4;
689 n2 -= 4;
690 pA3 = pA2 + bs*n2;
691 x3 = x + n2;
692 z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + 1.0*x3[3];
693 z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + 1.0*x3[2];
694 z2[1] = pA3[1+bs*0]*x3[0] + 1.0*x3[1];
695 z2[0] = 1.0*x3[0];
696 kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
697 }
698 if(ai%4!=0)
699 {
700 if(ai%bs==1)
701 {
702 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + 1.0*x[2];
703 zt[1] = pA[1+bs*0]*x[0] + 1.0*x[1];
704 zt[0] = 1.0*x[0];
705 jj_end = 4-ai%bs<n ? 4-ai%bs : n;
706 for(jj=0; jj<jj_end; jj++)
707 z[jj] = zt[jj];
708 }
709 else if(ai%bs==2)
710 {
711 zt[1] = pA[1+bs*0]*x[0] + 1.0*x[1];
712 zt[0] = 1.0*x[0];
713 jj_end = 4-ai%bs<n ? 4-ai%bs : n;
714 for(jj=0; jj<jj_end; jj++)
715 z[jj] = zt[jj];
716 }
717 else // if (ai%bs==3)
718 {
719 z[0] = 1.0*x[0];
720 }
721 }
722
723 return;
724
725 }
726
727
728
729 // m >= n
blasfeo_dtrmv_ltn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)730 void blasfeo_dtrmv_ltn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
731 {
732
733 if(m<=0)
734 return;
735
736 const int bs = 4;
737
738 int sda = sA->cn;
739 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
740 double *x = sx->pa + xi;
741 double *z = sz->pa + zi;
742
743 double xt[4];
744 double zt[4];
745
746 double alpha = 1.0;
747 double beta = 1.0;
748
749 int ii, jj, ll, ll_max;
750
751 jj = 0;
752
753 if(ai%bs!=0)
754 {
755
756 if(ai%bs==1)
757 {
758 ll_max = m-jj<3 ? m-jj : 3;
759 for(ll=0; ll<ll_max; ll++)
760 xt[ll] = x[ll];
761 for(; ll<3; ll++)
762 xt[ll] = 0.0;
763 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
764 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
765 zt[2] = pA[2+bs*2]*xt[2];
766 pA += bs*sda - 1;
767 x += 3;
768 kernel_dgemv_t_4_lib4(m-3-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
769 ll_max = n-jj<3 ? n-jj : 3;
770 for(ll=0; ll<ll_max; ll++)
771 z[ll] = zt[ll];
772 pA += bs*3;
773 z += 3;
774 jj += 3;
775 }
776 else if(ai%bs==2)
777 {
778 ll_max = m-jj<2 ? m-jj : 2;
779 for(ll=0; ll<ll_max; ll++)
780 xt[ll] = x[ll];
781 for(; ll<2; ll++)
782 xt[ll] = 0.0;
783 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
784 zt[1] = pA[1+bs*1]*xt[1];
785 pA += bs*sda - 2;
786 x += 2;
787 kernel_dgemv_t_4_lib4(m-2-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
788 ll_max = n-jj<2 ? n-jj : 2;
789 for(ll=0; ll<ll_max; ll++)
790 z[ll] = zt[ll];
791 pA += bs*2;
792 z += 2;
793 jj += 2;
794 }
795 else // if(ai%bs==3)
796 {
797 ll_max = m-jj<1 ? m-jj : 1;
798 for(ll=0; ll<ll_max; ll++)
799 xt[ll] = x[ll];
800 for(; ll<1; ll++)
801 xt[ll] = 0.0;
802 zt[0] = pA[0+bs*0]*xt[0];
803 pA += bs*sda - 3;
804 x += 1;
805 kernel_dgemv_t_4_lib4(m-1-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
806 ll_max = n-jj<1 ? n-jj : 1;
807 for(ll=0; ll<ll_max; ll++)
808 z[ll] = zt[ll];
809 pA += bs*1;
810 z += 1;
811 jj += 1;
812 }
813
814 }
815
816 for(; jj<n-3; jj+=4)
817 {
818 zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
819 zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
820 zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
821 zt[3] = pA[3+bs*3]*x[3];
822 pA += bs*sda;
823 x += 4;
824 kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, z);
825 pA += bs*4;
826 z += 4;
827 }
828 if(jj<n)
829 {
830 ll_max = m-jj<4 ? m-jj : 4;
831 for(ll=0; ll<ll_max; ll++)
832 xt[ll] = x[ll];
833 for(; ll<4; ll++)
834 xt[ll] = 0.0;
835 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
836 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
837 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
838 zt[3] = pA[3+bs*3]*xt[3];
839 pA += bs*sda;
840 x += 4;
841 kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
842 for(ll=0; ll<n-jj; ll++)
843 z[ll] = zt[ll];
844 // pA += bs*4;
845 // z += 4;
846 }
847
848 return;
849
850 }
851
852
853
854 // m >= n
blasfeo_dtrmv_ltu(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)855 void blasfeo_dtrmv_ltu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
856 {
857
858 if(m<=0)
859 return;
860
861 const int bs = 4;
862
863 int sda = sA->cn;
864 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
865 double *x = sx->pa + xi;
866 double *z = sz->pa + zi;
867
868 double xt[4];
869 double zt[4];
870
871 double alpha = 1.0;
872 double beta = 1.0;
873
874 int ii, jj, ll, ll_max;
875
876 jj = 0;
877
878 if(ai%bs!=0)
879 {
880
881 if(ai%bs==1)
882 {
883 ll_max = m-jj<3 ? m-jj : 3;
884 for(ll=0; ll<ll_max; ll++)
885 xt[ll] = x[ll];
886 for(; ll<3; ll++)
887 xt[ll] = 0.0;
888 zt[0] = 1.0*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
889 zt[1] = 1.0*xt[1] + pA[2+bs*1]*xt[2];
890 zt[2] = 1.0*xt[2];
891 pA += bs*sda - 1;
892 x += 3;
893 kernel_dgemv_t_4_lib4(m-3-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
894 ll_max = n-jj<3 ? n-jj : 3;
895 for(ll=0; ll<ll_max; ll++)
896 z[ll] = zt[ll];
897 pA += bs*3;
898 z += 3;
899 jj += 3;
900 }
901 else if(ai%bs==2)
902 {
903 ll_max = m-jj<2 ? m-jj : 2;
904 for(ll=0; ll<ll_max; ll++)
905 xt[ll] = x[ll];
906 for(; ll<2; ll++)
907 xt[ll] = 0.0;
908 zt[0] = 1.0*xt[0] + pA[1+bs*0]*xt[1];
909 zt[1] = 1.0*xt[1];
910 pA += bs*sda - 2;
911 x += 2;
912 kernel_dgemv_t_4_lib4(m-2-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
913 ll_max = n-jj<2 ? n-jj : 2;
914 for(ll=0; ll<ll_max; ll++)
915 z[ll] = zt[ll];
916 pA += bs*2;
917 z += 2;
918 jj += 2;
919 }
920 else // if(ai%bs==3)
921 {
922 ll_max = m-jj<1 ? m-jj : 1;
923 for(ll=0; ll<ll_max; ll++)
924 xt[ll] = x[ll];
925 for(; ll<1; ll++)
926 xt[ll] = 0.0;
927 zt[0] = 1.0*xt[0];
928 pA += bs*sda - 3;
929 x += 1;
930 kernel_dgemv_t_4_lib4(m-1-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
931 ll_max = n-jj<1 ? n-jj : 1;
932 for(ll=0; ll<ll_max; ll++)
933 z[ll] = zt[ll];
934 pA += bs*1;
935 z += 1;
936 jj += 1;
937 }
938
939 }
940
941 for(; jj<n-3; jj+=4)
942 {
943 zt[0] = 1.0*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
944 zt[1] = 1.0*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
945 zt[2] = 1.0*x[2] + pA[3+bs*2]*x[3];
946 zt[3] = 1.0*x[3];
947 pA += bs*sda;
948 x += 4;
949 kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, z);
950 pA += bs*4;
951 z += 4;
952 }
953 if(jj<n)
954 {
955 ll_max = m-jj<4 ? m-jj : 4;
956 for(ll=0; ll<ll_max; ll++)
957 xt[ll] = x[ll];
958 for(; ll<4; ll++)
959 xt[ll] = 0.0;
960 zt[0] = 1.0*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
961 zt[1] = 1.0*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
962 zt[2] = 1.0*xt[2] + pA[3+bs*2]*xt[3];
963 zt[3] = 1.0*xt[3];
964 pA += bs*sda;
965 x += 4;
966 kernel_dgemv_t_4_lib4(m-4-jj, &alpha, 0, pA, sda, x, &beta, zt, zt);
967 for(ll=0; ll<n-jj; ll++)
968 z[ll] = zt[ll];
969 // pA += bs*4;
970 // z += 4;
971 }
972
973 return;
974
975 }
976
977
978
blasfeo_dtrmv_unn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)979 void blasfeo_dtrmv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
980 {
981
982 if(m<=0)
983 return;
984
985 if(ai!=0)
986 {
987 printf("\nblasfeo_dtrmv_unn: feature not implemented yet: ai=%d\n", ai);
988 exit(1);
989 }
990
991 const int bs = 4;
992
993 int sda = sA->cn;
994 double *pA = sA->pA + aj*bs; // TODO ai
995 double *x = sx->pa + xi;
996 double *z = sz->pa + zi;
997
998 int i;
999
1000 i=0;
1001 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1002 for(; i<m-7; i+=8)
1003 {
1004 kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
1005 pA += 8*sda+8*bs;
1006 x += 8;
1007 z += 8;
1008 }
1009 #endif
1010 for(; i<m-3; i+=4)
1011 {
1012 kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
1013 pA += 4*sda+4*bs;
1014 x += 4;
1015 z += 4;
1016 }
1017 if(m>i)
1018 {
1019 if(m-i==1)
1020 {
1021 z[0] = pA[0+bs*0]*x[0];
1022 }
1023 else if(m-i==2)
1024 {
1025 z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
1026 z[1] = pA[1+bs*1]*x[1];
1027 }
1028 else // if(m-i==3)
1029 {
1030 z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
1031 z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
1032 z[2] = pA[2+bs*2]*x[2];
1033 }
1034 }
1035
1036 return;
1037
1038 }
1039
1040
1041
blasfeo_dtrmv_utn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1042 void blasfeo_dtrmv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1043 {
1044
1045 if(m<=0)
1046 return;
1047
1048 if(ai!=0)
1049 {
1050 printf("\nblasfeo_dtrmv_utn: feature not implemented yet: ai=%d\n", ai);
1051 exit(1);
1052 }
1053
1054 const int bs = 4;
1055
1056 int sda = sA->cn;
1057 double *pA = sA->pA + aj*bs; // TODO ai
1058 double *x = sx->pa + xi;
1059 double *z = sz->pa + zi;
1060
1061 int ii, idx;
1062
1063 double *ptrA;
1064
1065 ii=0;
1066 idx = m/bs*bs;
1067 if(m%bs!=0)
1068 {
1069 kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
1070 ii += m%bs;
1071 }
1072 idx -= 4;
1073 for(; ii<m; ii+=4)
1074 {
1075 kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
1076 idx -= 4;
1077 }
1078
1079 return;
1080
1081 }
1082
1083
1084
blasfeo_dtrsv_lnn_mn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1085 void blasfeo_dtrsv_lnn_mn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1086 {
1087 if(m==0 | n==0)
1088 return;
1089 #if defined(DIM_CHECK)
1090 // non-negative size
1091 if(m<0) printf("\n****** blasfeo_dtrsv_lnn_mn : m<0 : %d<0 *****\n", m);
1092 if(n<0) printf("\n****** blasfeo_dtrsv_lnn_mn : n<0 : %d<0 *****\n", n);
1093 // non-negative offset
1094 if(ai<0) printf("\n****** blasfeo_dtrsv_lnn_mn : ai<0 : %d<0 *****\n", ai);
1095 if(aj<0) printf("\n****** blasfeo_dtrsv_lnn_mn : aj<0 : %d<0 *****\n", aj);
1096 if(xi<0) printf("\n****** blasfeo_dtrsv_lnn_mn : xi<0 : %d<0 *****\n", xi);
1097 if(zi<0) printf("\n****** blasfeo_dtrsv_lnn_mn : zi<0 : %d<0 *****\n", zi);
1098 // inside matrix
1099 // A: m x k
1100 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_lnn_mn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1101 if(aj+n > sA->n) printf("\n***** blasfeo_dtrsv_lnn_mn : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
1102 // x: m
1103 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_lnn_mn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1104 // z: m
1105 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_lnn_mn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1106 #endif
1107 if(ai!=0)
1108 {
1109 printf("\nblasfeo_dtrsv_lnn_mn: feature not implemented yet: ai=%d\n", ai);
1110 exit(1);
1111 }
1112 const int bs = 4;
1113 int sda = sA->cn;
1114 double *pA = sA->pA + aj*bs; // TODO ai
1115 double *dA = sA->dA;
1116 double *x = sx->pa + xi;
1117 double *z = sz->pa + zi;
1118 int ii;
1119 if(ai==0 & aj==0)
1120 {
1121 if(sA->use_dA!=1)
1122 {
1123 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1124 for(ii=0; ii<n; ii++)
1125 dA[ii] = 1.0 / dA[ii];
1126 sA->use_dA = 1;
1127 }
1128 }
1129 else
1130 {
1131 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1132 for(ii=0; ii<n; ii++)
1133 dA[ii] = 1.0 / dA[ii];
1134 sA->use_dA = 0;
1135 }
1136 dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
1137 return;
1138 }
1139
1140
1141
blasfeo_dtrsv_ltn_mn(int m,int n,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1142 void blasfeo_dtrsv_ltn_mn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1143 {
1144 if(m==0)
1145 return;
1146 #if defined(DIM_CHECK)
1147 // non-negative size
1148 if(m<0) printf("\n****** blasfeo_dtrsv_ltn_mn : m<0 : %d<0 *****\n", m);
1149 if(n<0) printf("\n****** blasfeo_dtrsv_ltn_mn : n<0 : %d<0 *****\n", n);
1150 // non-negative offset
1151 if(ai<0) printf("\n****** blasfeo_dtrsv_ltn_mn : ai<0 : %d<0 *****\n", ai);
1152 if(aj<0) printf("\n****** blasfeo_dtrsv_ltn_mn : aj<0 : %d<0 *****\n", aj);
1153 if(xi<0) printf("\n****** blasfeo_dtrsv_ltn_mn : xi<0 : %d<0 *****\n", xi);
1154 if(zi<0) printf("\n****** blasfeo_dtrsv_ltn_mn : zi<0 : %d<0 *****\n", zi);
1155 // inside matrix
1156 // A: m x k
1157 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_ltn_mn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1158 if(aj+n > sA->n) printf("\n***** blasfeo_dtrsv_ltn_mn : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
1159 // x: m
1160 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_ltn_mn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1161 // z: m
1162 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_ltn_mn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1163 #endif
1164 if(ai!=0)
1165 {
1166 printf("\nblasfeo_dtrsv_ltn_mn: feature not implemented yet: ai=%d\n", ai);
1167 exit(1);
1168 }
1169 const int bs = 4;
1170 int sda = sA->cn;
1171 double *pA = sA->pA + aj*bs; // TODO ai
1172 double *dA = sA->dA;
1173 double *x = sx->pa + xi;
1174 double *z = sz->pa + zi;
1175 int ii;
1176 if(ai==0 & aj==0)
1177 {
1178 if(sA->use_dA!=1)
1179 {
1180 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1181 for(ii=0; ii<n; ii++)
1182 dA[ii] = 1.0 / dA[ii];
1183 sA->use_dA = 1;
1184 }
1185 }
1186 else
1187 {
1188 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
1189 for(ii=0; ii<n; ii++)
1190 dA[ii] = 1.0 / dA[ii];
1191 sA->use_dA = 0;
1192 }
1193 dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
1194 return;
1195 }
1196
1197
1198
blasfeo_dtrsv_lnn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1199 void blasfeo_dtrsv_lnn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1200 {
1201 if(m==0)
1202 return;
1203 #if defined(DIM_CHECK)
1204 // non-negative size
1205 if(m<0) printf("\n****** blasfeo_dtrsv_lnn : m<0 : %d<0 *****\n", m);
1206 // non-negative offset
1207 if(ai<0) printf("\n****** blasfeo_dtrsv_lnn : ai<0 : %d<0 *****\n", ai);
1208 if(aj<0) printf("\n****** blasfeo_dtrsv_lnn : aj<0 : %d<0 *****\n", aj);
1209 if(xi<0) printf("\n****** blasfeo_dtrsv_lnn : xi<0 : %d<0 *****\n", xi);
1210 if(zi<0) printf("\n****** blasfeo_dtrsv_lnn : zi<0 : %d<0 *****\n", zi);
1211 // inside matrix
1212 // A: m x k
1213 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_lnn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1214 if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_lnn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1215 // x: m
1216 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_lnn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1217 // z: m
1218 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_lnn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1219 #endif
1220 if(ai!=0)
1221 {
1222 printf("\nblasfeo_dtrsv_lnn: feature not implemented yet: ai=%d\n", ai);
1223 exit(1);
1224 }
1225 const int bs = 4;
1226 int sda = sA->cn;
1227 double *pA = sA->pA + aj*bs; // TODO ai
1228 double *dA = sA->dA;
1229 double *x = sx->pa + xi;
1230 double *z = sz->pa + zi;
1231 int ii;
1232 if(ai==0 & aj==0)
1233 {
1234 if(sA->use_dA!=1)
1235 {
1236 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1237 for(ii=0; ii<m; ii++)
1238 dA[ii] = 1.0 / dA[ii];
1239 sA->use_dA = 1;
1240 }
1241 }
1242 else
1243 {
1244 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1245 for(ii=0; ii<m; ii++)
1246 dA[ii] = 1.0 / dA[ii];
1247 sA->use_dA = 0;
1248 }
1249 dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
1250 return;
1251 }
1252
1253
1254
blasfeo_dtrsv_lnu(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1255 void blasfeo_dtrsv_lnu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1256 {
1257 if(m==0)
1258 return;
1259 #if defined(DIM_CHECK)
1260 // non-negative size
1261 if(m<0) printf("\n****** blasfeo_dtrsv_lnu : m<0 : %d<0 *****\n", m);
1262 // non-negative offset
1263 if(ai<0) printf("\n****** blasfeo_dtrsv_lnu : ai<0 : %d<0 *****\n", ai);
1264 if(aj<0) printf("\n****** blasfeo_dtrsv_lnu : aj<0 : %d<0 *****\n", aj);
1265 if(xi<0) printf("\n****** blasfeo_dtrsv_lnu : xi<0 : %d<0 *****\n", xi);
1266 if(zi<0) printf("\n****** blasfeo_dtrsv_lnu : zi<0 : %d<0 *****\n", zi);
1267 // inside matrix
1268 // A: m x k
1269 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_lnu : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1270 if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_lnu : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1271 // x: m
1272 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_lnu : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1273 // z: m
1274 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_lnu : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1275 #endif
1276 // printf("\n***** blasfeo_dtrsv_lnu : feature not implemented yet *****\n");
1277 // exit(1);
1278 if(ai!=0)
1279 {
1280 printf("\nblasfeo_dtrsv_lnn: feature not implemented yet: ai=%d\n", ai);
1281 exit(1);
1282 }
1283 const int bs = 4;
1284 int sda = sA->cn;
1285 double *pA = sA->pA + aj*bs; // TODO ai
1286 double *dA = sA->dA;
1287 double *x = sx->pa + xi;
1288 double *z = sz->pa + zi;
1289 int ii;
1290 if(x!=z)
1291 {
1292 for(ii=0; ii<m; ii++)
1293 z[ii] = x[ii];
1294 }
1295 ii = 0;
1296 for( ; ii<m-3; ii+=4)
1297 {
1298 kernel_dtrsv_ln_one_4_lib4(ii, &pA[ii*sda], z, &z[ii], &z[ii]);
1299 }
1300 if(ii<m)
1301 {
1302 kernel_dtrsv_ln_one_4_vs_lib4(ii, &pA[ii*sda], z, &z[ii], &z[ii], m-ii, m-ii);
1303 ii+=4;
1304 }
1305 return;
1306 }
1307
1308
1309
blasfeo_dtrsv_ltn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1310 void blasfeo_dtrsv_ltn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1311 {
1312 if(m==0)
1313 return;
1314 #if defined(DIM_CHECK)
1315 // non-negative size
1316 if(m<0) printf("\n****** blasfeo_dtrsv_ltn : m<0 : %d<0 *****\n", m);
1317 // non-negative offset
1318 if(ai<0) printf("\n****** blasfeo_dtrsv_ltn : ai<0 : %d<0 *****\n", ai);
1319 if(aj<0) printf("\n****** blasfeo_dtrsv_ltn : aj<0 : %d<0 *****\n", aj);
1320 if(xi<0) printf("\n****** blasfeo_dtrsv_ltn : xi<0 : %d<0 *****\n", xi);
1321 if(zi<0) printf("\n****** blasfeo_dtrsv_ltn : zi<0 : %d<0 *****\n", zi);
1322 // inside matrix
1323 // A: m x k
1324 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_ltn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1325 if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_ltn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1326 // x: m
1327 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_ltn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1328 // z: m
1329 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_ltn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1330 #endif
1331 if(ai!=0)
1332 {
1333 printf("\nblasfeo_dtrsv_ltn: feature not implemented yet: ai=%d\n", ai);
1334 exit(1);
1335 }
1336 const int bs = 4;
1337 int sda = sA->cn;
1338 double *pA = sA->pA + aj*bs; // TODO ai
1339 double *dA = sA->dA;
1340 double *x = sx->pa + xi;
1341 double *z = sz->pa + zi;
1342 int ii;
1343 if(ai==0 & aj==0)
1344 {
1345 if(sA->use_dA!=1)
1346 {
1347 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1348 for(ii=0; ii<m; ii++)
1349 dA[ii] = 1.0 / dA[ii];
1350 sA->use_dA = 1;
1351 }
1352 }
1353 else
1354 {
1355 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1356 for(ii=0; ii<m; ii++)
1357 dA[ii] = 1.0 / dA[ii];
1358 sA->use_dA = 0;
1359 }
1360 dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
1361 return;
1362 }
1363
1364
1365
blasfeo_dtrsv_ltu(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1366 void blasfeo_dtrsv_ltu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1367 {
1368 if(m==0)
1369 return;
1370 #if defined(DIM_CHECK)
1371 // non-negative size
1372 if(m<0) printf("\n****** blasfeo_dtrsv_ltu : m<0 : %d<0 *****\n", m);
1373 // non-negative offset
1374 if(ai<0) printf("\n****** blasfeo_dtrsv_ltu : ai<0 : %d<0 *****\n", ai);
1375 if(aj<0) printf("\n****** blasfeo_dtrsv_ltu : aj<0 : %d<0 *****\n", aj);
1376 if(xi<0) printf("\n****** blasfeo_dtrsv_ltu : xi<0 : %d<0 *****\n", xi);
1377 if(zi<0) printf("\n****** blasfeo_dtrsv_ltu : zi<0 : %d<0 *****\n", zi);
1378 // inside matrix
1379 // A: m x k
1380 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_ltu : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1381 if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_ltu : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1382 // x: m
1383 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_ltu : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1384 // z: m
1385 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_ltu : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1386 #endif
1387 // printf("\n***** blasfeo_dtrsv_ltu : feature not implemented yet *****\n");
1388 // exit(1);
1389 const int bs = 4;
1390 int sda = sA->cn;
1391 double *pA = sA->pA + aj*bs; // TODO ai
1392 double *dA = sA->dA;
1393 double *x = sx->pa + xi;
1394 double *z = sz->pa + zi;
1395 int ii;
1396 if(x!=z)
1397 for(ii=0; ii<m; ii++)
1398 z[ii] = x[ii];
1399 ii=0;
1400 if(m%4==1)
1401 {
1402 kernel_dtrsv_lt_one_1_lib4(1, &pA[m/bs*bs*sda+(m-1)*bs], sda, &z[m-1], &z[m-1], &z[m-1]);
1403 ii++;
1404 }
1405 else if(m%4==2)
1406 {
1407 kernel_dtrsv_lt_one_2_lib4(2, &pA[m/bs*bs*sda+(m-2)*bs], sda, &z[m-2], &z[m-2], &z[m-2]);
1408 ii+=2;
1409 }
1410 else if(m%4==3)
1411 {
1412 kernel_dtrsv_lt_one_3_lib4(3, &pA[m/bs*bs*sda+(m-3)*bs], sda, &z[m-3], &z[m-3], &z[m-3]);
1413 ii+=3;
1414 }
1415 for(; ii<m-3; ii+=4)
1416 {
1417 kernel_dtrsv_lt_one_4_lib4(ii+4, &pA[(m-ii-4)/bs*bs*sda+(m-ii-4)*bs], sda, &z[m-ii-4], &z[m-ii-4], &z[m-ii-4]);
1418 }
1419 return;
1420 }
1421
1422
1423
blasfeo_dtrsv_unn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1424 void blasfeo_dtrsv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1425 {
1426 if(m==0)
1427 return;
1428 #if defined(DIM_CHECK)
1429 // non-negative size
1430 if(m<0) printf("\n****** blasfeo_dtrsv_unn : m<0 : %d<0 *****\n", m);
1431 // non-negative offset
1432 if(ai<0) printf("\n****** blasfeo_dtrsv_unn : ai<0 : %d<0 *****\n", ai);
1433 if(aj<0) printf("\n****** blasfeo_dtrsv_unn : aj<0 : %d<0 *****\n", aj);
1434 if(xi<0) printf("\n****** blasfeo_dtrsv_unn : xi<0 : %d<0 *****\n", xi);
1435 if(zi<0) printf("\n****** blasfeo_dtrsv_unn : zi<0 : %d<0 *****\n", zi);
1436 // inside matrix
1437 // A: m x k
1438 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_unn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1439 if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_unn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1440 // x: m
1441 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_unn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1442 // z: m
1443 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_unn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1444 #endif
1445 // printf("\n***** blasfeo_dtrsv_unn : feature not implemented yet *****\n");
1446 // exit(1);
1447 const int bs = 4;
1448 int sda = sA->cn;
1449 double *pA = sA->pA + aj*bs; // TODO ai
1450 double *dA = sA->dA;
1451 double *x = sx->pa + xi;
1452 double *z = sz->pa + zi;
1453 int ii;
1454 if(ai==0 & aj==0)
1455 {
1456 if(sA->use_dA!=1)
1457 {
1458 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1459 for(ii=0; ii<m; ii++)
1460 dA[ii] = 1.0 / dA[ii];
1461 sA->use_dA = 1;
1462 }
1463 }
1464 else
1465 {
1466 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1467 for(ii=0; ii<m; ii++)
1468 dA[ii] = 1.0 / dA[ii];
1469 sA->use_dA = 0;
1470 }
1471 if(x!=z)
1472 {
1473 for(ii=0; ii<m; ii++)
1474 z[ii] = x[ii];
1475 }
1476 ii = 0;
1477 if(m%4==1)
1478 {
1479 z[m-ii-1] *= dA[m-ii-1];
1480 ii+=1;
1481 }
1482 else if(m%4==2)
1483 {
1484 z[m-ii-1] *= dA[m-ii-1];
1485 z[m-ii-2] -= pA[m/bs*bs*sda+(m-ii-1)*bs]*z[m-ii-1];
1486 z[m-ii-2] *= dA[m-ii-2];
1487 ii+=2;
1488 }
1489 else if(m%4==3)
1490 {
1491 z[m-ii-1] *= dA[m-ii-1];
1492 z[m-ii-2] -= pA[1+m/bs*bs*sda+(m-ii-1)*bs]*z[m-ii-1];
1493 z[m-ii-2] *= dA[m-ii-2];
1494 z[m-ii-3] -= pA[m/bs*bs*sda+(m-ii-2)*bs]*z[m-ii-2];
1495 z[m-ii-3] -= pA[m/bs*bs*sda+(m-ii-1)*bs]*z[m-ii-1];
1496 z[m-ii-3] *= dA[m-ii-3];
1497 ii+=3;
1498 }
1499 for(; ii<m-3; ii+=4)
1500 {
1501 // TODO
1502 kernel_dtrsv_un_inv_4_lib4(ii+4, &pA[(m-ii-4)/bs*bs*sda+(m-ii-4)*bs], &dA[m-ii-4], &z[m-ii-4], &z[m-ii-4], &z[m-ii-4]);
1503 }
1504 return;
1505 }
1506
1507
1508
blasfeo_dtrsv_utn(int m,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dvec * sx,int xi,struct blasfeo_dvec * sz,int zi)1509 void blasfeo_dtrsv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi)
1510 {
1511 if(m==0)
1512 return;
1513 #if defined(DIM_CHECK)
1514 // non-negative size
1515 if(m<0) printf("\n****** blasfeo_dtrsv_utn : m<0 : %d<0 *****\n", m);
1516 // non-negative offset
1517 if(ai<0) printf("\n****** blasfeo_dtrsv_utn : ai<0 : %d<0 *****\n", ai);
1518 if(aj<0) printf("\n****** blasfeo_dtrsv_utn : aj<0 : %d<0 *****\n", aj);
1519 if(xi<0) printf("\n****** blasfeo_dtrsv_utn : xi<0 : %d<0 *****\n", xi);
1520 if(zi<0) printf("\n****** blasfeo_dtrsv_utn : zi<0 : %d<0 *****\n", zi);
1521 // inside matrix
1522 // A: m x k
1523 if(ai+m > sA->m) printf("\n***** blasfeo_dtrsv_utn : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1524 if(aj+m > sA->n) printf("\n***** blasfeo_dtrsv_utn : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1525 // x: m
1526 if(xi+m > sx->m) printf("\n***** blasfeo_dtrsv_utn : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1527 // z: m
1528 if(zi+m > sz->m) printf("\n***** blasfeo_dtrsv_utn : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1529 #endif
1530 // printf("\n***** blasfeo_dtrsv_utn : feature not implemented yet *****\n");
1531 // exit(1);
1532 const int bs = 4;
1533 int sda = sA->cn;
1534 double *pA = sA->pA + aj*bs; // TODO ai
1535 double *dA = sA->dA;
1536 double *x = sx->pa + xi;
1537 double *z = sz->pa + zi;
1538 int ii;
1539 if(ai==0 & aj==0)
1540 {
1541 if(sA->use_dA!=1)
1542 {
1543 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1544 for(ii=0; ii<m; ii++)
1545 dA[ii] = 1.0 / dA[ii];
1546 sA->use_dA = 1;
1547 }
1548 }
1549 else
1550 {
1551 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
1552 for(ii=0; ii<m; ii++)
1553 dA[ii] = 1.0 / dA[ii];
1554 sA->use_dA = 0;
1555 }
1556 if(x!=z)
1557 {
1558 for(ii=0; ii<m; ii++)
1559 z[ii] = x[ii];
1560 }
1561 ii = 0;
1562 for( ; ii<m-3; ii+=4)
1563 {
1564 kernel_dtrsv_ut_inv_4_lib4(ii, &pA[ii*bs], sda, &dA[ii], z, &z[ii], &z[ii]);
1565 }
1566 if(ii<m)
1567 {
1568 kernel_dtrsv_ut_inv_4_vs_lib4(ii, &pA[ii*bs], sda, &dA[ii], z, &z[ii], &z[ii], m-ii, m-ii);
1569 ii+=4;
1570 }
1571 return;
1572 }
1573
1574
1575
1576 #else
1577
1578 #error : wrong LA choice
1579
1580 #endif
1581