1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 #include "../../include/blasfeo_s_kernel.h"
39
40
41
42 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:4] = alpha * A[0:4,0:kmax] * x[0:kmax] + beta * y[0:4]
// A is one 4-row panel in panel-major (lib4) format: column stride bs=4.
// alpha and beta are passed by pointer (BLASFEO kernel convention).
void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
	{

	const int bs = 4;

	int ii;

	float x_ii;

	float acc[4] = {0.0, 0.0, 0.0, 0.0};

	// accumulate A*x one column at a time
	for(ii=0; ii<kmax; ii++)
		{

		x_ii = x[0];

		acc[0] += A[0+bs*0] * x_ii;
		acc[1] += A[1+bs*0] * x_ii;
		acc[2] += A[2+bs*0] * x_ii;
		acc[3] += A[3+bs*0] * x_ii;

		A += bs; // next column of the panel
		x += 1;

		}

	// scale the product and add the scaled right-hand side
	z[0] = alpha[0]*acc[0] + beta[0]*y[0];
	z[1] = alpha[0]*acc[1] + beta[0]*y[1];
	z[2] = alpha[0]*acc[2] + beta[0]*y[2];
	z[3] = alpha[0]*acc[3] + beta[0]*y[3];

	return;

	}
114 #endif
115
116
117
118 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size variant of kernel_sgemv_n_4_lib4:
// z[0:m1] = alpha * A[0:4,0:kmax] * x[0:kmax] + beta * y[0:4], with only the
// first m1 (<=4) entries stored into z; y is always read for all 4 rows.
// (Removed the unused local `bs`.)
void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int m1)
	{

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	// compute the full 4-entry result into a local buffer
	kernel_sgemv_n_4_lib4(kmax, alpha, A, x, beta, y, yy);

	// store only the first m1 entries
	z[0] = yy[0];
	if(m1<2) return;
	z[1] = yy[1];
	if(m1<3) return;
	z[2] = yy[2];
	if(m1<4) return;
	z[3] = yy[3];

	return;

	}
139 #endif
140
141
142
143 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Generalized-store variant of kernel_sgemv_n_4_lib4:
// computes the full 4-entry result, then stores only entries with index in
// [m0, m1). Used to mask out rows at both ends of the panel.
// (Use logical && instead of bitwise & on the boolean conditions, and drop
// the unused local `bs`.)
void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int m0, int m1)
	{

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	// full 4-entry result into a local buffer
	kernel_sgemv_n_4_lib4(kmax, alpha, A, x, beta, y, yy);

	// store only entries with index in [m0, m1)
	if(m0<=0 && m1>0) z[0] = yy[0];
	if(m0<=1 && m1>1) z[1] = yy[1];
	if(m0<=2 && m1>2) z[2] = yy[2];
	if(m0<=3 && m1>3) z[3] = yy[3];

	return;

	}
161 #endif
162
163
164
165 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:4] = alpha * A[0:kmax,0:4]^T * x[0:kmax] + beta * y[0:4]
// A is stored in panel-major (lib4) format: rows are grouped in panels of
// bs=4, consecutive panels are sda*bs floats apart. offA (0..3) is the row
// offset of the first matrix row inside its panel.
void kernel_sgemv_t_4_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z)
	{

	const int bs = 4;

	int ii, nb;

	float w0, w1, w2, w3;

	float acc[4] = {0.0, 0.0, 0.0, 0.0};

	ii = 0;

	// head: rows up to the first panel boundary when A is not panel-aligned
	if(offA!=0) // offA in {1, 2, 3}
		{
		nb = 4-offA<kmax ? 4-offA : kmax;
		for(; ii<nb; ii++)
			{

			w0 = x[0];

			acc[0] += A[0+bs*0] * w0;
			acc[1] += A[0+bs*1] * w0;
			acc[2] += A[0+bs*2] * w0;
			acc[3] += A[0+bs*3] * w0;

			A += 1;
			x += 1;

			}
		A += bs*(sda-1); // hop over the rest of the panel to the next one
		}

	// body: full panels of 4 rows, statements grouped by output element
	// (accumulators are independent, so this regrouping leaves each
	// accumulation sequence — and hence the float result — unchanged)
	for(; ii<kmax-bs+1; ii+=bs)
		{

		w0 = x[0];
		w1 = x[1];
		w2 = x[2];
		w3 = x[3];

		acc[0] += A[0+bs*0] * w0;
		acc[0] += A[1+bs*0] * w1;
		acc[0] += A[2+bs*0] * w2;
		acc[0] += A[3+bs*0] * w3;

		acc[1] += A[0+bs*1] * w0;
		acc[1] += A[1+bs*1] * w1;
		acc[1] += A[2+bs*1] * w2;
		acc[1] += A[3+bs*1] * w3;

		acc[2] += A[0+bs*2] * w0;
		acc[2] += A[1+bs*2] * w1;
		acc[2] += A[2+bs*2] * w2;
		acc[2] += A[3+bs*2] * w3;

		acc[3] += A[0+bs*3] * w0;
		acc[3] += A[1+bs*3] * w1;
		acc[3] += A[2+bs*3] * w2;
		acc[3] += A[3+bs*3] * w3;

		A += sda*bs; // next panel
		x += 4;

		}

	// tail: at most 3 leftover rows, all inside the current panel
	for(; ii<kmax; ii++)
		{

		w0 = x[0];

		acc[0] += A[0+bs*0] * w0;
		acc[1] += A[0+bs*1] * w0;
		acc[2] += A[0+bs*2] * w0;
		acc[3] += A[0+bs*3] * w0;

		A += 1;
		x += 1;

		}

	// scale the product and add the scaled right-hand side
	z[0] = alpha[0]*acc[0] + beta[0]*y[0];
	z[1] = alpha[0]*acc[1] + beta[0]*y[1];
	z[2] = alpha[0]*acc[2] + beta[0]*y[2];
	z[3] = alpha[0]*acc[3] + beta[0]*y[3];

	return;

	}
253 #endif
254
255
256
257
258 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size variant of kernel_sgemv_t_4_lib4:
// z[0:m1] = alpha * A[0:kmax,0:4]^T * x[0:kmax] + beta * y[0:4], with only
// the first m1 (<=4) entries stored into z; y is always read for all 4
// entries. (Removed the unused local `bs`.)
void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z, int m1)
	{

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	// compute the full 4-entry result into a local buffer
	kernel_sgemv_t_4_lib4(kmax, alpha, offsetA, A, sda, x, beta, y, yy);

	// store only the first m1 entries
	z[0] = yy[0];
	if(m1<2) return;
	z[1] = yy[1];
	if(m1<3) return;
	z[2] = yy[2];
	if(m1<4) return;
	z[3] = yy[3];

	return;

	}
280 #endif
281
282
283
284
285 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size lower-triangular forward substitution:
// z[0:m1] = inv(L) * ( y[0:4] - A[0:4,0:k1]*x[0:k1] ), with k1 = kmax/4*4.
// L is the 4x4 lower triangle starting at column k1 of the panel A; its
// reciprocal diagonal entries are supplied in inv_diag_A. n1 (<=4) limits
// how many columns of the triangle are applied, m1 (<=4) how many entries
// of z are written.
void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int m1, int n1)
	{

	const int bs = 4;

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	float alpha1 = -1.0;
	float beta1 = 1.0;

	// columns handled by the gemv part (kmax rounded down to a multiple of 4)
	int k1 = kmax/bs*bs;

	// yy = y - A[0:4,0:k1] * x[0:k1]
	kernel_sgemv_n_4_lib4(k1, &alpha1, A, x, &beta1, y, yy);

	// advance to the diagonal (triangular) block
	A += k1*bs;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// column 0: scale by the inverted diagonal, eliminate from the rows below
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	yy[0] *= a_00;
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// no more factor columns: flush the remaining (partially solved) entries
	if(n1==1)
		{
		if(m1==1)
			return;
		z[1] = yy[1];
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// column 1
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	yy[1] *= a_11;
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	if(n1==2)
		{
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// column 2
	a_00 = inv_diag_A[2];
	a_10 = A[3+bs*2];
	yy[2] *= a_00;
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	if(n1==3)
		{
		if(m1==3)
			return;
		z[3] = yy[3];

		return;
		}

	// column 3
	a_11 = inv_diag_A[3];
	yy[3] *= a_11;
	z[3] = yy[3];

	return;

	}
375 #endif
376
377
378
379 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Lower-triangular forward substitution:
// z[0:4] = inv(L) * ( y[0:4] - A[0:4,0:k1]*x[0:k1] ), with k1 = kmax/4*4.
// L is the 4x4 lower triangle starting at column k1 of the panel A, and
// inv_diag_A holds the reciprocals of its diagonal entries.
void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	float alpha1 = -1.0;
	float beta1 = 1.0;

	// columns handled by the gemv part (kmax rounded down to a multiple of 4)
	int k1 = kmax/bs*bs;

	// yy = y - A[0:4,0:k1] * x[0:k1]
	kernel_sgemv_n_4_lib4(k1, &alpha1, A, x, &beta1, y, yy);

	// advance to the diagonal (triangular) block
	A += k1*bs;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// column 0: scale by the inverted diagonal, eliminate from the rows below
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	yy[0] *= a_00;
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// column 1
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	yy[1] *= a_11;
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	// column 2
	a_00 = inv_diag_A[2];
	a_10 = A[3+bs*2];
	yy[2] *= a_00;
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	// column 3
	a_11 = inv_diag_A[3];
	yy[3] *= a_11;
	z[3] = yy[3];

	return;

	}
435 #endif
436
437
438 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size unit-diagonal lower-triangular forward substitution:
// z[0:m1] = inv(L) * ( y[0:4] - A[0:4,0:k1]*x[0:k1] ), with k1 = kmax/4*4
// and L the unit-diagonal 4x4 lower triangle starting at column k1 of the
// panel A (no diagonal scaling is performed). n1 (<=4) limits how many
// columns of the triangle are applied, m1 (<=4) how many entries of z are
// written.
void kernel_strsv_ln_one_4_vs_lib4(int kmax, float *A, float *x, float *y, float *z, int m1, int n1)
	{

	const int bs = 4;

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	float alpha1 = -1.0;
	float beta1 = 1.0;

	// columns handled by the gemv part (kmax rounded down to a multiple of 4)
	int k1 = kmax/bs*bs;

	// yy = y - A[0:4,0:k1] * x[0:k1]
	kernel_sgemv_n_4_lib4(k1, &alpha1, A, x, &beta1, y, yy);

	// advance to the diagonal (triangular) block
	A += k1*bs;

	// a_00 and a_11 are unused: the diagonal is unit, so no scaling is needed
	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// column 0: unit diagonal, just eliminate from the rows below
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// no more factor columns: flush the remaining (partially solved) entries
	if(n1==1)
		{
		if(m1==1)
			return;
		z[1] = yy[1];
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// column 1
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	if(n1==2)
		{
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// column 2
	a_10 = A[3+bs*2];
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	if(n1==3)
		{
		if(m1==3)
			return;
		z[3] = yy[3];

		return;
		}

	// column 3: unit diagonal, nothing left to eliminate
	z[3] = yy[3];

	return;

	}
528 #endif
529
530
531
532 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Unit-diagonal lower-triangular forward substitution:
// z[0:4] = inv(L) * ( y[0:4] - A[0:4,0:k1]*x[0:k1] ), with k1 = kmax/4*4
// and L the unit-diagonal 4x4 lower triangle starting at column k1 of the
// panel A (no diagonal scaling is performed).
void kernel_strsv_ln_one_4_lib4(int kmax, float *A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	float alpha1 = -1.0;
	float beta1 = 1.0;

	// columns handled by the gemv part (kmax rounded down to a multiple of 4)
	int k1 = kmax/bs*bs;

	// yy = y - A[0:4,0:k1] * x[0:k1]
	kernel_sgemv_n_4_lib4(k1, &alpha1, A, x, &beta1, y, yy);

	// advance to the diagonal (triangular) block
	A += k1*bs;

	// a_00 and a_11 are unused: the diagonal is unit, so no scaling is needed
	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// column 0: unit diagonal, just eliminate from the rows below
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// column 1
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	// column 2
	a_10 = A[3+bs*2];
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	// column 3: unit diagonal, nothing left to eliminate
	z[3] = yy[3];

	return;

	}
588 #endif
589
590
591
592 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:4] = inv(L[0:4,0:4]^T) * ( y[0:4] - A[4:kmax,0:4]^T * x[4:kmax] ):
// backward substitution on the transposed 4x4 lower triangle stored at the
// top of the panel-major matrix A (panel stride sda*4); inv_diag_A holds
// the reciprocals of the diagonal entries.
void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float m_one = -1.0;
	float one = 1.0;

	float w[4] = {0, 0, 0, 0};

	// w = y - A[4:kmax,0:4]^T * x[4:kmax]; the block below the triangle
	// starts one full panel down (4 rows plus the panel jump)
	kernel_sgemv_t_4_lib4(kmax-4, &m_one, 0, A+4+(sda-1)*bs, sda, x+4, &one, y, w);

	// solve the transposed triangle bottom-up: last 2x2 triangle first
	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	w[2] -= A[3+bs*2] * w[3];
	w[2] *= inv_diag_A[2];
	z[2] = w[2];

	// eliminate the two solved unknowns from the two remaining equations
	w[1] -= A[2+bs*1]*w[2] + A[3+bs*1]*w[3];
	w[0] -= A[2+bs*0]*w[2] + A[3+bs*0]*w[3];

	// leading 2x2 triangle
	w[1] *= inv_diag_A[1];
	z[1] = w[1];

	w[0] -= A[1+bs*0] * w[1];
	w[0] *= inv_diag_A[0];
	z[0] = w[0];

	return;

	}
627 #endif
628
629
630
631 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:3] = inv(L[0:3,0:3]^T) * ( y[0:3] - A[3:kmax,0:3]^T * x[3:kmax] ):
// backward substitution on a transposed 3x3 lower triangle stored at the top
// of the panel-major matrix A (panel stride sda*4); inv_diag_A holds the
// reciprocals of the three diagonal entries.
//
// Bug fix: in the kmax<=4 branch the original advanced `x += 1` while
// advancing `A += 3`, so for kmax==4 the tail loop read x[1] instead of
// x[3] for row 3. The 2- and 1-row siblings advance A and x by the same
// count; do the same here (x += 3). Current callers only use kmax==3 mod 4,
// where the two versions behave identically, but kmax==4 is now correct too.
void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	// remember the triangle location for the substitution phase
	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 of the leading panel
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;

			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the 3 triangle rows; keep x in lockstep with A
		A += 3;
		x += 3; // was x += 1: wrong element read for kmax==4
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;

		A += 1;
		x += 1;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	// back to the triangle
	A = tA;
	x = tx;

	// bottom triangle
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;

	// top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
733 #endif
734
735
736
737 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:2] = inv(L[0:2,0:2]^T) * ( y[0:2] - A[2:kmax,0:2]^T * x[2:kmax] ):
// backward substitution on a transposed 2x2 lower triangle stored at the top
// of the panel-major matrix A (panel stride sda*4); inv_diag_A holds the
// reciprocals of the two diagonal entries.
void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int ii;

	// remember the triangle location for the substitution phase
	float *A_tri = A;

	float w0, w1, w2, w3;
	float r0 = 0, r1 = 0;

	ii = 2;
	if(kmax>4)
		{
		// rows 2 and 3 of the leading panel
		w2 = x[2];
		w3 = x[3];

		r0 -= A[2+bs*0] * w2;
		r1 -= A[2+bs*1] * w2;

		r0 -= A[3+bs*0] * w3;
		r1 -= A[3+bs*1] * w3;

		ii = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// full panels of 4 rows
		for(; ii<kmax-3; ii+=4)
			{

			w0 = x[0];
			w1 = x[1];
			w2 = x[2];
			w3 = x[3];

			r0 -= A[0+bs*0] * w0;
			r1 -= A[0+bs*1] * w0;

			r0 -= A[1+bs*0] * w1;
			r1 -= A[1+bs*1] * w1;

			r0 -= A[2+bs*0] * w2;
			r1 -= A[2+bs*1] * w2;

			r0 -= A[3+bs*0] * w3;
			r1 -= A[3+bs*1] * w3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the 2 triangle rows
		A += 2;
		x += 2;
		}
	// leftover rows, one at a time
	for(; ii<kmax; ii++)
		{

		w0 = x[0];

		r0 -= A[0+bs*0] * w0;
		r1 -= A[0+bs*1] * w0;

		A += 1;
		x += 1;

		}

	// add the right-hand side
	r0 = y[0] + r0;
	r1 = y[1] + r1;

	// back to the triangle
	A = A_tri;

	// solve the transposed 2x2 triangle bottom-up
	r1 *= inv_diag_A[1];
	z[1] = r1;

	r0 -= A[1+bs*0] * r1;
	r0 *= inv_diag_A[0];
	z[0] = r0;

	}
828 #endif
829
830
831
832 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0] = inv_diag_A[0] * ( y[0] - A[1:kmax,0]^T * x[1:kmax] ):
// single-unknown case of the transposed lower-triangular solve; A is
// panel-major (panel stride sda*4).
void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int ii;

	float w0, w1, w2, w3;
	float r0 = 0;

	ii = 1;
	if(kmax>4)
		{
		// rows 1..3 of the leading panel
		w1 = x[1];
		w2 = x[2];
		w3 = x[3];

		r0 -= A[1+bs*0] * w1;
		r0 -= A[2+bs*0] * w2;
		r0 -= A[3+bs*0] * w3;

		ii = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// full panels of 4 rows
		for(; ii<kmax-3; ii+=4)
			{

			w0 = x[0];
			w1 = x[1];
			w2 = x[2];
			w3 = x[3];

			r0 -= A[0+bs*0] * w0;
			r0 -= A[1+bs*0] * w1;
			r0 -= A[2+bs*0] * w2;
			r0 -= A[3+bs*0] * w3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the triangle row
		A += 1;
		x += 1;
		}
	// leftover rows, one at a time
	for(; ii<kmax; ii++)
		{

		w0 = x[0];

		r0 -= A[0+bs*0] * w0;

		A += 1;
		x += 1;

		}

	// add the right-hand side and divide by the diagonal entry
	r0 = y[0] + r0;
	r0 *= inv_diag_A[0];
	z[0] = r0;

	}
909 #endif
910
911
912
913 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:4] = inv(L[0:4,0:4]^T) * ( y[0:4] - A[4:kmax,0:4]^T * x[4:kmax] ):
// same as kernel_strsv_lt_inv_4_lib4 but for a unit-diagonal triangle
// (no diagonal scaling).
void kernel_strsv_lt_one_4_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	float m_one = -1.0;
	float one = 1.0;

	float w[4] = {0, 0, 0, 0};

	// w = y - A[4:kmax,0:4]^T * x[4:kmax]; the block below the triangle
	// starts one full panel down (4 rows plus the panel jump)
	kernel_sgemv_t_4_lib4(kmax-4, &m_one, 0, A+4+(sda-1)*bs, sda, x+4, &one, y, w);

	// solve the transposed unit triangle bottom-up: last 2x2 triangle first
	z[3] = w[3];

	w[2] -= A[3+bs*2] * w[3];
	z[2] = w[2];

	// eliminate the two solved unknowns from the two remaining equations
	w[1] -= A[2+bs*1]*w[2] + A[3+bs*1]*w[3];
	w[0] -= A[2+bs*0]*w[2] + A[3+bs*0]*w[3];

	// leading 2x2 triangle
	z[1] = w[1];

	w[0] -= A[1+bs*0] * w[1];
	z[0] = w[0];

	}
943 #endif
944
945
946
947 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:3] = inv(L[0:3,0:3]^T) * ( y[0:3] - A[3:kmax,0:3]^T * x[3:kmax] ):
// same as kernel_strsv_lt_inv_3_lib4 but for a unit-diagonal triangle
// (no diagonal scaling).
//
// Bug fix: in the kmax<=4 branch the original advanced `x += 1` while
// advancing `A += 3`, so for kmax==4 the tail loop read x[1] instead of
// x[3] for row 3. Advance x in lockstep with A (x += 3), matching the
// 2- and 1-row siblings. Current callers only use kmax==3 mod 4, where the
// two versions behave identically, but kmax==4 is now correct too.
void kernel_strsv_lt_one_3_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	// remember the triangle location for the substitution phase
	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 of the leading panel
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;

			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the 3 triangle rows; keep x in lockstep with A
		A += 3;
		x += 3; // was x += 1: wrong element read for kmax==4
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;

		A += 1;
		x += 1;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	// back to the triangle
	A = tA;
	x = tx;

	// bottom triangle (unit diagonal: no scaling)
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;

	// top triangle
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	z[0] = y_0;

	}
1046 #endif
1047
1048
1049
1050 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:2] = inv(L[0:2,0:2]^T) * ( y[0:2] - A[2:kmax,0:2]^T * x[2:kmax] ):
// same as kernel_strsv_lt_inv_2_lib4 but for a unit-diagonal triangle
// (no diagonal scaling).
void kernel_strsv_lt_one_2_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	int ii;

	// remember the triangle location for the substitution phase
	float *A_tri = A;

	float w0, w1, w2, w3;
	float r0 = 0, r1 = 0;

	ii = 2;
	if(kmax>4)
		{
		// rows 2 and 3 of the leading panel
		w2 = x[2];
		w3 = x[3];

		r0 -= A[2+bs*0] * w2;
		r1 -= A[2+bs*1] * w2;

		r0 -= A[3+bs*0] * w3;
		r1 -= A[3+bs*1] * w3;

		ii = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// full panels of 4 rows
		for(; ii<kmax-3; ii+=4)
			{

			w0 = x[0];
			w1 = x[1];
			w2 = x[2];
			w3 = x[3];

			r0 -= A[0+bs*0] * w0;
			r1 -= A[0+bs*1] * w0;

			r0 -= A[1+bs*0] * w1;
			r1 -= A[1+bs*1] * w1;

			r0 -= A[2+bs*0] * w2;
			r1 -= A[2+bs*1] * w2;

			r0 -= A[3+bs*0] * w3;
			r1 -= A[3+bs*1] * w3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the 2 triangle rows
		A += 2;
		x += 2;
		}
	// leftover rows, one at a time
	for(; ii<kmax; ii++)
		{

		w0 = x[0];

		r0 -= A[0+bs*0] * w0;
		r1 -= A[0+bs*1] * w0;

		A += 1;
		x += 1;

		}

	// add the right-hand side
	r0 = y[0] + r0;
	r1 = y[1] + r1;

	// back to the triangle
	A = A_tri;

	// solve the transposed unit 2x2 triangle bottom-up (no scaling)
	z[1] = r1;

	r0 -= A[1+bs*0] * r1;
	z[0] = r0;

	}
1139 #endif
1140
1141
1142
1143 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0] = y[0] - A[1:kmax,0]^T * x[1:kmax]:
// single-unknown case of the transposed unit-diagonal lower-triangular
// solve; A is panel-major (panel stride sda*4).
void kernel_strsv_lt_one_1_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	int ii;

	float w0, w1, w2, w3;
	float r0 = 0;

	ii = 1;
	if(kmax>4)
		{
		// rows 1..3 of the leading panel
		w1 = x[1];
		w2 = x[2];
		w3 = x[3];

		r0 -= A[1+bs*0] * w1;
		r0 -= A[2+bs*0] * w2;
		r0 -= A[3+bs*0] * w3;

		ii = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// full panels of 4 rows
		for(; ii<kmax-3; ii+=4)
			{

			w0 = x[0];
			w1 = x[1];
			w2 = x[2];
			w3 = x[3];

			r0 -= A[0+bs*0] * w0;
			r0 -= A[1+bs*0] * w1;
			r0 -= A[2+bs*0] * w2;
			r0 -= A[3+bs*0] * w3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the triangle row
		A += 1;
		x += 1;
		}
	// leftover rows, one at a time
	for(; ii<kmax; ii++)
		{

		w0 = x[0];

		r0 -= A[0+bs*0] * w0;

		A += 1;
		x += 1;

		}

	// add the right-hand side (unit diagonal: no scaling)
	r0 = y[0] + r0;
	z[0] = r0;

	}
1219 #endif
1220
1221
1222
1223 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// z[0:4] = inv(U) * ( y[0:4] - A[0:4,4:kmax]*x[4:kmax] ): backward
// substitution with the 4x4 upper triangle stored at the top-left corner of
// the panel A; inv_diag_A holds the reciprocals of the diagonal entries.
void kernel_strsv_un_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float m_one = -1.0;
	float one = 1.0;

	float w[4] = {0, 0, 0, 0};

	// w = y - A[0:4,4:kmax] * x[4:kmax] (columns to the right of the triangle)
	kernel_sgemv_n_4_lib4(kmax-4, &m_one, A+4*bs, x+4, &one, y, w);

	// solve the upper triangle bottom-up: last 2x2 triangle first
	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	w[2] -= A[2+bs*3] * w[3];
	w[2] *= inv_diag_A[2];
	z[2] = w[2];

	// eliminate the two solved unknowns from the two remaining equations
	w[1] -= A[1+bs*2]*w[2] + A[1+bs*3]*w[3];
	w[0] -= A[0+bs*2]*w[2] + A[0+bs*3]*w[3];

	// leading 2x2 triangle
	w[1] *= inv_diag_A[1];
	z[1] = w[1];

	w[0] -= A[0+bs*1] * w[1];
	w[0] *= inv_diag_A[0];
	z[0] = w[0];

	}
1257 #endif
1258
1259
1260
1261 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Forward substitution for an upper-triangular, transposed (ut) solve,
// variable-size variant: computes yy = y - A[0:k1,:]^T * x[0:k1], then solves
// the trailing 4x4 triangle (at row offset k1 of the column panel A, panel
// stride sda, bs=4) using the reciprocal diagonal in inv_diag_A.
// n1 is the number of factorized columns (solve steps to run), m1 the number
// of entries of z to store; early returns stop both as soon as allowed.
void kernel_strsv_ut_inv_4_vs_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int m1, int n1)
	{

	const int bs = 4;

	float yy[4] = {0, 0, 0, 0};

	// round kmax down to a multiple of bs: the gemv handles the full panels,
	// the remainder triangle is solved explicitly below
	int k1 = kmax/bs*bs;
	float alpha = -1.0;
	float beta = 1.0;

	// yy = y - A[0:k1,:]^T * x[0:k1]
	kernel_sgemv_t_4_lib4(k1, &alpha, 0, A, sda, x, &beta, y, yy);

	// advance A to the 4x4 triangular block
	A += sda*k1;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// a_00: solve component 0, eliminate it from components 1..3
	a_00 = inv_diag_A[0];
	a_10 = A[0+bs*1];
	a_20 = A[0+bs*2];
	a_30 = A[0+bs*3];
	yy[0] *= a_00;
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// only 1 column factorized: store the remaining (unsolved) residuals
	// that fit in m1 and stop
	if(n1==1)
		{
		if(m1==1)
			return;
		z[1] = yy[1];
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// a_11: solve component 1, eliminate it from components 2..3
	a_11 = inv_diag_A[1];
	a_21 = A[1+bs*2];
	a_31 = A[1+bs*3];
	yy[1] *= a_11;
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	if(n1==2)
		{
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// a_22: solve component 2, eliminate it from component 3
	// (register names a_00/a_10 are reused for the second pair of columns)
	a_00 = inv_diag_A[2];
	a_10 = A[2+bs*3];
	yy[2] *= a_00;
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	if(n1==3)
		{
		if(m1==3)
			return;
		z[3] = yy[3];

		return;
		}

	// a_33: solve the last component
	a_11 = inv_diag_A[3];
	yy[3] *= a_11;
	z[3] = yy[3];

	return;

	}
1350 #endif
1351
1352
1353
1354 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Forward substitution for an upper-triangular, transposed (ut) solve:
// yy = y - A[0:k1,:]^T * x[0:k1], then z = inv(U^T) * yy on the trailing 4x4
// triangle at row offset k1 of the column panel A (panel stride sda, bs=4).
// inv_diag_A holds the reciprocals of the diagonal.
void kernel_strsv_ut_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float tmp[4] = {0, 0, 0, 0};

	// full panels handled by the gemv; remainder triangle solved below
	int kk = kmax/bs*bs;
	float m_one = -1.0;
	float one = 1.0;

	// tmp = y - A[0:kk,:]^T * x[0:kk]
	kernel_sgemv_t_4_lib4(kk, &m_one, 0, A, sda, x, &one, y, tmp);

	// point A at the 4x4 triangular block
	A += sda*kk;

	// forward substitution: solve component jj, then eliminate it from the
	// components below (A^T is lower triangular, so A[jj+bs*ii] with ii>jj)
	int ii, jj;
	for(jj=0; jj<4; jj++)
		{
		tmp[jj] *= inv_diag_A[jj];
		z[jj] = tmp[jj];
		for(ii=jj+1; ii<4; ii++)
			tmp[ii] -= A[jj+bs*ii] * tmp[jj];
		}

	return;

	}
1409 #endif
1410
1411
1412
1413 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Upper-triangular, non-transposed (un) matrix-vector product:
// z[0:4] = U * x[0:4] + A[:,4:kmax] * x[4:kmax], where the leading 4x4 of A
// is the upper triangle U (lib4 panel format, bs=4).
void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
	{

	const int bs = 4;

	float tmp[4] = {0, 0, 0, 0};

	// 4x4 upper-triangular head: only rows ii<=jj contribute
	int ii, jj;
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<=jj; ii++)
			tmp[ii] += A[ii+bs*jj] * x[jj];
		}

	float one = 1.0;

	// z = tmp + A[:, 4:kmax] * x[4:kmax] (rectangular tail of the row panel)
	kernel_sgemv_n_4_lib4(kmax-4, &one, A+4*bs, x+4, &one, tmp, z);

	return;

	}
1456 #endif
1457
1458
1459
1460 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Upper-triangular, transposed (ut) matrix-vector product, variable-size
// store: z[0:m1] = (U^T * x)[0:m1], with the rectangular part of the column
// panel A (stride sda, bs=4) handled by the gemv and the trailing 4x4
// triangle at row offset k1 accumulated explicitly.
void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int m1)
	{

	const int bs = 4;

	float tmp[4] = {0, 0, 0, 0};

	int kk = kmax/bs*bs;
	float one = 1.0;

	// tmp = A[0:kk,:]^T * x[0:kk]
	kernel_sgemv_t_4_lib4(kk, &one, 0, A, sda, x, &one, tmp, tmp);

	// advance to the 4x4 triangular block
	A += kk*sda;
	x += kk;

	// A^T of an upper triangle is lower triangular: rows jj<=ii contribute
	int ii, jj;
	for(jj=0; jj<4; jj++)
		{
		for(ii=jj; ii<4; ii++)
			tmp[ii] += A[jj+bs*ii] * x[jj];
		}

	// partial store: always z[0], then one more entry per unit of m1
	z[0] = tmp[0];
	if(m1>1)
		z[1] = tmp[1];
	if(m1>2)
		z[2] = tmp[2];
	if(m1>3)
		z[3] = tmp[3];

	return;

	}
1531 #endif
1532
1533
1534
1535 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Upper-triangular, transposed (ut) matrix-vector product:
// z[0:4] = U^T * x, with the rectangular part of the column panel A
// (stride sda, bs=4) handled by the gemv and the trailing 4x4 triangle at
// row offset k1 accumulated explicitly.
void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
	{

	const int bs = 4;

	float tmp[4] = {0, 0, 0, 0};

	int kk = kmax/bs*bs;
	float one = 1.0;

	// tmp = A[0:kk,:]^T * x[0:kk]
	kernel_sgemv_t_4_lib4(kk, &one, 0, A, sda, x, &one, tmp, tmp);

	// advance to the 4x4 triangular block
	A += kk*sda;
	x += kk;

	// A^T of an upper triangle is lower triangular: rows jj<=ii contribute
	int ii, jj;
	for(jj=0; jj<4; jj++)
		{
		for(ii=jj; ii<4; ii++)
			tmp[ii] += A[jj+bs*ii] * x[jj];
		}

	z[0] = tmp[0];
	z[1] = tmp[1];
	z[2] = tmp[2];
	z[3] = tmp[3];

	return;

	}
1590 #endif
1591
1592
1593