1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 
38 #include "../../include/blasfeo_s_kernel.h"
39 
40 
41 
42 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Compute z[0:4] = alpha*A*x + beta*y[0:4], where A is a 4 x kmax block
// stored in panel-major format with panel height bs=4 (column j lives at
// A[0:4 + bs*j]).
void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
	{

	const int bs = 4;

	int jj;

	float acc[4] = {0.0, 0.0, 0.0, 0.0};

	// acc = A * x, one panel column per iteration (columns are accumulated
	// in index order, matching the unrolled reference implementation)
	for(jj=0; jj<kmax; jj++)
		{
		float x_j = x[jj];
		acc[0] += A[0+bs*jj] * x_j;
		acc[1] += A[1+bs*jj] * x_j;
		acc[2] += A[2+bs*jj] * x_j;
		acc[3] += A[3+bs*jj] * x_j;
		}

	// z = alpha*acc + beta*y
	z[0] = alpha[0]*acc[0] + beta[0]*y[0];
	z[1] = alpha[0]*acc[1] + beta[0]*y[1];
	z[2] = alpha[0]*acc[2] + beta[0]*y[2];
	z[3] = alpha[0]*acc[3] + beta[0]*y[3];

	return;

	}
114 #endif
115 
116 
117 
118 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size variant of kernel_sgemv_n_4_lib4: compute the full
// 4-vector result into a scratch buffer, then store only the first m1
// entries of z (z[0] is always written, as in the fixed-size kernel).
void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int m1)
	{

	float tmp[4] = {0.0, 0.0, 0.0, 0.0};

	int ii;

	kernel_sgemv_n_4_lib4(kmax, alpha, A, x, beta, y, tmp);

	z[0] = tmp[0];
	for(ii=1; ii<4 && ii<m1; ii++)
		{
		z[ii] = tmp[ii];
		}

	return;

	}
139 #endif
140 
141 
142 
143 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Generalized variant of kernel_sgemv_n_4_lib4: compute the full 4-vector
// result, then store only the entries ii with m0 <= ii < m1 (row masking
// for blocks at the matrix border).
void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int m0, int m1)
	{

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	kernel_sgemv_n_4_lib4(kmax, alpha, A, x, beta, y, yy);

	// masked store; use logical && (the original bitwise & on relational
	// operands gives the same result for 0/1 values, but obscures intent)
	if(m0<=0 && m1>0) z[0] = yy[0];
	if(m0<=1 && m1>1) z[1] = yy[1];
	if(m0<=2 && m1>2) z[2] = yy[2];
	if(m0<=3 && m1>3) z[3] = yy[3];

	return;

	}
161 #endif
162 
163 
164 
165 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Compute z[0:4] = alpha*A'*x + beta*y[0:4], where A is a kmax x 4 block
// stored in panel-major format (panel height bs=4, panel stride sda).
// offA in {0,1,2,3} is the row offset of the first element inside its panel.
void kernel_sgemv_t_4_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z)
	{

	const int bs = 4;

	int k = 0;

	float acc[4] = {0.0, 0.0, 0.0, 0.0};

	// head: finish the partially-filled first panel
	if(offA!=0) // 1, 2, 3
		{
		int kend = 4-offA<kmax ? 4-offA : kmax;
		for(; k<kend; k++)
			{
			float x_0 = x[0];
			acc[0] += A[0+bs*0] * x_0;
			acc[1] += A[0+bs*1] * x_0;
			acc[2] += A[0+bs*2] * x_0;
			acc[3] += A[0+bs*3] * x_0;
			A += 1;
			x += 1;
			}
		A += bs*(sda-1); // jump to the next panel
		}

	// main: full panels of 4 rows
	for(; k+bs<=kmax; k+=bs)
		{
		int r;
		for(r=0; r<4; r++)
			{
			float x_r = x[r];
			acc[0] += A[r+bs*0] * x_r;
			acc[1] += A[r+bs*1] * x_r;
			acc[2] += A[r+bs*2] * x_r;
			acc[3] += A[r+bs*3] * x_r;
			}
		A += sda*bs;
		x += 4;
		}

	// tail: remaining rows, one at a time
	for(; k<kmax; k++)
		{
		float x_0 = x[0];
		acc[0] += A[0+bs*0] * x_0;
		acc[1] += A[0+bs*1] * x_0;
		acc[2] += A[0+bs*2] * x_0;
		acc[3] += A[0+bs*3] * x_0;
		A += 1;
		x += 1;
		}

	z[0] = alpha[0]*acc[0] + beta[0]*y[0];
	z[1] = alpha[0]*acc[1] + beta[0]*y[1];
	z[2] = alpha[0]*acc[2] + beta[0]*y[2];
	z[3] = alpha[0]*acc[3] + beta[0]*y[3];

	return;

	}
253 #endif
254 
255 
256 
257 
258 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size variant of kernel_sgemv_t_4_lib4: compute the full
// 4-vector result into a scratch buffer, then store only the first m1
// entries of z (z[0] is always written, as in the fixed-size kernel).
void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z, int m1)
	{

	float tmp[4] = {0.0, 0.0, 0.0, 0.0};

	int ii;

	kernel_sgemv_t_4_lib4(kmax, alpha, offsetA, A, sda, x, beta, y, tmp);

	z[0] = tmp[0];
	for(ii=1; ii<4 && ii<m1; ii++)
		{
		z[ii] = tmp[ii];
		}

	return;

	}
280 #endif
281 
282 
283 
284 
285 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size lower-triangular solve (forward substitution) with
// pre-inverted diagonal: z[0:m1] = solution of L*z = y - A[0:4,0:kmax']*x,
// where only the first n1 columns of the trailing 4x4 triangle are
// factorized and only the first m1 entries of z are stored.
// NOTE(review): n1/m1 semantics inferred from the store pattern below;
// confirm against the lib-level strsv driver.
void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int m1, int n1)
	{

	const int bs = 4;

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	float alpha1 = -1.0;
	float beta1  = 1.0;

	// process only the full multiple-of-4 part of the columns with a gemv:
	// yy = y - A[:, 0:k1] * x[0:k1]
	int k1 = kmax/bs*bs;

	kernel_sgemv_n_4_lib4(k1, &alpha1, A, x, &beta1, y, yy);

	// A now points to the 4x4 lower-triangular block on the diagonal
	A += k1*bs;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// a_00: divide by the diagonal (multiply by its stored inverse),
	// then eliminate column 0 from the remaining equations
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	yy[0] *= a_00;
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// only 1 column factorized: remaining entries are stored un-divided
	if(n1==1)
		{
		if(m1==1)
			return;
		z[1] = yy[1];
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// a_11
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	yy[1] *= a_11;
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	if(n1==2)
		{
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// a_22 (registers a_00/a_10 are reused for columns 2/3)
	a_00 = inv_diag_A[2];
	a_10 = A[3+bs*2];
	yy[2] *= a_00;
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	if(n1==3)
		{
		if(m1==3)
			return;
		z[3] = yy[3];

		return;
		}

	// a_33
	a_11 = inv_diag_A[3];
	yy[3] *= a_11;
	z[3] = yy[3];

	return;

	}
375 #endif
376 
377 
378 
379 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Lower-triangular solve (forward substitution) with pre-inverted diagonal:
// z[0:4] = solution of L*z = y - A[0:4,0:k1]*x, where k1 is kmax rounded
// down to a multiple of bs=4 and L is the trailing 4x4 triangle.
void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float w[4] = {0.0, 0.0, 0.0, 0.0};

	float m_one = -1.0;
	float one = 1.0;

	int kk = kmax/bs*bs;

	// w = y - A[:, 0:kk] * x[0:kk]  (full panels of columns only)
	kernel_sgemv_n_4_lib4(kk, &m_one, A, x, &one, y, w);

	// advance A to the 4x4 lower-triangular diagonal block
	A += kk*bs;

	// forward substitution, one column at a time; the diagonal is applied
	// as a multiplication by its pre-computed inverse

	w[0] *= inv_diag_A[0];
	z[0] = w[0];
	w[1] -= A[1+bs*0] * w[0];
	w[2] -= A[2+bs*0] * w[0];
	w[3] -= A[3+bs*0] * w[0];

	w[1] *= inv_diag_A[1];
	z[1] = w[1];
	w[2] -= A[2+bs*1] * w[1];
	w[3] -= A[3+bs*1] * w[1];

	w[2] *= inv_diag_A[2];
	z[2] = w[2];
	w[3] -= A[3+bs*2] * w[2];

	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	return;

	}
435 #endif
436 
437 
438 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size unit-diagonal lower-triangular solve (forward
// substitution): like kernel_strsv_ln_inv_4_vs_lib4 but the diagonal of L
// is implicitly 1, so no division/inverse-multiply is performed.
// n1 = number of factorized columns, m1 = number of entries of z stored.
void kernel_strsv_ln_one_4_vs_lib4(int kmax, float *A, float *x, float *y, float *z, int m1, int n1)
	{

	const int bs = 4;

	float yy[4] = {0.0, 0.0, 0.0, 0.0};

	float alpha1 = -1.0;
	float beta1  = 1.0;

	// process only the full multiple-of-4 part of the columns with a gemv:
	// yy = y - A[:, 0:k1] * x[0:k1]
	int k1 = kmax/bs*bs;

	kernel_sgemv_n_4_lib4(k1, &alpha1, A, x, &beta1, y, yy);

	// A now points to the 4x4 unit-lower-triangular block on the diagonal
	A += k1*bs;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// a_00: unit diagonal, so the divide step is a no-op (kept commented
	// out to mirror the _inv_ kernel)
//	a_00 = 1.0;
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
//	yy[0] *= a_00;
	z[0] = yy[0];
	yy[1] -= a_10 * yy[0];
	yy[2] -= a_20 * yy[0];
	yy[3] -= a_30 * yy[0];

	// only 1 column factorized: remaining entries are stored as-is
	if(n1==1)
		{
		if(m1==1)
			return;
		z[1] = yy[1];
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// a_11
//	a_11 = 1.0;
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
//	yy[1] *= a_11;
	z[1] = yy[1];
	yy[2] -= a_21 * yy[1];
	yy[3] -= a_31 * yy[1];

	if(n1==2)
		{
		if(m1==2)
			return;
		z[2] = yy[2];
		if(m1==3)
			return;
		z[3] = yy[3];
		return;
		}

	// a_22
//	a_00 = 1.0;
	a_10 = A[3+bs*2];
//	yy[2] *= a_00;
	z[2] = yy[2];
	yy[3] -= a_10 * yy[2];

	if(n1==3)
		{
		if(m1==3)
			return;
		z[3] = yy[3];

		return;
		}

	// a_33
//	a_11 = 1.0;
//	yy[3] *= a_11;
	z[3] = yy[3];

	return;

	}
528 #endif
529 
530 
531 
532 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Unit-diagonal lower-triangular solve (forward substitution):
// z[0:4] = solution of L*z = y - A[0:4,0:k1]*x, where the diagonal of L is
// implicitly 1 (no divide step) and k1 is kmax rounded down to a multiple
// of bs=4.
void kernel_strsv_ln_one_4_lib4(int kmax, float *A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float w[4] = {0.0, 0.0, 0.0, 0.0};

	float m_one = -1.0;
	float one = 1.0;

	int kk = kmax/bs*bs;

	// w = y - A[:, 0:kk] * x[0:kk]  (full panels of columns only)
	kernel_sgemv_n_4_lib4(kk, &m_one, A, x, &one, y, w);

	// advance A to the 4x4 unit-lower-triangular diagonal block
	A += kk*bs;

	// forward substitution, one column at a time; unit diagonal, so each
	// solved entry is stored directly

	z[0] = w[0];
	w[1] -= A[1+bs*0] * w[0];
	w[2] -= A[2+bs*0] * w[0];
	w[3] -= A[3+bs*0] * w[0];

	z[1] = w[1];
	w[2] -= A[2+bs*1] * w[1];
	w[3] -= A[3+bs*1] * w[1];

	z[2] = w[2];
	w[3] -= A[3+bs*2] * w[2];

	z[3] = w[3];

	return;

	}
588 #endif
589 
590 
591 
592 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed lower-triangular solve (backward substitution) with
// pre-inverted diagonal: z[0:4] = inv(L[0:4,0:4]') * (y - L[4:kmax,0:4]'*x[4:kmax]),
// where L is stored panel-major (bs=4, panel stride sda).
void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	float m_one = -1.0;
	float one = 1.0;

	// w = y - A[4:kmax, 0:4]' * x[4:kmax]
	kernel_sgemv_t_4_lib4(kmax-4, &m_one, 0, A+4+(sda-1)*bs, sda, x+4, &one, y, w);

	// bottom triangle of the 4x4 block
	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	w[2] -= A[3+bs*2] * w[3];
	w[2] *= inv_diag_A[2];
	z[2] = w[2];

	// square sub-block: eliminate w[2], w[3] from rows 0 and 1
	w[0] -= A[2+bs*0]*w[2] + A[3+bs*0]*w[3];
	w[1] -= A[2+bs*1]*w[2] + A[3+bs*1]*w[3];

	// top triangle
	w[1] *= inv_diag_A[1];
	z[1] = w[1];

	w[0] -= A[1+bs*0] * w[1];
	w[0] *= inv_diag_A[0];
	z[0] = w[0];

	return;

	}
627 #endif
628 
629 
630 
631 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed lower-triangular solve for 3 unknowns with pre-inverted
// diagonal: z[0:3] = inv(L[0:3,0:3]') * (y[0:3] - L[3:kmax,0:3]'*x[3:kmax]),
// where L is stored panel-major (bs=4, panel stride sda).
void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	// keep the panel origin for the final triangular solve
	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 of the first panel
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// main loop: full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;

			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the 3x3 triangle: align A to row 3 and x to x[3]
		A += 3;
		x += 3; // bug fix: was x += 1, which made the kmax==4 cleanup read x[1] instead of x[3]
		}
	// tail: one row at a time (remaining rows stay within the current panel)
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	A = tA;
	x = tx;

	// bottom triangle
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;

	// top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
733 #endif
734 
735 
736 
737 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed lower-triangular solve for 2 unknowns with pre-inverted
// diagonal: z[0:2] = inv(L[0:2,0:2]') * (y[0:2] - L[2:kmax,0:2]'*x[2:kmax]),
// where L is stored panel-major (bs=4, panel stride sda).
void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int k;

	// keep the panel origin for the final triangular solve
	float *A0 = A;

	float acc_0 = 0;
	float acc_1 = 0;

	k = 2;
	if(kmax>4)
		{
		// head: rows 2 and 3 of the first panel
		float x_2 = x[2];
		float x_3 = x[3];

		acc_0 -= A[2+bs*0] * x_2;
		acc_1 -= A[2+bs*1] * x_2;

		acc_0 -= A[3+bs*0] * x_3;
		acc_1 -= A[3+bs*1] * x_3;

		k = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// main: full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{
			int r;
			for(r=0; r<4; r++)
				{
				float x_r = x[r];
				acc_0 -= A[r+bs*0] * x_r;
				acc_1 -= A[r+bs*1] * x_r;
				}
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// skip the 2x2 triangle
		A += 2;
		x += 2;
		}
	// tail: one row at a time (stays within the current panel)
	for(; k<kmax; k++)
		{
		float x_0 = x[0];
		acc_0 -= A[0+bs*0] * x_0;
		acc_1 -= A[0+bs*1] * x_0;
		A += 1;
		x += 1;
		}

	acc_0 += y[0];
	acc_1 += y[1];

	A = A0;

	// 2x2 transposed-lower triangular solve
	acc_1 *= inv_diag_A[1];
	z[1] = acc_1;

	acc_0 -= A[1+bs*0] * acc_1;
	acc_0 *= inv_diag_A[0];
	z[0] = acc_0;

	}
828 #endif
829 
830 
831 
832 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed lower-triangular solve for 1 unknown with pre-inverted
// diagonal: z[0] = inv_diag_A[0] * (y[0] - L[1:kmax,0]'*x[1:kmax]),
// where L is stored panel-major (bs=4, panel stride sda).
void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int k;

	float acc = 0;

	k = 1;
	if(kmax>4)
		{
		// head: rows 1..3 of the first panel
		float x_1 = x[1];
		float x_2 = x[2];
		float x_3 = x[3];

		acc -= A[1+bs*0] * x_1;
		acc -= A[2+bs*0] * x_2;
		acc -= A[3+bs*0] * x_3;

		k = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// main: full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{
			int r;
			for(r=0; r<4; r++)
				{
				acc -= A[r+bs*0] * x[r];
				}
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// skip the diagonal element
		A += 1;
		x += 1;
		}
	// tail: one row at a time (stays within the current panel)
	for(; k<kmax; k++)
		{
		acc -= A[0+bs*0] * x[0];
		A += 1;
		x += 1;
		}

	acc += y[0];

	// apply the inverted diagonal
	acc *= inv_diag_A[0];
	z[0] = acc;

	}
909 #endif
910 
911 
912 
913 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed unit-diagonal lower-triangular solve:
// z[0:4] = L[0:4,0:4]'^{-1} * (y - L[4:kmax,0:4]'*x[4:kmax]) with implicit
// unit diagonal (no divide step); L is panel-major (bs=4, stride sda).
void kernel_strsv_lt_one_4_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	float m_one = -1.0;
	float one = 1.0;

	// w = y - A[4:kmax, 0:4]' * x[4:kmax]
	kernel_sgemv_t_4_lib4(kmax-4, &m_one, 0, A+4+(sda-1)*bs, sda, x+4, &one, y, w);

	// bottom triangle (unit diagonal: store directly)
	z[3] = w[3];

	w[2] -= A[3+bs*2] * w[3];
	z[2] = w[2];

	// square sub-block: eliminate w[2], w[3] from rows 0 and 1
	w[0] -= A[2+bs*0]*w[2] + A[3+bs*0]*w[3];
	w[1] -= A[2+bs*1]*w[2] + A[3+bs*1]*w[3];

	// top triangle
	z[1] = w[1];

	w[0] -= A[1+bs*0] * w[1];
	z[0] = w[0];

	}
943 #endif
944 
945 
946 
947 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed unit-diagonal lower-triangular solve for 3 unknowns:
// z[0:3] = L[0:3,0:3]'^{-1} * (y[0:3] - L[3:kmax,0:3]'*x[3:kmax]) with
// implicit unit diagonal; L is stored panel-major (bs=4, panel stride sda).
void kernel_strsv_lt_one_3_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	// keep the panel origin for the final triangular solve
	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 of the first panel
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// main loop: full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;

			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// skip the 3x3 triangle: align A to row 3 and x to x[3]
		A += 3;
		x += 3; // bug fix: was x += 1, which made the kmax==4 cleanup read x[1] instead of x[3]
		}
	// tail: one row at a time (remaining rows stay within the current panel)
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	A = tA;
	x = tx;

	// bottom triangle (unit diagonal: store directly)
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;

	// top triangle
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	z[0] = y_0;

	}
1046 #endif
1047 
1048 
1049 
1050 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed unit-diagonal lower-triangular solve for 2 unknowns:
// z[0:2] = L[0:2,0:2]'^{-1} * (y[0:2] - L[2:kmax,0:2]'*x[2:kmax]) with
// implicit unit diagonal; L is stored panel-major (bs=4, panel stride sda).
void kernel_strsv_lt_one_2_lib4(int kmax, float *A, int sda, float *x, float *y, float *z)
	{

	const int bs = 4;

	int k;

	// keep the panel origin for the final triangular solve
	float *A0 = A;

	float acc_0 = 0;
	float acc_1 = 0;

	k = 2;
	if(kmax>4)
		{
		// head: rows 2 and 3 of the first panel
		float x_2 = x[2];
		float x_3 = x[3];

		acc_0 -= A[2+bs*0] * x_2;
		acc_1 -= A[2+bs*1] * x_2;

		acc_0 -= A[3+bs*0] * x_3;
		acc_1 -= A[3+bs*1] * x_3;

		k = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// main: full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{
			int r;
			for(r=0; r<4; r++)
				{
				float x_r = x[r];
				acc_0 -= A[r+bs*0] * x_r;
				acc_1 -= A[r+bs*1] * x_r;
				}
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// skip the 2x2 triangle
		A += 2;
		x += 2;
		}
	// tail: one row at a time (stays within the current panel)
	for(; k<kmax; k++)
		{
		float x_0 = x[0];
		acc_0 -= A[0+bs*0] * x_0;
		acc_1 -= A[0+bs*1] * x_0;
		A += 1;
		x += 1;
		}

	acc_0 += y[0];
	acc_1 += y[1];

	A = A0;

	// 2x2 transposed solve, unit diagonal: store directly
	z[1] = acc_1;

	acc_0 -= A[1+bs*0] * acc_1;
	z[0] = acc_0;

	}
1139 #endif
1140 
1141 
1142 
1143 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Backward substitution for a single unknown of a transposed, unit-diagonal
// lower-triangular system stored in 4-wide panels (lib4 format):
//   z[0] = y[0] - A(1:kmax, 0)^T * x(1:kmax)
// The diagonal is unit, so no triangle solve is needed at the end.
//
// kmax : number of rows of A involved in the update
// A    : points at the top panel of the column (panel-major, bs=4)
// sda  : panel stride of A (in panels)
// x    : already-computed solution entries; entries 1..kmax-1 are read
// y    : right-hand side, 1 entry
// z    : solution output, 1 entry
void kernel_strsv_lt_one_1_lib4(int kmax, float *A, int sda, float *x, float *y, float *z) {

	const int bs = 4;

	float acc = 0.0f;

	int k = 1;
	if (kmax > 4) {
		// head: rows 1..3 of the first panel
		acc -= A[1+bs*0] * x[1];
		acc -= A[2+bs*0] * x[2];
		acc -= A[3+bs*0] * x[3];
		k = 4;
		A += 4 + (sda-1)*bs; // jump to the next panel
		x += 4;
		// main: whole panels of 4 rows at a time
		while (k < kmax-3) {
			acc -= A[0+bs*0] * x[0];
			acc -= A[1+bs*0] * x[1];
			acc -= A[2+bs*0] * x[2];
			acc -= A[3+bs*0] * x[3];
			A += sda*bs;
			x += 4;
			k += 4;
		}
	}
	else {
		A += 1;
		x += 1;
	}
	// tail: leftover rows one at a time
	while (k < kmax) {
		acc -= A[0+bs*0] * x[0];
		A += 1;
		x += 1;
		k += 1;
	}

	// unit diagonal: the result is just the updated right-hand side
	z[0] = y[0] + acc;

}
1219 #endif
1220 
1221 
1222 
1223 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Solve a 4-unknown upper-triangular (non-transposed) system with
// pre-inverted diagonal:
//   z(0:4) = U(0:4,0:4)^{-1} * ( y - A(0:4,4:kmax) * x(4:kmax) )
// where U is the 4x4 triangle at the top of A and inv_diag_A holds the
// reciprocals of its diagonal entries.
void kernel_strsv_un_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z) {

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	float m_one = -1.0;
	float one = 1.0;

	// w = y - A(0:4,4:kmax) * x(4:kmax)
	kernel_sgemv_n_4_lib4(kmax-4, &m_one, A+4*bs, x+4, &one, y, w);

	// bottom of the triangle
	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	w[2] -= A[2+bs*3] * w[3];
	w[2] *= inv_diag_A[2];
	z[2] = w[2];

	// off-diagonal 2x2 square above the solved pair
	w[0] -= A[0+bs*2]*w[2] + A[0+bs*3]*w[3];
	w[1] -= A[1+bs*2]*w[2] + A[1+bs*3]*w[3];

	// top of the triangle
	w[1] *= inv_diag_A[1];
	z[1] = w[1];

	w[0] -= A[0+bs*1] * w[1];
	w[0] *= inv_diag_A[0];
	z[0] = w[0];

}
1257 #endif
1258 
1259 
1260 
1261 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size solve of up to 4 unknowns of a transposed upper-triangular
// system with pre-inverted diagonal:
//   z = U(k1:kmax,0:4)^{-T} * ( y - A(0:k1,0:4)^T * x(0:k1) )
// n1 limits how many columns are actually solved, m1 limits how many
// entries of z are written back.
void kernel_strsv_ut_inv_4_vs_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int m1, int n1) {

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	int k1 = kmax/bs*bs; // rows handled by the gemv part (multiple of bs)
	float m_one = -1.0;
	float one = 1.0;

	// w = y - A(0:k1,0:4)^T * x(0:k1)
	kernel_sgemv_t_4_lib4(k1, &m_one, 0, A, sda, x, &one, y, w);

	A += sda*k1; // panel holding the 4x4 diagonal triangle

	float dinv, l0, l1, l2;

	// column 0
	dinv = inv_diag_A[0];
	l0 = A[0+bs*1];
	l1 = A[0+bs*2];
	l2 = A[0+bs*3];
	w[0] *= dinv;
	z[0] = w[0];
	w[1] -= l0 * w[0];
	w[2] -= l1 * w[0];
	w[3] -= l2 * w[0];

	if (n1 == 1) {
		// store only the remaining (unsolved) entries, up to m1
		if (m1 == 1) return;
		z[1] = w[1];
		if (m1 == 2) return;
		z[2] = w[2];
		if (m1 == 3) return;
		z[3] = w[3];
		return;
	}

	// column 1
	dinv = inv_diag_A[1];
	l1 = A[1+bs*2];
	l2 = A[1+bs*3];
	w[1] *= dinv;
	z[1] = w[1];
	w[2] -= l1 * w[1];
	w[3] -= l2 * w[1];

	if (n1 == 2) {
		if (m1 == 2) return;
		z[2] = w[2];
		if (m1 == 3) return;
		z[3] = w[3];
		return;
	}

	// column 2
	dinv = inv_diag_A[2];
	l2 = A[2+bs*3];
	w[2] *= dinv;
	z[2] = w[2];
	w[3] -= l2 * w[2];

	if (n1 == 3) {
		if (m1 == 3) return;
		z[3] = w[3];
		return;
	}

	// column 3
	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	return;

}
1350 #endif
1351 
1352 
1353 
1354 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Solve 4 unknowns of a transposed upper-triangular system with
// pre-inverted diagonal:
//   z(0:4) = U(k1:kmax,0:4)^{-T} * ( y - A(0:k1,0:4)^T * x(0:k1) )
// where k1 = kmax rounded down to a multiple of the panel size.
void kernel_strsv_ut_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z) {

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	int k1 = kmax/bs*bs; // rows handled by the gemv part (multiple of bs)
	float m_one = -1.0;
	float one = 1.0;

	// w = y - A(0:k1,0:4)^T * x(0:k1)
	kernel_sgemv_t_4_lib4(k1, &m_one, 0, A, sda, x, &one, y, w);

	A += sda*k1; // panel holding the 4x4 diagonal triangle

	float dinv, l0, l1, l2;

	// column 0
	dinv = inv_diag_A[0];
	l0 = A[0+bs*1];
	l1 = A[0+bs*2];
	l2 = A[0+bs*3];
	w[0] *= dinv;
	z[0] = w[0];
	w[1] -= l0 * w[0];
	w[2] -= l1 * w[0];
	w[3] -= l2 * w[0];

	// column 1
	dinv = inv_diag_A[1];
	l1 = A[1+bs*2];
	l2 = A[1+bs*3];
	w[1] *= dinv;
	z[1] = w[1];
	w[2] -= l1 * w[1];
	w[3] -= l2 * w[1];

	// column 2
	dinv = inv_diag_A[2];
	l2 = A[2+bs*3];
	w[2] *= dinv;
	z[2] = w[2];
	w[3] -= l2 * w[2];

	// column 3
	w[3] *= inv_diag_A[3];
	z[3] = w[3];

	return;

}
1409 #endif
1410 
1411 
1412 
1413 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Upper-triangular matrix-vector product (non-transposed), 4 rows:
//   z(0:4) = U(0:4,0:4) * x(0:4) + A(0:4,4:kmax) * x(4:kmax)
// The 4x4 triangle is expanded explicitly; the rectangular remainder is
// delegated to the general gemv kernel.
void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z) {

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	float v0 = x[0];
	float v1 = x[1];
	float v2 = x[2];
	float v3 = x[3];

	// 4x4 upper triangle: column j contributes to rows 0..j only
	w[0] += A[0+bs*0] * v0;

	w[0] += A[0+bs*1] * v1;
	w[1] += A[1+bs*1] * v1;

	w[0] += A[0+bs*2] * v2;
	w[1] += A[1+bs*2] * v2;
	w[2] += A[2+bs*2] * v2;

	w[0] += A[0+bs*3] * v3;
	w[1] += A[1+bs*3] * v3;
	w[2] += A[2+bs*3] * v3;
	w[3] += A[3+bs*3] * v3;

	float one = 1.0;

	// z = w + A(0:4,4:kmax) * x(4:kmax)
	kernel_sgemv_n_4_lib4(kmax-4, &one, A+4*bs, x+4, &one, w, z);

	return;

}
1456 #endif
1457 
1458 
1459 
1460 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size transposed upper-triangular matrix-vector product, 4 cols:
//   w = A(0:k1,0:4)^T * x(0:k1) + U(k1:kmax,0:4)^T * x(k1:kmax)
// with k1 = kmax rounded down to a multiple of the panel size; only the
// first m1 entries of z are written (z[0] is always written).
void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int m1) {

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	int k1 = kmax/bs*bs; // rows handled by the gemv part (multiple of bs)
	float one = 1.0;

	// w += A(0:k1,0:4)^T * x(0:k1)
	kernel_sgemv_t_4_lib4(k1, &one, 0, A, sda, x, &one, w, w);

	A += k1*sda;
	x += k1;

	float v0 = x[0];
	float v1 = x[1];
	float v2 = x[2];
	float v3 = x[3];

	// transposed 4x4 upper triangle: row i contributes to columns i..3
	w[0] += A[0+bs*0] * v0;
	w[1] += A[0+bs*1] * v0;
	w[2] += A[0+bs*2] * v0;
	w[3] += A[0+bs*3] * v0;

	w[1] += A[1+bs*1] * v1;
	w[2] += A[1+bs*2] * v1;
	w[3] += A[1+bs*3] * v1;

	w[2] += A[2+bs*2] * v2;
	w[3] += A[2+bs*3] * v2;

	w[3] += A[3+bs*3] * v3;

	// store up to m1 entries; z[0] is written unconditionally
	z[0] = w[0];
	if (m1 > 1)
		z[1] = w[1];
	if (m1 > 2)
		z[2] = w[2];
	if (m1 > 3)
		z[3] = w[3];

	return;

}
1531 #endif
1532 
1533 
1534 
1535 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Transposed upper-triangular matrix-vector product, 4 columns:
//   z(0:4) = A(0:k1,0:4)^T * x(0:k1) + U(k1:kmax,0:4)^T * x(k1:kmax)
// with k1 = kmax rounded down to a multiple of the panel size.
void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z) {

	const int bs = 4;

	float w[4] = {0, 0, 0, 0};

	int k1 = kmax/bs*bs; // rows handled by the gemv part (multiple of bs)
	float one = 1.0;

	// w += A(0:k1,0:4)^T * x(0:k1)
	kernel_sgemv_t_4_lib4(k1, &one, 0, A, sda, x, &one, w, w);

	A += k1*sda;
	x += k1;

	float v0 = x[0];
	float v1 = x[1];
	float v2 = x[2];
	float v3 = x[3];

	// transposed 4x4 upper triangle: row i contributes to columns i..3
	w[0] += A[0+bs*0] * v0;
	w[1] += A[0+bs*1] * v0;
	w[2] += A[0+bs*2] * v0;
	w[3] += A[0+bs*3] * v0;

	w[1] += A[1+bs*1] * v1;
	w[2] += A[1+bs*2] * v1;
	w[3] += A[1+bs*3] * v1;

	w[2] += A[2+bs*2] * v2;
	w[3] += A[2+bs*3] * v2;

	w[3] += A[3+bs*3] * v3;

	z[0] = w[0];
	z[1] = w[1];
	z[2] = w[2];
	z[3] = w[3];

	return;

}
1590 #endif
1591 
1592 
1593