1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 
38 // XXX copy and scale y_n into z_n outside the kernel !!!!!
39 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Fused "n" and "t" matrix-vector kernel over a 4-column block of A, stored in
// panels of 4 rows (element (i,j) of a panel at A[i+4*j], next panel at A+4*sda).
//
//   n part: z_n[i] += sum_j A(i,j) * (alpha_n[0]*x_n[j])           i = 0..kmax-1
//   t part: z_t[j]  = alpha_t[0] * sum_i A(i,j)*x_t[i] + beta_t[0]*y_t[j]
//
// kmax   number of rows of A to process; nothing is read or written if kmax<=0
// km     number of active columns: column 0 is always active, columns 1..3
//        only when km exceeds their index; inactive columns enter the n part
//        with a zero coefficient (their A entries are still read) and their
//        z_t entries are left untouched
// z_n    is updated in place, so it has to hold the (already scaled) y_n on entry
void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km)
	{

	const int ps = 4; // panel height, also the width of the column block

	double xs[4] = {0.0, 0.0, 0.0, 0.0}; // alpha_n-scaled x_n, zero for inactive columns
	double dot[4] = {0.0, 0.0, 0.0, 0.0}; // running sums of A(:,j)^T * x_t
	double acc, xi, aij;
	int ncol, row, ii, jj;

	if(kmax<=0)
		return;

	// number of active columns, clamped to 1..4
	ncol = km;
	if(ncol<1)
		ncol = 1;
	if(ncol>ps)
		ncol = ps;

	for(jj=0; jj<ncol; jj++)
		xs[jj] = alpha_n[0]*x_n[jj];

	// whole panels: 4 rows at a time
	row = 0;
	for(; row<kmax-3; row+=ps)
		{
		for(ii=0; ii<ps; ii++)
			{
			acc = z_n[ii];
			xi = x_t[ii];
			for(jj=0; jj<ps; jj++)
				{
				aij = A[ii+ps*jj];
				acc += aij * xs[jj];
				dot[jj] += aij * xi;
				}
			z_n[ii] = acc;
			}
		A += sda*ps;
		z_n += ps;
		x_t += ps;
		}

	// left-over rows, all inside the current panel
	for(; row<kmax; row++)
		{
		acc = z_n[0];
		xi = x_t[0];
		for(jj=0; jj<ps; jj++)
			{
			aij = A[ps*jj];
			acc += aij * xs[jj];
			dot[jj] += aij * xi;
			}
		z_n[0] = acc;
		A += 1;
		z_n += 1;
		x_t += 1;
		}

	// store t, active columns only
	for(jj=0; jj<ncol; jj++)
		z_t[jj] = alpha_t[0]*dot[jj] + beta_t[0]*y_t[jj];

	return;

	}
224 #endif
225 
226 
227 
228 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
229 // XXX copy and scale y_n into z_n outside the kernel !!!!!
// Fixed-size front end of kernel_dgemv_nt_4_vs_lib4: forwards every argument
// unchanged and requests all 4 columns (km = 4).
void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t)
	{

	const int all_cols = 4; // value passed as km to the variable-size kernel

	kernel_dgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, all_cols);

	}
238 #endif
239 
240 
241 
242 // XXX copy and scale y_n into z_n outside the kernel !!!!!
243 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Symmetric matrix-vector kernel for one 4-column block of the lower triangle
// of A, stored in panels of 4 rows (entry (i,j) of a panel at A[i+4*j], next
// panel 4*sda doubles further). Row 0 of the block is the diagonal row.
//
//   below the diagonal:  z_n[i] += sum_{j<i, j<4} A(i,j) * (alpha[0]*x_n[j])
//   transposed part:     z_n[j] += alpha[0] * sum_{i>=j} A(i,j) * x_n[i]
//
// kmax   number of rows of the block to process; nothing happens if kmax<=0
// offA   row position of A inside its panel; 0, 1 and 2 are taken as given,
//        every other value is handled as 3
// km     number of active columns: column 0 is always active, columns 1..3
//        only when km exceeds their index; inactive columns get a zero
//        coefficient below the diagonal and no transposed-part update
//
// Entries above the diagonal of the leading 4x4 block are never read.
// z_n is updated in place, so it has to hold the (already scaled) y on entry.
void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x_n, double *z_n, int km)
	{

	const int ps = 4; // panel height, also the width of the column block

	// the transposed part reads the same x and updates the same z, from their start
	double *x_t = x_n;
	double *z_t = z_n;

	double xs[4] = {0.0, 0.0, 0.0, 0.0}; // alpha-scaled x_n, zero for inactive columns
	double dot[4] = {0.0, 0.0, 0.0, 0.0}; // running sums of A(:,j)^T * x
	double acc, xi, aij;
	int ncol, prow, head, nlow, row, ii, jj;

	if(kmax<=0)
		return;

	// number of active columns, clamped to 1..4
	ncol = km;
	if(ncol<1)
		ncol = 1;
	if(ncol>ps)
		ncol = ps;

	for(jj=0; jj<ncol; jj++)
		xs[jj] = alpha[0]*x_n[jj];

	// row position inside the current panel
	prow = offA;
	if(prow<0 || prow>3)
		prow = 3;

	// Rows handled one by one: with an aligned start just the 4 rows of the
	// diagonal block; otherwise the remainder of the first panel plus the whole
	// following panel, so that the bulk loop below always starts on a panel
	// boundary. Never more than kmax rows.
	head = prow==0 ? ps : 2*ps-prow;
	if(head>kmax)
		head = kmax;

	for(row=0; row<head; row++)
		{
		xi = x_t[0];
		if(row==0)
			{
			// diagonal row: only the diagonal entry, z_n is not touched
			aij = A[0];
			dot[0] += aij * xi;
			}
		else
			{
			// columns strictly left of the diagonal (all 4 once below the 4x4 block)
			nlow = row<ps ? row : ps;
			acc = z_n[0];
			for(jj=0; jj<nlow; jj++)
				{
				aij = A[ps*jj];
				acc += aij * xs[jj];
				dot[jj] += aij * xi;
				}
			if(row<ps)
				{
				// diagonal entry feeds the transposed part only
				aij = A[ps*row];
				dot[row] += aij * xi;
				}
			z_n[0] = acc;
			}

		A += 1;
		z_n += 1;
		x_t += 1;

		prow += 1;
		if(prow==ps)
			{
			A += (sda-1)*ps; // new panel
			prow = 0;
			}
		}

	// whole panels of full rows, 4 rows at a time
	for(; row<kmax-3; row+=ps)
		{
		for(ii=0; ii<ps; ii++)
			{
			acc = z_n[ii];
			xi = x_t[ii];
			for(jj=0; jj<ps; jj++)
				{
				aij = A[ii+ps*jj];
				acc += aij * xs[jj];
				dot[jj] += aij * xi;
				}
			z_n[ii] = acc;
			}
		A += sda*ps;
		z_n += ps;
		x_t += ps;
		}

	// left-over full rows, all inside the current panel
	for(; row<kmax; row++)
		{
		acc = z_n[0];
		xi = x_t[0];
		for(jj=0; jj<ps; jj++)
			{
			aij = A[ps*jj];
			acc += aij * xs[jj];
			dot[jj] += aij * xi;
			}
		z_n[0] = acc;
		A += 1;
		z_n += 1;
		x_t += 1;
		}

	// add the transposed part to the leading entries of z, active columns only
	for(jj=0; jj<ncol; jj++)
		z_t[jj] += alpha[0]*dot[jj];

	return;

	}
1013 #endif
1014 
1015 
1016 
1017 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1018 // XXX copy and scale y_n into z_n outside the kernel !!!!!
// Fixed-size front end of kernel_dsymv_l_4_gen_lib4: forwards its arguments
// with a zero panel offset (offA = 0) and all 4 columns active (km = 4).
void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x_n, double *z_n)
	{

	const int panel_offset = 0; // value passed as offA
	const int all_cols = 4; // value passed as km

	kernel_dsymv_l_4_gen_lib4(kmax, alpha, panel_offset, A, sda, x_n, z_n, all_cols);

	}
1027 #endif
1028 
1029 
1030 
1031 
1032