1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
// NOTE: the caller must copy and scale y_n into z_n before calling this kernel; the kernel only accumulates into z_n.
39 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A53) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Process one row of a 4-column panel for both products at once.
//   a   : first element of the row; consecutive columns are 4 floats apart
//   xs  : the four already-scaled x_n values
//   xt  : the x_t element that belongs to this row
//   zn  : z_n element of this row, updated in place (zn[0] += sum_j a[4*j]*xs[j])
//   yt  : four running sums of the transposed product (yt[j] += a[4*j]*xt)
static inline void sgemv_nt_4_acc_row_lib4(const float *a, const float *xs, float xt, float *zn, float *yt)
	{

	const float a_0 = a[0*4];
	const float a_1 = a[1*4];
	const float a_2 = a[2*4];
	const float a_3 = a[3*4];

	float y_n = zn[0];

	y_n   += a_0 * xs[0];
	yt[0] += a_0 * xt;
	y_n   += a_1 * xs[1];
	yt[1] += a_1 * xt;
	y_n   += a_2 * xs[2];
	yt[2] += a_2 * xt;
	y_n   += a_3 * xs[3];
	yt[3] += a_3 * xt;

	zn[0] = y_n;

	}



// Fused "n" and "t" matrix-vector products on a kmax x 4 block, variable-size version.
//
// A is read in panels of bs=4 rows: element (i,j) of a panel is A[i+bs*j], and the next
// panel starts sda*bs floats further on.
//
//   z_n[i] += sum_j A(i,j) * (alpha_n[0]*x_n[j])              for i = 0..kmax-1
//   z_t[j]  = alpha_t[0] * sum_i A(i,j)*x_t[i] + beta_t[0]*y_t[j]
//
// Only the first km entries (at least one, at most four) of x_n are read and of z_t are
// written; the x_n values past km are taken as zero, but the matching columns of A are
// still read and multiplied.
// z_n is accumulated into, so it has to hold its initial value on entry.
// Nothing is read or written when kmax<=0 (in particular z_t is left untouched).
void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	// number of active columns: entry 0 is always processed, never more than bs
	const int ncol = km<1 ? 1 : (km>bs ? bs : km);

	float xs[4] = {0.0f, 0.0f, 0.0f, 0.0f}; // alpha_n * x_n, zero past km
	float yt[4] = {0.0f, 0.0f, 0.0f, 0.0f}; // accumulators of the "t" product

	int j, k;

	for(j=0; j<ncol; j++)
		xs[j] = alpha_n[0]*x_n[j];

	// full panels, four rows at a time
	k = 0;
	for(; k<kmax-3; k+=bs)
		{

		sgemv_nt_4_acc_row_lib4(A+0, xs, x_t[0], z_n+0, yt);
		sgemv_nt_4_acc_row_lib4(A+1, xs, x_t[1], z_n+1, yt);
		sgemv_nt_4_acc_row_lib4(A+2, xs, x_t[2], z_n+2, yt);
		sgemv_nt_4_acc_row_lib4(A+3, xs, x_t[3], z_n+3, yt);

		A += sda*bs;
		z_n += 4;
		x_t += 4;

		}
	// left-over rows (fewer than four) of the last panel
	for(; k<kmax; k++)
		{

		sgemv_nt_4_acc_row_lib4(A, xs, x_t[0], z_n, yt);

		A += 1;
		z_n += 1;
		x_t += 1;

		}

	// store t
	for(j=0; j<ncol; j++)
		z_t[j] = alpha_t[0]*yt[j] + beta_t[0]*y_t[j];

	return;

	}
224 #endif
225
226
227
// NOTE: the caller must copy and scale y_n into z_n before calling this kernel; the kernel only accumulates into z_n.
229 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A53) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size front end of kernel_sgemv_nt_4_vs_lib4: forwards every argument unchanged
// and requests all four columns (km = 4).
void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
	{

	kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);

	}
238 #endif
239
240
241
// NOTE: the caller must copy and scale y_n into z_n before calling this kernel; the kernel only accumulates into z_n.
243 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A53) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Process one full (non-triangular) row of a 4-column panel for both products at once.
//   a   : first element of the row; consecutive columns are 4 floats apart
//   xs  : the four already-scaled x_n values
//   xt  : the x element that belongs to this row
//   zn  : z_n element of this row, updated in place (zn[0] += sum_j a[4*j]*xs[j])
//   yt  : four running sums of the transposed product (yt[j] += a[4*j]*xt)
static inline void ssymv_l_4_acc_row_lib4(const float *a, const float *xs, float xt, float *zn, float *yt)
	{

	const float a_0 = a[0*4];
	const float a_1 = a[1*4];
	const float a_2 = a[2*4];
	const float a_3 = a[3*4];

	float y_n = zn[0];

	y_n   += a_0 * xs[0];
	yt[0] += a_0 * xt;
	y_n   += a_1 * xs[1];
	yt[1] += a_1 * xt;
	y_n   += a_2 * xs[2];
	yt[2] += a_2 * xt;
	y_n   += a_3 * xs[3];
	yt[3] += a_3 * xt;

	zn[0] = y_n;

	}



// Symmetric matrix-vector product on a kmax x 4 column block whose lower part is stored,
// for a block that starts at row offA inside a panel.
//
// A points at the first element of the block. Rows are stored in panels of bs=4 rows:
// moving one row down is A+1, the column stride is bs, and after the last row of a panel
// the next row is found (sda-1)*bs floats further on.
//
// The first four rows form the triangle: row r only uses columns 0..r, and its diagonal
// element is used once. Entries above the diagonal are never read. All later rows use the
// four columns. For every used element a = A(i,j):
//   z_n[i] += a * (alpha[0]*x_n[j])          (skipped on the diagonal)
//   z_n[j] += alpha[0] * a * x_n[i]          (accumulated over i, added once at the end)
//
// Only the first km entries (at least one, at most four) of x_n are read as column values
// and only that many "transposed" sums are added to z_n[0..km-1]; the column values past
// km are taken as zero.
// offA is expected to be 0..3; any value other than 0, 1 or 2 is treated as 3.
// z_n is accumulated into, so it has to hold its initial value on entry.
// Nothing is done when kmax<=0.
void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	// the transposed product works on the same vectors
	float *x_t = x_n;
	float *z_t = z_n;

	// number of active columns: entry 0 is always processed, never more than bs
	const int ncol = km<1 ? 1 : (km>bs ? bs : km);

	// row offset inside the first panel
	const int off = (offA==0 || offA==1 || offA==2) ? offA : 3;

	// rows handled one by one: the 4 triangle rows plus whatever is needed to reach the
	// next panel boundary (4, 7, 6, 5 rows for off = 0, 1, 2, 3), capped at kmax
	int head = bs + (bs-off)%bs;
	if(head>kmax)
		head = kmax;

	float xs[4] = {0.0f, 0.0f, 0.0f, 0.0f}; // alpha * x_n, zero past km
	float yt[4] = {0.0f, 0.0f, 0.0f, 0.0f}; // accumulators of the transposed product

	int j, k;

	for(j=0; j<ncol; j++)
		xs[j] = alpha[0]*x_n[j];

	for(k=0; k<head; k++)
		{

		if(k<bs)
			{
			// triangle row k: columns 0..k-1 feed z_n, columns 0..k feed yt
			const float xt = x_t[0];

			if(k>0)
				{
				float y_n = z_n[0];
				for(j=0; j<k; j++)
					y_n += A[bs*j] * xs[j];
				z_n[0] = y_n;
				}

			for(j=0; j<=k; j++)
				yt[j] += A[bs*j] * xt;
			}
		else
			{
			ssymv_l_4_acc_row_lib4(A, xs, x_t[0], z_n, yt);
			}

		A += 1;
		z_n += 1;
		x_t += 1;

		if((off+k+1)%bs==0)
			A += (sda-1)*bs; // new panel

		}

	// A is now at the start of a panel (or all rows are done): full panels, four rows at a time
	for(; k<kmax-3; k+=bs)
		{

		ssymv_l_4_acc_row_lib4(A+0, xs, x_t[0], z_n+0, yt);
		ssymv_l_4_acc_row_lib4(A+1, xs, x_t[1], z_n+1, yt);
		ssymv_l_4_acc_row_lib4(A+2, xs, x_t[2], z_n+2, yt);
		ssymv_l_4_acc_row_lib4(A+3, xs, x_t[3], z_n+3, yt);

		A += sda*bs;
		z_n += 4;
		x_t += 4;

		}
	// left-over rows (fewer than four) of the last panel
	for(; k<kmax; k++)
		{

		ssymv_l_4_acc_row_lib4(A, xs, x_t[0], z_n, yt);

		A += 1;
		z_n += 1;
		x_t += 1;

		}

	// store t
	for(j=0; j<ncol; j++)
		z_t[j] += alpha[0]*yt[j];

	return;

	}
1013 #endif
1014
1015
1016
// NOTE: the caller must copy and scale y_n into z_n before calling this kernel; the kernel only accumulates into z_n.
1018 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A53) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Panel-aligned, fixed-size front end of kernel_ssymv_l_4_gen_lib4: forwards its
// arguments with offA = 0 and all four columns requested (km = 4).
void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
	{

	kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);

	}
1027 #endif
1028
1029
1030
1031
1032
1033