1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37 // ---- ge
38
39 // 4
40
41 // both A and B are aligned to 256-bit boundaries
// Scaled copy, 4 rows, no row offset: for each of kmax columns (4 floats apart),
// B[i] = alphap[0] * A[i] for i = 0..3. No-op (alphap is not read) when kmax <= 0.
void kernel_sgecpsc_4_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	// the scaling factor is loaded once, before any store to B
	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<4; ii++)
			dst[ii] = scale * src[ii];

		// next column
		src += 4;
		dst += 4;
	}
}
67
// Plain copy, 4 rows, no row offset: each of the kmax iterations copies 4
// consecutive floats from A to B and then steps both by 4 floats. No-op when kmax <= 0.
void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	int left, ii;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<4; ii++)
			dst[ii] = src[ii];

		// next column
		src += 4;
		dst += 4;
	}
}
92
93 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Scaled copy, 4 rows, source shifted by 1 row: per column, B[0..2] take
// alphap[0] times A0[1..3] and B[3] takes alphap[0] times the first float of the
// second source block, which starts 4*sda floats after A0. No-op when kmax <= 0.
void kernel_sgecpsc_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] = scale * top[ii+1];
		dst[3] = scale * bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
123
124 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Plain copy, 4 rows, source shifted by 1 row: per column, B[0..2] = A0[1..3]
// and B[3] = first float of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int left, ii;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] = top[ii+1];
		dst[3] = bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
153
// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled copy, 4 rows, source shifted by 2 rows: per column, B[0..1] take
// alphap[0] times A0[2..3] and B[2..3] take alphap[0] times the first two floats
// of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecpsc_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		dst[0] = scale * top[2];
		dst[1] = scale * top[3];
		dst[2] = scale * bot[0];
		dst[3] = scale * bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
184
// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Plain copy, 4 rows, source shifted by 2 rows: per column, B[0..1] = A0[2..3]
// and B[2..3] = first two floats of the second source block (A0 + 4*sda).
// No-op when kmax <= 0.
void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] = top[2];
		dst[1] = top[3];
		dst[2] = bot[0];
		dst[3] = bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
215
// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled copy, 4 rows, source shifted by 3 rows: per column, B[0] takes
// alphap[0] times A0[3] and B[1..3] take alphap[0] times the first three floats
// of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecpsc_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		dst[0] = scale * top[3];
		for(ii=0; ii<3; ii++)
			dst[ii+1] = scale * bot[ii];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
246
// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Plain copy, 4 rows, source shifted by 3 rows: per column, B[0] = A0[3] and
// B[1..3] = first three floats of the second source block (A0 + 4*sda).
// No-op when kmax <= 0.
void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int left, ii;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] = top[3];
		for(ii=0; ii<3; ii++)
			dst[ii+1] = bot[ii];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
277
278 // 3
279
// Scaled copy, 3 rows, no row offset: per column (4 floats apart),
// B[i] = alphap[0] * A[i] for i = 0..2; the 4th float of each column of B is
// left untouched. No-op when kmax <= 0.
void kernel_sgecpsc_3_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] = scale * src[ii];

		src += 4;
		dst += 4;
	}
}
305
// Plain copy, 3 rows, no row offset: per column (4 floats apart),
// B[i] = A[i] for i = 0..2; the 4th float of each column of B is left untouched.
// No-op when kmax <= 0.
void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	int left, ii;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] = src[ii];

		src += 4;
		dst += 4;
	}
}
329
330 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled copy, 3 rows, source shifted by 2 rows: per column, B[0..1] take
// alphap[0] times A0[2..3] and B[2] takes alphap[0] times the first float of the
// second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecpsc_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		dst[0] = scale * top[2];
		dst[1] = scale * top[3];
		dst[2] = scale * bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
359
360 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Plain copy, 3 rows, source shifted by 2 rows: per column, B[0..1] = A0[2..3]
// and B[2] = first float of the second source block (A0 + 4*sda).
// No-op when kmax <= 0.
void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] = top[2];
		dst[1] = top[3];
		dst[2] = bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
388
389 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled copy, 3 rows, source shifted by 3 rows: per column, B[0] takes
// alphap[0] times A0[3] and B[1..2] take alphap[0] times the first two floats of
// the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecpsc_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		dst[0] = scale * top[3];
		dst[1] = scale * bot[0];
		dst[2] = scale * bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
418
419 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Plain copy, 3 rows, source shifted by 3 rows: per column, B[0] = A0[3] and
// B[1..2] = first two floats of the second source block (A0 + 4*sda).
// No-op when kmax <= 0.
void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] = top[3];
		dst[1] = bot[0];
		dst[2] = bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
447
448 // 2
449
// Scaled copy, 2 rows, no row offset: per column (4 floats apart),
// B[0..1] = alphap[0] * A[0..1]; the other two floats of each column of B are
// left untouched. No-op when kmax <= 0.
void kernel_sgecpsc_2_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		dst[0] = scale * src[0];
		dst[1] = scale * src[1];

		src += 4;
		dst += 4;
	}
}
473
// Plain copy, 2 rows, no row offset: per column (4 floats apart),
// B[0..1] = A[0..1]; the other two floats of each column of B are left untouched.
// No-op when kmax <= 0.
void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	int left;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
	{
		dst[0] = src[0];
		dst[1] = src[1];

		src += 4;
		dst += 4;
	}
}
496
497 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Scaled copy, 2 rows, source shifted by 3 rows: per column, B[0] takes
// alphap[0] times A0[3] and B[1] takes alphap[0] times the first float of the
// second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecpsc_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] = scale * top[3];
		dst[1] = scale * bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
523
524 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Plain copy, 2 rows, source shifted by 3 rows: per column, B[0] = A0[3] and
// B[1] = first float of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int left;

	if(kmax<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] = top[3];
		dst[1] = bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
550
551 // 1
552
// Scaled copy, 1 row: for each of kmax positions spaced 4 floats apart,
// B[0] = alphap[0] * A[0]. No-op when kmax <= 0.
void kernel_sgecpsc_1_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		*dst = scale * *src;

		src += 4;
		dst += 4;
	}
}
576
// Plain copy, 1 row: for each of kmax positions spaced 4 floats apart,
// B[0] = A[0]. No-op when kmax <= 0.
void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	int left;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
	{
		*dst = *src;

		src += 4;
		dst += 4;
	}
}
598
599
600
601 // ---- tr
602
603 // both A and B are aligned to 256-bit boundaries
// Lower-triangular copy, 4 rows, no row offset.
// First kmax+1 full 4-float columns are copied, then the strictly-lower part of
// the following 3 columns (rows 1..3, 2..3 and 3). Returns without touching B
// when kmax+1 <= 0.
void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *src = A;
	float *dst = B;
	int jj, ii;

	if(ncol<=0)
		return;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		for(ii=0; ii<4; ii++)
			dst[ii] = src[ii];

		src += 4;
		dst += 4;
	}

	// trailing 3x3 triangle: column jj keeps rows jj+1..3
	for(jj=0; jj<3; jj++)
	{
		for(ii=jj+1; ii<4; ii++)
			dst[ii+4*jj] = src[ii+4*jj];
	}
}
644
645
646
647 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Lower-triangular copy, 4 rows, source shifted by 1 row.
// Destination row i reads source row i+1: rows 0..2 from the block at A0
// (entries 1..3), row 3 from entry 0 of the block at A0 + 4*sda.
// kmax+1 full columns are copied, followed by the strictly-lower part of the
// next 3 columns. Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int jj;

	if(ncol<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = top[1];
		dst[1] = top[2];
		dst[2] = top[3];
		dst[3] = bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}

	// trailing 3x3 triangle
	// first column: rows 1..3
	dst[1] = top[2];
	dst[2] = top[3];
	dst[3] = bot[0];
	// second column: rows 2..3
	dst[6] = top[7];
	dst[7] = bot[4];
	// third column: row 3
	dst[11] = bot[8];
}
691
692
693
694 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Lower-triangular copy, 4 rows, source shifted by 2 rows.
// Destination rows 0..1 read entries 2..3 of the block at A0, rows 2..3 read
// entries 0..1 of the block at A0 + 4*sda.
// kmax+1 full columns are copied, followed by the strictly-lower part of the
// next 3 columns. Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int jj;

	if(ncol<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = top[2];
		dst[1] = top[3];
		dst[2] = bot[0];
		dst[3] = bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}

	// trailing 3x3 triangle
	// first column: rows 1..3
	dst[1] = top[3];
	dst[2] = bot[0];
	dst[3] = bot[1];
	// second column: rows 2..3
	dst[6] = bot[4];
	dst[7] = bot[5];
	// third column: row 3
	dst[11] = bot[9];
}
738
739
740
741 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Lower-triangular copy, 4 rows, source shifted by 3 rows.
// Destination row 0 reads entry 3 of the block at A0, rows 1..3 read entries
// 0..2 of the block at A0 + 4*sda.
// kmax+1 full columns are copied, followed by the strictly-lower part of the
// next 3 columns. Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int jj;

	if(ncol<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = top[3];
		dst[1] = bot[0];
		dst[2] = bot[1];
		dst[3] = bot[2];

		top += 4;
		bot += 4;
		dst += 4;
	}

	// trailing 3x3 triangle: only the second block is read from here on
	// first column: rows 1..3
	dst[1] = bot[0];
	dst[2] = bot[1];
	dst[3] = bot[2];
	// second column: rows 2..3
	dst[6] = bot[5];
	dst[7] = bot[6];
	// third column: row 3
	dst[11] = bot[10];
}
785
786
787
788 // both A and B are aligned to 64-bit boundaries
// Lower-triangular copy, 3 rows, no row offset.
// kmax+1 columns of 3 floats (column stride 4) are copied, then the
// strictly-lower part of the following 2 columns (rows 1..2 and row 2).
// Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *src = A;
	float *dst = B;
	int jj, ii;

	if(ncol<=0)
		return;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] = src[ii];

		src += 4;
		dst += 4;
	}

	// trailing 2x2 triangle: column jj keeps rows jj+1..2
	for(jj=0; jj<2; jj++)
	{
		for(ii=jj+1; ii<3; ii++)
			dst[ii+4*jj] = src[ii+4*jj];
	}
}
824
825
826
827 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Lower-triangular copy, 3 rows, source shifted by 2 rows.
// Destination rows 0..1 read entries 2..3 of the block at A0, row 2 reads
// entry 0 of the block at A0 + 4*sda.
// kmax+1 full columns are copied, followed by the strictly-lower part of the
// next 2 columns. Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int jj;

	if(ncol<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = top[2];
		dst[1] = top[3];
		dst[2] = bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}

	// trailing 2x2 triangle
	// first column: rows 1..2
	dst[1] = top[3];
	dst[2] = bot[0];
	// second column: row 2
	dst[6] = bot[4];
}
866
867
868
869 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Lower-triangular copy, 3 rows, source shifted by 3 rows.
// Destination row 0 reads entry 3 of the block at A0, rows 1..2 read entries
// 0..1 of the block at A0 + 4*sda.
// kmax+1 full columns are copied, followed by the strictly-lower part of the
// next 2 columns. Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int jj;

	if(ncol<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = top[3];
		dst[1] = bot[0];
		dst[2] = bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}

	// trailing 2x2 triangle: only the second block is read from here on
	// first column: rows 1..2
	dst[1] = bot[0];
	dst[2] = bot[1];
	// second column: row 2
	dst[6] = bot[5];
}
908
909
910
911 // both A and B are aligned to 64-bit boundaries
// Lower-triangular copy, 2 rows, no row offset.
// kmax+1 columns of 2 floats (column stride 4) are copied, then the single
// strictly-lower element (row 1) of the following column.
// Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_2_0_lib4(int kmax, float *A, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *src = A;
	float *dst = B;
	int jj;

	if(ncol<=0)
		return;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = src[0];
		dst[1] = src[1];

		src += 4;
		dst += 4;
	}

	// trailing 1x1 triangle
	dst[1] = src[1];
}
943
944
945
946 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Lower-triangular copy, 2 rows, source shifted by 3 rows.
// Destination row 0 reads entry 3 of the block at A0, row 1 reads entry 0 of
// the block at A0 + 4*sda.
// kmax+1 full columns are copied, followed by the single strictly-lower
// element of the next column. Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const int ncol = kmax + 1; // number of full columns
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	int jj;

	if(ncol<=0)
		return;

	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	// rectangular part
	for(jj=0; jj<ncol; jj++)
	{
		dst[0] = top[3];
		dst[1] = bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}

	// trailing 1x1 triangle
	dst[1] = bot[0];
}
981
982
983
// both A and B are aligned to 64-bit boundaries
// Lower-triangular copy, 1 row: copies the first float of kmax+1 consecutive
// columns (column stride 4). There is no trailing triangle for a single row.
// Returns without touching B when kmax+1 <= 0.
void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
{
	const int ncol = kmax + 1; // number of columns
	const float *src = A;
	float *dst = B;
	int jj;

	if(ncol<=0)
		return;

	for(jj=0; jj<ncol; jj++)
	{
		*dst = *src;

		src += 4;
		dst += 4;
	}
}
1011
1012
1013 // --- add
1014
1015 // both A and B are aligned to 256-bit boundaries
// Scaled accumulate, 4 rows, no row offset: for each of kmax columns
// (4 floats apart), B[i] += alphap[0] * A[i] for i = 0..3.
// No-op (alphap is not read) when kmax <= 0.
void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<4; ii++)
			dst[ii] += scale * src[ii];

		src += 4;
		dst += 4;
	}
}
1042
1043
1044
1045 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Scaled accumulate, 4 rows, source shifted by 1 row: per column, B[0..2] are
// incremented by alphap[0] times A0[1..3] and B[3] by alphap[0] times the first
// float of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] += scale * top[ii+1];
		dst[3] += scale * bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
1075
1076
1077
1078 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled accumulate, 4 rows, source shifted by 2 rows: per column, B[0..1] are
// incremented by alphap[0] times A0[2..3] and B[2..3] by alphap[0] times the
// first two floats of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] += scale * top[2];
		dst[1] += scale * top[3];
		dst[2] += scale * bot[0];
		dst[3] += scale * bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
1108
1109
1110
1111 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled accumulate, 4 rows, source shifted by 3 rows: per column, B[0] is
// incremented by alphap[0] times A0[3] and B[1..3] by alphap[0] times the first
// three floats of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] += scale * top[3];
		for(ii=0; ii<3; ii++)
			dst[ii+1] += scale * bot[ii];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
1141
1142
1143
1144 // both A and B are aligned to 64-bit boundaries
// Scaled accumulate, 3 rows, no row offset: per column (4 floats apart),
// B[i] += alphap[0] * A[i] for i = 0..2; the 4th float of each column of B is
// left untouched. No-op when kmax <= 0.
void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left, ii;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		for(ii=0; ii<3; ii++)
			dst[ii] += scale * src[ii];

		src += 4;
		dst += 4;
	}
}
1170
1171
1172
1173 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled accumulate, 3 rows, source shifted by 2 rows: per column, B[0..1] are
// incremented by alphap[0] times A0[2..3] and B[2] by alphap[0] times the first
// float of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] += scale * top[2];
		dst[1] += scale * top[3];
		dst[2] += scale * bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
1202
1203
1204
1205 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled accumulate, 3 rows, source shifted by 3 rows: per column, B[0] is
// incremented by alphap[0] times A0[3] and B[1..2] by alphap[0] times the first
// two floats of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] += scale * top[3];
		dst[1] += scale * bot[0];
		dst[2] += scale * bot[1];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
1234
1235
1236
1237 // both A and B are aligned to 64-bit boundaries
// Scaled accumulate, 2 rows, no row offset: per column (4 floats apart),
// B[0..1] += alphap[0] * A[0..1]; the other two floats of each column of B are
// left untouched. No-op when kmax <= 0.
void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		dst[0] += scale * src[0];
		dst[1] += scale * src[1];

		src += 4;
		dst += 4;
	}
}
1262
1263
1264
1265 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Scaled accumulate, 2 rows, source shifted by 3 rows: per column, B[0] is
// incremented by alphap[0] times A0[3] and B[1] by alphap[0] times the first
// float of the second source block (A0 + 4*sda). No-op when kmax <= 0.
void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	const float *top; // walks the block at A0
	const float *bot; // walks the block at A0 + 4*sda
	float *dst;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	top = A0;
	bot = A0 + 4*sda;
	dst = B;

	for(left=kmax; left>0; left--)
	{
		dst[0] += scale * top[3];
		dst[1] += scale * bot[0];

		top += 4;
		bot += 4;
		dst += 4;
	}
}
1293
1294
1295
// both A and B are aligned to 64-bit boundaries
// Scaled accumulate, 1 row: for each of kmax positions spaced 4 floats apart,
// B[0] += alphap[0] * A[0]. No-op when kmax <= 0.
void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	const float *src = A;
	float *dst = B;
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
	{
		*dst += scale * *src;

		src += 4;
		dst += 4;
	}
}
1320
1321
1322
1323
1324
1325