1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38
39
40 /*
41 * Copy only
42 */
43
44
45
46
47
48 // both A and B are aligned to 256-bit boundaries
kernel_dgecp_4_0_lib4(int tri,int kmax,double * A,double * B)49 void kernel_dgecp_4_0_lib4(int tri, int kmax, double *A, double *B)
50 {
51
52 if(tri==1)
53 {
54 // A and C are lower triangular
55 // kmax+1 4-wide + end 3x3 triangle
56
57 kmax += 1;
58 }
59
60 if(kmax<=0)
61 return;
62
63 const int bs = 4;
64
65 int k;
66
67 for(k=0; k<kmax-3; k+=4)
68 {
69 B[0+bs*0] = A[0+bs*0];
70 B[1+bs*0] = A[1+bs*0];
71 B[2+bs*0] = A[2+bs*0];
72 B[3+bs*0] = A[3+bs*0];
73
74 B[0+bs*1] = A[0+bs*1];
75 B[1+bs*1] = A[1+bs*1];
76 B[2+bs*1] = A[2+bs*1];
77 B[3+bs*1] = A[3+bs*1];
78
79 B[0+bs*2] = A[0+bs*2];
80 B[1+bs*2] = A[1+bs*2];
81 B[2+bs*2] = A[2+bs*2];
82 B[3+bs*2] = A[3+bs*2];
83
84 B[0+bs*3] = A[0+bs*3];
85 B[1+bs*3] = A[1+bs*3];
86 B[2+bs*3] = A[2+bs*3];
87 B[3+bs*3] = A[3+bs*3];
88
89 A += 16;
90 B += 16;
91
92 }
93 for(; k<kmax; k++)
94 {
95
96 B[0+bs*0] = A[0+bs*0];
97 B[1+bs*0] = A[1+bs*0];
98 B[2+bs*0] = A[2+bs*0];
99 B[3+bs*0] = A[3+bs*0];
100
101 A += 4;
102 B += 4;
103
104 }
105
106 if(tri==1)
107 {
108 // 3x3 triangle
109
110 B[1+bs*0] = A[1+bs*0];
111 B[2+bs*0] = A[2+bs*0];
112 B[3+bs*0] = A[3+bs*0];
113
114 B[2+bs*1] = A[2+bs*1];
115 B[3+bs*1] = A[3+bs*1];
116
117 B[3+bs*2] = A[3+bs*2];
118
119 }
120
121 }
122
123
124
125
126
127
128 /*
129 * Copy and Scale
130 *
131 * Used by: dge dtr
132 */
133
134
135
136
137
138 // both.o A and B are aligned to 256-bit boundaries
kernel_dgecpsc_4_0_lib4(int tri,int kmax,double alpha,double * A,double * B)139 void kernel_dgecpsc_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
140 {
141
142 if(tri==1)
143 {
144 // A and C are lower triangular
145 // kmax+1 4-wide + end 3x3 triangle
146
147 kmax += 1;
148 }
149
150 if(kmax<=0)
151 return;
152
153 const int bs = 4;
154
155 int k;
156
157 for(k=0; k<kmax-3; k+=4)
158 {
159 B[0+bs*0] = alpha*A[0+bs*0];
160 B[1+bs*0] = alpha*A[1+bs*0];
161 B[2+bs*0] = alpha*A[2+bs*0];
162 B[3+bs*0] = alpha*A[3+bs*0];
163
164 B[0+bs*1] = alpha*A[0+bs*1];
165 B[1+bs*1] = alpha*A[1+bs*1];
166 B[2+bs*1] = alpha*A[2+bs*1];
167 B[3+bs*1] = alpha*A[3+bs*1];
168
169 B[0+bs*2] = alpha*A[0+bs*2];
170 B[1+bs*2] = alpha*A[1+bs*2];
171 B[2+bs*2] = alpha*A[2+bs*2];
172 B[3+bs*2] = alpha*A[3+bs*2];
173
174 B[0+bs*3] = alpha*A[0+bs*3];
175 B[1+bs*3] = alpha*A[1+bs*3];
176 B[2+bs*3] = alpha*A[2+bs*3];
177 B[3+bs*3] = alpha*A[3+bs*3];
178
179 A += 16;
180 B += 16;
181
182 }
183 for(; k<kmax; k++)
184 {
185
186 B[0+bs*0] = alpha*A[0+bs*0];
187 B[1+bs*0] = alpha*A[1+bs*0];
188 B[2+bs*0] = alpha*A[2+bs*0];
189 B[3+bs*0] = alpha*A[3+bs*0];
190
191 A += 4;
192 B += 4;
193
194 }
195
196 if(tri==1)
197 {
198 // 3x3 triangle
199
200 B[1+bs*0] = alpha*A[1+bs*0];
201 B[2+bs*0] = alpha*A[2+bs*0];
202 B[3+bs*0] = alpha*A[3+bs*0];
203
204 B[2+bs*1] = alpha*A[2+bs*1];
205 B[3+bs*1] = alpha*A[3+bs*1];
206
207 B[3+bs*2] = alpha*A[3+bs*2];
208
209 }
210
211 }
212
213
214
215 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
kernel_dgecpsc_4_1_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)216 void kernel_dgecpsc_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
217 {
218
219 if(tri==1)
220 {
221 // A and C are lower triangular
222 // kmax+1 4-wide + end 3x3 triangle
223
224 kmax += 1;
225 }
226
227 if(kmax<=0)
228 return;
229
230 const int bs = 4;
231
232 double *A1 = A0 + bs*sda;
233
234 int k;
235
236 for(k=0; k<kmax-3; k+=4)
237 {
238
239 B[0+bs*0] = alpha*A0[1+bs*0];
240 B[1+bs*0] = alpha*A0[2+bs*0];
241 B[2+bs*0] = alpha*A0[3+bs*0];
242 B[3+bs*0] = alpha*A1[0+bs*0];
243
244 B[0+bs*1] = alpha*A0[1+bs*1];
245 B[1+bs*1] = alpha*A0[2+bs*1];
246 B[2+bs*1] = alpha*A0[3+bs*1];
247 B[3+bs*1] = alpha*A1[0+bs*1];
248
249 B[0+bs*2] = alpha*A0[1+bs*2];
250 B[1+bs*2] = alpha*A0[2+bs*2];
251 B[2+bs*2] = alpha*A0[3+bs*2];
252 B[3+bs*2] = alpha*A1[0+bs*2];
253
254 B[0+bs*3] = alpha*A0[1+bs*3];
255 B[1+bs*3] = alpha*A0[2+bs*3];
256 B[2+bs*3] = alpha*A0[3+bs*3];
257 B[3+bs*3] = alpha*A1[0+bs*3];
258
259 A0 += 16;
260 A1 += 16;
261 B += 16;
262
263 }
264 for(; k<kmax; k++)
265 {
266
267 B[0+bs*0] = alpha*A0[1+bs*0];
268 B[1+bs*0] = alpha*A0[2+bs*0];
269 B[2+bs*0] = alpha*A0[3+bs*0];
270 B[3+bs*0] = alpha*A1[0+bs*0];
271
272 A0 += 4;
273 A1 += 4;
274 B += 4;
275
276 }
277
278 if(tri==1)
279 {
280 // 3x3 triangle
281
282 B[1+0*bs] = alpha*A0[2+0*bs];
283 B[2+0*bs] = alpha*A0[3+0*bs];
284 B[3+0*bs] = alpha*A1[0+0*bs];
285
286 B[2+1*bs] = alpha*A0[3+1*bs];
287 B[3+1*bs] = alpha*A1[0+1*bs];
288
289 B[3+2*bs] = alpha*A1[0+2*bs];
290
291 }
292
293 }
294
295
296
297 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgecpsc_4_2_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)298 void kernel_dgecpsc_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
299 {
300
301 if(tri==1)
302 {
303 // A and C are lower triangular
304 // kmax+1 4-wide + end 3x3 triangle
305
306 kmax += 1;
307 }
308
309 if(kmax<=0)
310 return;
311
312 const int bs = 4;
313
314 double *A1 = A0 + bs*sda;
315
316 int k;
317
318 for(k=0; k<kmax-3; k+=4)
319 {
320
321 B[0+bs*0] = alpha*A0[2+bs*0];
322 B[1+bs*0] = alpha*A0[3+bs*0];
323 B[2+bs*0] = alpha*A1[0+bs*0];
324 B[3+bs*0] = alpha*A1[1+bs*0];
325
326 B[0+bs*1] = alpha*A0[2+bs*1];
327 B[1+bs*1] = alpha*A0[3+bs*1];
328 B[2+bs*1] = alpha*A1[0+bs*1];
329 B[3+bs*1] = alpha*A1[1+bs*1];
330
331 B[0+bs*2] = alpha*A0[2+bs*2];
332 B[1+bs*2] = alpha*A0[3+bs*2];
333 B[2+bs*2] = alpha*A1[0+bs*2];
334 B[3+bs*2] = alpha*A1[1+bs*2];
335
336 B[0+bs*3] = alpha*A0[2+bs*3];
337 B[1+bs*3] = alpha*A0[3+bs*3];
338 B[2+bs*3] = alpha*A1[0+bs*3];
339 B[3+bs*3] = alpha*A1[1+bs*3];
340
341 A0 += 16;
342 A1 += 16;
343 B += 16;
344
345 }
346 for(; k<kmax; k++)
347 {
348
349 B[0+bs*0] = alpha*A0[2+bs*0];
350 B[1+bs*0] = alpha*A0[3+bs*0];
351 B[2+bs*0] = alpha*A1[0+bs*0];
352 B[3+bs*0] = alpha*A1[1+bs*0];
353
354 A0 += 4;
355 A1 += 4;
356 B += 4;
357
358 }
359
360 if(tri==1)
361 {
362 // 3x3 triangle}
363
364 B[1+bs*0] = alpha*A0[3+bs*0];
365 B[2+bs*0] = alpha*A1[0+bs*0];
366 B[3+bs*0] = alpha*A1[1+bs*0];
367
368 B[2+bs*1] = alpha*A1[0+bs*1];
369 B[3+bs*1] = alpha*A1[1+bs*1];
370
371 B[3+bs*2] = alpha*A1[1+bs*2];
372
373 }
374
375 }
376
377
378
379 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgecpsc_4_3_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)380 void kernel_dgecpsc_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
381 {
382
383 if(tri==1)
384 {
385 // A and C are lower triangular
386 // kmax+1 4-wide + end 3x3 triangle
387
388 kmax += 1;
389 }
390
391 if(kmax<=0)
392 return;
393
394 const int bs = 4;
395
396 double *A1 = A0 + bs*sda;
397
398 int k;
399
400 for(k=0; k<kmax-3; k+=4)
401 {
402
403 B[0+bs*0] = alpha*A0[3+bs*0];
404 B[1+bs*0] = alpha*A1[0+bs*0];
405 B[2+bs*0] = alpha*A1[1+bs*0];
406 B[3+bs*0] = alpha*A1[2+bs*0];
407
408 B[0+bs*1] = alpha*A0[3+bs*1];
409 B[1+bs*1] = alpha*A1[0+bs*1];
410 B[2+bs*1] = alpha*A1[1+bs*1];
411 B[3+bs*1] = alpha*A1[2+bs*1];
412
413 B[0+bs*2] = alpha*A0[3+bs*2];
414 B[1+bs*2] = alpha*A1[0+bs*2];
415 B[2+bs*2] = alpha*A1[1+bs*2];
416 B[3+bs*2] = alpha*A1[2+bs*2];
417
418 B[0+bs*3] = alpha*A0[3+bs*3];
419 B[1+bs*3] = alpha*A1[0+bs*3];
420 B[2+bs*3] = alpha*A1[1+bs*3];
421 B[3+bs*3] = alpha*A1[2+bs*3];
422
423 A0 += 16;
424 A1 += 16;
425 B += 16;
426
427 }
428 for(; k<kmax; k++)
429 {
430
431 B[0+bs*0] = alpha*A0[3+bs*0];
432 B[1+bs*0] = alpha*A1[0+bs*0];
433 B[2+bs*0] = alpha*A1[1+bs*0];
434 B[3+bs*0] = alpha*A1[2+bs*0];
435
436 A0 += 4;
437 A1 += 4;
438 B += 4;
439
440 }
441
442 if(tri==1)
443 {
444 // 3x3 triangle
445
446 B[1+bs*0] = alpha*A1[0+bs*0];
447 B[2+bs*0] = alpha*A1[1+bs*0];
448 B[3+bs*0] = alpha*A1[2+bs*0];
449
450 B[2+bs*1] = alpha*A1[1+bs*1];
451 B[3+bs*1] = alpha*A1[2+bs*1];
452
453 B[3+bs*2] = alpha*A1[2+bs*2];
454
455 }
456
457 }
458
459
460
461 // both A and B are aligned to 64-bit boundaries
kernel_dgecpsc_3_0_lib4(int tri,int kmax,double alpha,double * A,double * B)462 void kernel_dgecpsc_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
463 {
464
465 if(tri==1)
466 {
467 // A and C are lower triangular
468 // kmax+1 3-wide + end 2x2 triangle
469
470 kmax += 1;
471 }
472
473 if(kmax<=0)
474 return;
475
476 const int bs = 4;
477
478 int k;
479
480 for(k=0; k<kmax-3; k+=4)
481 {
482 B[0+bs*0] = alpha*A[0+bs*0];
483 B[1+bs*0] = alpha*A[1+bs*0];
484 B[2+bs*0] = alpha*A[2+bs*0];
485
486 B[0+bs*1] = alpha*A[0+bs*1];
487 B[1+bs*1] = alpha*A[1+bs*1];
488 B[2+bs*1] = alpha*A[2+bs*1];
489
490 B[0+bs*2] = alpha*A[0+bs*2];
491 B[1+bs*2] = alpha*A[1+bs*2];
492 B[2+bs*2] = alpha*A[2+bs*2];
493
494 B[0+bs*3] = alpha*A[0+bs*3];
495 B[1+bs*3] = alpha*A[1+bs*3];
496 B[2+bs*3] = alpha*A[2+bs*3];
497
498 A += 16;
499 B += 16;
500
501 }
502 for(; k<kmax; k++)
503 {
504
505 B[0+bs*0] = alpha*A[0+bs*0];
506 B[1+bs*0] = alpha*A[1+bs*0];
507 B[2+bs*0] = alpha*A[2+bs*0];
508
509 A += 4;
510 B += 4;
511
512 }
513
514 if(tri==1)
515 {
516 // 2x2 triangle
517
518 B[1+bs*0] = alpha*A[1+bs*0];
519 B[2+bs*0] = alpha*A[2+bs*0];
520
521 B[2+bs*1] = alpha*A[2+bs*1];
522
523 }
524
525 }
526
527
528 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgecpsc_3_2_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)529 void kernel_dgecpsc_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
530 {
531
532 if(tri==1)
533 {
534 // A and C are lower triangular
535 // kmax+1 3-wide + end 2x2 triangle
536
537 kmax += 1;
538 }
539
540 if(kmax<=0)
541 return;
542
543 const int bs = 4;
544
545 double *A1 = A0 + bs*sda;
546
547 int k;
548
549 for(k=0; k<kmax-3; k+=4)
550 {
551
552 B[0+bs*0] = alpha*A0[2+bs*0];
553 B[1+bs*0] = alpha*A0[3+bs*0];
554 B[2+bs*0] = alpha*A1[0+bs*0];
555
556 B[0+bs*1] = alpha*A0[2+bs*1];
557 B[1+bs*1] = alpha*A0[3+bs*1];
558 B[2+bs*1] = alpha*A1[0+bs*1];
559
560 B[0+bs*2] = alpha*A0[2+bs*2];
561 B[1+bs*2] = alpha*A0[3+bs*2];
562 B[2+bs*2] = alpha*A1[0+bs*2];
563
564 B[0+bs*3] = alpha*A0[2+bs*3];
565 B[1+bs*3] = alpha*A0[3+bs*3];
566 B[2+bs*3] = alpha*A1[0+bs*3];
567
568 A0 += 16;
569 A1 += 16;
570 B += 16;
571
572 }
573 for(; k<kmax; k++)
574 {
575
576 B[0+bs*0] = alpha*A0[2+bs*0];
577 B[1+bs*0] = alpha*A0[3+bs*0];
578 B[2+bs*0] = alpha*A1[0+bs*0];
579
580 A0 += 4;
581 A1 += 4;
582 B += 4;
583
584 }
585
586 if(tri==1)
587 {
588 // 2x2 triangle
589
590 B[1+bs*0] = alpha*A0[3+bs*0];
591 B[2+bs*0] = alpha*A1[0+bs*0];
592
593 B[2+bs*1] = alpha*A1[0+bs*1];
594
595 }
596
597 }
598
599
600
601 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgecpsc_3_3_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)602 void kernel_dgecpsc_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
603 {
604
605 if(tri==1)
606 {
607 // A and C are lower triangular
608 // kmax+1 3-wide + end 2x2 triangle
609
610 kmax += 1;
611 }
612
613 if(kmax<=0)
614 return;
615
616 const int bs = 4;
617
618 double *A1 = A0 + bs*sda;
619
620 int k;
621
622 for(k=0; k<kmax-3; k+=4)
623 {
624
625 B[0+bs*0] = alpha*A0[3+bs*0];
626 B[1+bs*0] = alpha*A1[0+bs*0];
627 B[2+bs*0] = alpha*A1[1+bs*0];
628
629 B[0+bs*1] = alpha*A0[3+bs*1];
630 B[1+bs*1] = alpha*A1[0+bs*1];
631 B[2+bs*1] = alpha*A1[1+bs*1];
632
633 B[0+bs*2] = alpha*A0[3+bs*2];
634 B[1+bs*2] = alpha*A1[0+bs*2];
635 B[2+bs*2] = alpha*A1[1+bs*2];
636
637 B[0+bs*3] = alpha*A0[3+bs*3];
638 B[1+bs*3] = alpha*A1[0+bs*3];
639 B[2+bs*3] = alpha*A1[1+bs*3];
640
641 A0 += 16;
642 A1 += 16;
643 B += 16;
644
645 }
646 for(; k<kmax; k++)
647 {
648
649 B[0+bs*0] = alpha*A0[3+bs*0];
650 B[1+bs*0] = alpha*A1[0+bs*0];
651 B[2+bs*0] = alpha*A1[1+bs*0];
652
653 A0 += 4;
654 A1 += 4;
655 B += 4;
656
657 }
658
659 if(tri==1)
660 {
661 // 2x2 triangle
662
663 B[1+bs*0] = alpha*A1[0+bs*0];
664 B[2+bs*0] = alpha*A1[1+bs*0];
665
666 B[2+bs*1] = alpha*A1[1+bs*1];
667
668 }
669
670 }
671
672
673
674 // both A and B are aligned to 64-bit boundaries
kernel_dgecpsc_2_0_lib4(int tri,int kmax,double alpha,double * A,double * B)675 void kernel_dgecpsc_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
676 {
677
678 if(tri==1)
679 {
680 // A and C are lower triangular
681 // kmax+1 2-wide + end 1x1 triangle
682
683 kmax += 1;
684 }
685
686 if(kmax<=0)
687 return;
688
689 const int bs = 4;
690
691 int k;
692
693 for(k=0; k<kmax-3; k+=4)
694 {
695 B[0+bs*0] = alpha*A[0+bs*0];
696 B[1+bs*0] = alpha*A[1+bs*0];
697
698 B[0+bs*1] = alpha*A[0+bs*1];
699 B[1+bs*1] = alpha*A[1+bs*1];
700
701 B[0+bs*2] = alpha*A[0+bs*2];
702 B[1+bs*2] = alpha*A[1+bs*2];
703
704 B[0+bs*3] = alpha*A[0+bs*3];
705 B[1+bs*3] = alpha*A[1+bs*3];
706
707 A += 16;
708 B += 16;
709
710 }
711 for(; k<kmax; k++)
712 {
713
714 B[0+bs*0] = alpha*A[0+bs*0];
715 B[1+bs*0] = alpha*A[1+bs*0];
716
717 A += 4;
718 B += 4;
719
720 }
721
722 if(tri==1)
723 {
724 // 1x1 triangle
725
726 B[1+bs*0] = alpha*A[1+bs*0];
727
728 }
729
730 }
731
732
733
734 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
kernel_dgecpsc_2_3_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)735 void kernel_dgecpsc_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
736 {
737
738 if(tri==1)
739 {
740 // A and C are lower triangular
741 // kmax+1 2-wide + end 1x1 triangle
742
743 kmax += 1;
744 }
745
746 if(kmax<=0)
747 return;
748
749 const int bs = 4;
750
751 double *A1 = A0 + bs*sda;
752
753 int k;
754
755 for(k=0; k<kmax-3; k+=4)
756 {
757
758 B[0+bs*0] = alpha*A0[3+bs*0];
759 B[1+bs*0] = alpha*A1[0+bs*0];
760
761 B[0+bs*1] = alpha*A0[3+bs*1];
762 B[1+bs*1] = alpha*A1[0+bs*1];
763
764 B[0+bs*2] = alpha*A0[3+bs*2];
765 B[1+bs*2] = alpha*A1[0+bs*2];
766
767 B[0+bs*3] = alpha*A0[3+bs*3];
768 B[1+bs*3] = alpha*A1[0+bs*3];
769
770 A0 += 16;
771 A1 += 16;
772 B += 16;
773
774 }
775 for(; k<kmax; k++)
776 {
777
778 B[0+bs*0] = alpha*A0[3+bs*0];
779 B[1+bs*0] = alpha*A1[0+bs*0];
780
781 A0 += 4;
782 A1 += 4;
783 B += 4;
784
785 }
786
787 if(tri==1)
788 {
789 // 1x1 triangle
790
791 B[1+bs*0] = alpha*A1[0+bs*0];
792
793 }
794
795 }
796
797
798
799 // both A and B are aligned 64-bit boundaries
kernel_dgecpsc_1_0_lib4(int tri,int kmax,double alpha,double * A,double * B)800 void kernel_dgecpsc_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
801 {
802
803 if(tri==1)
804 {
805 // A and C are lower triangular
806 // kmax+1 1-wide
807
808 kmax += 1;
809 }
810
811 if(kmax<=0)
812 return;
813
814 const int bs = 4;
815
816 int k;
817
818 for(k=0; k<kmax-3; k+=4)
819 {
820 B[0+bs*0] = alpha*A[0+bs*0];
821 B[0+bs*1] = alpha*A[0+bs*1];
822 B[0+bs*2] = alpha*A[0+bs*2];
823 B[0+bs*3] = alpha*A[0+bs*3];
824
825 A += 16;
826 B += 16;
827
828 }
829 for(; k<kmax; k++)
830 {
831
832 B[0+bs*0] = alpha*A[0+bs*0];
833
834 A += 4;
835 B += 4;
836
837 }
838
839 }
840
841
842
843
844
845 /*
846 * Add and scale
847 *
848 */
849
850
851
852
853 // both A and B are aligned to 256-bit boundaries
kernel_dgead_4_0_lib4(int kmax,double alpha,double * A,double * B)854 void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
855 {
856
857 if(kmax<=0)
858 return;
859
860 const int bs = 4;
861
862 int k;
863
864 for(k=0; k<kmax-3; k+=4)
865 {
866 B[0+bs*0] += alpha * A[0+bs*0];
867 B[1+bs*0] += alpha * A[1+bs*0];
868 B[2+bs*0] += alpha * A[2+bs*0];
869 B[3+bs*0] += alpha * A[3+bs*0];
870
871 B[0+bs*1] += alpha * A[0+bs*1];
872 B[1+bs*1] += alpha * A[1+bs*1];
873 B[2+bs*1] += alpha * A[2+bs*1];
874 B[3+bs*1] += alpha * A[3+bs*1];
875
876 B[0+bs*2] += alpha * A[0+bs*2];
877 B[1+bs*2] += alpha * A[1+bs*2];
878 B[2+bs*2] += alpha * A[2+bs*2];
879 B[3+bs*2] += alpha * A[3+bs*2];
880
881 B[0+bs*3] += alpha * A[0+bs*3];
882 B[1+bs*3] += alpha * A[1+bs*3];
883 B[2+bs*3] += alpha * A[2+bs*3];
884 B[3+bs*3] += alpha * A[3+bs*3];
885
886 A += 16;
887 B += 16;
888
889 }
890 for(; k<kmax; k++)
891 {
892
893 B[0+bs*0] += alpha * A[0+bs*0];
894 B[1+bs*0] += alpha * A[1+bs*0];
895 B[2+bs*0] += alpha * A[2+bs*0];
896 B[3+bs*0] += alpha * A[3+bs*0];
897
898 A += 4;
899 B += 4;
900
901 }
902
903 }
904
905
906
907 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
kernel_dgead_4_1_lib4(int kmax,double alpha,double * A0,int sda,double * B)908 void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
909 {
910
911 if(kmax<=0)
912 return;
913
914 const int bs = 4;
915
916 double *A1 = A0 + bs*sda;
917
918 int k;
919
920 for(k=0; k<kmax-3; k+=4)
921 {
922
923 B[0+bs*0] += alpha * A0[1+bs*0];
924 B[1+bs*0] += alpha * A0[2+bs*0];
925 B[2+bs*0] += alpha * A0[3+bs*0];
926 B[3+bs*0] += alpha * A1[0+bs*0];
927
928 B[0+bs*1] += alpha * A0[1+bs*1];
929 B[1+bs*1] += alpha * A0[2+bs*1];
930 B[2+bs*1] += alpha * A0[3+bs*1];
931 B[3+bs*1] += alpha * A1[0+bs*1];
932
933 B[0+bs*2] += alpha * A0[1+bs*2];
934 B[1+bs*2] += alpha * A0[2+bs*2];
935 B[2+bs*2] += alpha * A0[3+bs*2];
936 B[3+bs*2] += alpha * A1[0+bs*2];
937
938 B[0+bs*3] += alpha * A0[1+bs*3];
939 B[1+bs*3] += alpha * A0[2+bs*3];
940 B[2+bs*3] += alpha * A0[3+bs*3];
941 B[3+bs*3] += alpha * A1[0+bs*3];
942
943 A0 += 16;
944 A1 += 16;
945 B += 16;
946
947 }
948 for(; k<kmax; k++)
949 {
950
951 B[0+bs*0] += alpha * A0[1+bs*0];
952 B[1+bs*0] += alpha * A0[2+bs*0];
953 B[2+bs*0] += alpha * A0[3+bs*0];
954 B[3+bs*0] += alpha * A1[0+bs*0];
955
956 A0 += 4;
957 A1 += 4;
958 B += 4;
959
960 }
961
962 }
963
964
965
966 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgead_4_2_lib4(int kmax,double alpha,double * A0,int sda,double * B)967 void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
968 {
969
970 if(kmax<=0)
971 return;
972
973 const int bs = 4;
974
975 double *A1 = A0 + bs*sda;
976
977 int k;
978
979 for(k=0; k<kmax-3; k+=4)
980 {
981
982 B[0+bs*0] += alpha * A0[2+bs*0];
983 B[1+bs*0] += alpha * A0[3+bs*0];
984 B[2+bs*0] += alpha * A1[0+bs*0];
985 B[3+bs*0] += alpha * A1[1+bs*0];
986
987 B[0+bs*1] += alpha * A0[2+bs*1];
988 B[1+bs*1] += alpha * A0[3+bs*1];
989 B[2+bs*1] += alpha * A1[0+bs*1];
990 B[3+bs*1] += alpha * A1[1+bs*1];
991
992 B[0+bs*2] += alpha * A0[2+bs*2];
993 B[1+bs*2] += alpha * A0[3+bs*2];
994 B[2+bs*2] += alpha * A1[0+bs*2];
995 B[3+bs*2] += alpha * A1[1+bs*2];
996
997 B[0+bs*3] += alpha * A0[2+bs*3];
998 B[1+bs*3] += alpha * A0[3+bs*3];
999 B[2+bs*3] += alpha * A1[0+bs*3];
1000 B[3+bs*3] += alpha * A1[1+bs*3];
1001
1002 A0 += 16;
1003 A1 += 16;
1004 B += 16;
1005
1006 }
1007 for(; k<kmax; k++)
1008 {
1009
1010 B[0+bs*0] += alpha * A0[2+bs*0];
1011 B[1+bs*0] += alpha * A0[3+bs*0];
1012 B[2+bs*0] += alpha * A1[0+bs*0];
1013 B[3+bs*0] += alpha * A1[1+bs*0];
1014
1015 A0 += 4;
1016 A1 += 4;
1017 B += 4;
1018
1019 }
1020
1021 }
1022
1023
1024
1025 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgead_4_3_lib4(int kmax,double alpha,double * A0,int sda,double * B)1026 void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1027 {
1028
1029 if(kmax<=0)
1030 return;
1031
1032 const int bs = 4;
1033
1034 double *A1 = A0 + bs*sda;
1035
1036 int k;
1037
1038 for(k=0; k<kmax-3; k+=4)
1039 {
1040
1041 B[0+bs*0] += alpha * A0[3+bs*0];
1042 B[1+bs*0] += alpha * A1[0+bs*0];
1043 B[2+bs*0] += alpha * A1[1+bs*0];
1044 B[3+bs*0] += alpha * A1[2+bs*0];
1045
1046 B[0+bs*1] += alpha * A0[3+bs*1];
1047 B[1+bs*1] += alpha * A1[0+bs*1];
1048 B[2+bs*1] += alpha * A1[1+bs*1];
1049 B[3+bs*1] += alpha * A1[2+bs*1];
1050
1051 B[0+bs*2] += alpha * A0[3+bs*2];
1052 B[1+bs*2] += alpha * A1[0+bs*2];
1053 B[2+bs*2] += alpha * A1[1+bs*2];
1054 B[3+bs*2] += alpha * A1[2+bs*2];
1055
1056 B[0+bs*3] += alpha * A0[3+bs*3];
1057 B[1+bs*3] += alpha * A1[0+bs*3];
1058 B[2+bs*3] += alpha * A1[1+bs*3];
1059 B[3+bs*3] += alpha * A1[2+bs*3];
1060
1061 A0 += 16;
1062 A1 += 16;
1063 B += 16;
1064
1065 }
1066 for(; k<kmax; k++)
1067 {
1068
1069 B[0+bs*0] += alpha * A0[3+bs*0];
1070 B[1+bs*0] += alpha * A1[0+bs*0];
1071 B[2+bs*0] += alpha * A1[1+bs*0];
1072 B[3+bs*0] += alpha * A1[2+bs*0];
1073
1074 A0 += 4;
1075 A1 += 4;
1076 B += 4;
1077
1078 }
1079
1080 }
1081
1082
1083
1084 // both A and B are aligned to 64-bit boundaries
kernel_dgead_3_0_lib4(int kmax,double alpha,double * A,double * B)1085 void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
1086 {
1087
1088 if(kmax<=0)
1089 return;
1090
1091 const int bs = 4;
1092
1093 int k;
1094
1095 for(k=0; k<kmax-3; k+=4)
1096 {
1097 B[0+bs*0] += alpha * A[0+bs*0];
1098 B[1+bs*0] += alpha * A[1+bs*0];
1099 B[2+bs*0] += alpha * A[2+bs*0];
1100
1101 B[0+bs*1] += alpha * A[0+bs*1];
1102 B[1+bs*1] += alpha * A[1+bs*1];
1103 B[2+bs*1] += alpha * A[2+bs*1];
1104
1105 B[0+bs*2] += alpha * A[0+bs*2];
1106 B[1+bs*2] += alpha * A[1+bs*2];
1107 B[2+bs*2] += alpha * A[2+bs*2];
1108
1109 B[0+bs*3] += alpha * A[0+bs*3];
1110 B[1+bs*3] += alpha * A[1+bs*3];
1111 B[2+bs*3] += alpha * A[2+bs*3];
1112
1113 A += 16;
1114 B += 16;
1115
1116 }
1117 for(; k<kmax; k++)
1118 {
1119
1120 B[0+bs*0] += alpha * A[0+bs*0];
1121 B[1+bs*0] += alpha * A[1+bs*0];
1122 B[2+bs*0] += alpha * A[2+bs*0];
1123
1124 A += 4;
1125 B += 4;
1126
1127 }
1128
1129 }
1130
1131
1132
1133 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgead_3_2_lib4(int kmax,double alpha,double * A0,int sda,double * B)1134 void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1135 {
1136
1137 if(kmax<=0)
1138 return;
1139
1140 const int bs = 4;
1141
1142 double *A1 = A0 + bs*sda;
1143
1144 int k;
1145
1146 for(k=0; k<kmax-3; k+=4)
1147 {
1148
1149 B[0+bs*0] += alpha * A0[2+bs*0];
1150 B[1+bs*0] += alpha * A0[3+bs*0];
1151 B[2+bs*0] += alpha * A1[0+bs*0];
1152
1153 B[0+bs*1] += alpha * A0[2+bs*1];
1154 B[1+bs*1] += alpha * A0[3+bs*1];
1155 B[2+bs*1] += alpha * A1[0+bs*1];
1156
1157 B[0+bs*2] += alpha * A0[2+bs*2];
1158 B[1+bs*2] += alpha * A0[3+bs*2];
1159 B[2+bs*2] += alpha * A1[0+bs*2];
1160
1161 B[0+bs*3] += alpha * A0[2+bs*3];
1162 B[1+bs*3] += alpha * A0[3+bs*3];
1163 B[2+bs*3] += alpha * A1[0+bs*3];
1164
1165 A0 += 16;
1166 A1 += 16;
1167 B += 16;
1168
1169 }
1170 for(; k<kmax; k++)
1171 {
1172
1173 B[0+bs*0] += alpha * A0[2+bs*0];
1174 B[1+bs*0] += alpha * A0[3+bs*0];
1175 B[2+bs*0] += alpha * A1[0+bs*0];
1176
1177 A0 += 4;
1178 A1 += 4;
1179 B += 4;
1180
1181 }
1182
1183 }
1184
1185
1186
1187 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgead_3_3_lib4(int kmax,double alpha,double * A0,int sda,double * B)1188 void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1189 {
1190
1191 if(kmax<=0)
1192 return;
1193
1194 const int bs = 4;
1195
1196 double *A1 = A0 + bs*sda;
1197
1198 int k;
1199
1200 for(k=0; k<kmax-3; k+=4)
1201 {
1202
1203 B[0+bs*0] += alpha * A0[3+bs*0];
1204 B[1+bs*0] += alpha * A1[0+bs*0];
1205 B[2+bs*0] += alpha * A1[1+bs*0];
1206
1207 B[0+bs*1] += alpha * A0[3+bs*1];
1208 B[1+bs*1] += alpha * A1[0+bs*1];
1209 B[2+bs*1] += alpha * A1[1+bs*1];
1210
1211 B[0+bs*2] += alpha * A0[3+bs*2];
1212 B[1+bs*2] += alpha * A1[0+bs*2];
1213 B[2+bs*2] += alpha * A1[1+bs*2];
1214
1215 B[0+bs*3] += alpha * A0[3+bs*3];
1216 B[1+bs*3] += alpha * A1[0+bs*3];
1217 B[2+bs*3] += alpha * A1[1+bs*3];
1218
1219 A0 += 16;
1220 A1 += 16;
1221 B += 16;
1222
1223 }
1224 for(; k<kmax; k++)
1225 {
1226
1227 B[0+bs*0] += alpha * A0[3+bs*0];
1228 B[1+bs*0] += alpha * A1[0+bs*0];
1229 B[2+bs*0] += alpha * A1[1+bs*0];
1230
1231 A0 += 4;
1232 A1 += 4;
1233 B += 4;
1234
1235 }
1236
1237 }
1238
1239
1240
1241 // both A and B are aligned to 64-bit boundaries
kernel_dgead_2_0_lib4(int kmax,double alpha,double * A,double * B)1242 void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
1243 {
1244
1245 if(kmax<=0)
1246 return;
1247
1248 const int bs = 4;
1249
1250 int k;
1251
1252 for(k=0; k<kmax-3; k+=4)
1253 {
1254 B[0+bs*0] += alpha * A[0+bs*0];
1255 B[1+bs*0] += alpha * A[1+bs*0];
1256
1257 B[0+bs*1] += alpha * A[0+bs*1];
1258 B[1+bs*1] += alpha * A[1+bs*1];
1259
1260 B[0+bs*2] += alpha * A[0+bs*2];
1261 B[1+bs*2] += alpha * A[1+bs*2];
1262
1263 B[0+bs*3] += alpha * A[0+bs*3];
1264 B[1+bs*3] += alpha * A[1+bs*3];
1265
1266 A += 16;
1267 B += 16;
1268
1269 }
1270 for(; k<kmax; k++)
1271 {
1272
1273 B[0+bs*0] += alpha * A[0+bs*0];
1274 B[1+bs*0] += alpha * A[1+bs*0];
1275
1276 A += 4;
1277 B += 4;
1278
1279 }
1280
1281 }
1282
1283
1284
1285 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
kernel_dgead_2_3_lib4(int kmax,double alpha,double * A0,int sda,double * B)1286 void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1287 {
1288
1289 if(kmax<=0)
1290 return;
1291
1292 const int bs = 4;
1293
1294 double *A1 = A0 + bs*sda;
1295
1296 int k;
1297
1298 for(k=0; k<kmax-3; k+=4)
1299 {
1300
1301 B[0+bs*0] += alpha * A0[3+bs*0];
1302 B[1+bs*0] += alpha * A1[0+bs*0];
1303
1304 B[0+bs*1] += alpha * A0[3+bs*1];
1305 B[1+bs*1] += alpha * A1[0+bs*1];
1306
1307 B[0+bs*2] += alpha * A0[3+bs*2];
1308 B[1+bs*2] += alpha * A1[0+bs*2];
1309
1310 B[0+bs*3] += alpha * A0[3+bs*3];
1311 B[1+bs*3] += alpha * A1[0+bs*3];
1312
1313 A0 += 16;
1314 A1 += 16;
1315 B += 16;
1316
1317 }
1318 for(; k<kmax; k++)
1319 {
1320
1321 B[0+bs*0] += alpha * A0[3+bs*0];
1322 B[1+bs*0] += alpha * A1[0+bs*0];
1323
1324 A0 += 4;
1325 A1 += 4;
1326 B += 4;
1327
1328 }
1329
1330 }
1331
1332
1333
1334 // both A and B are aligned 64-bit boundaries
kernel_dgead_1_0_lib4(int kmax,double alpha,double * A,double * B)1335 void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
1336 {
1337
1338 if(kmax<=0)
1339 return;
1340
1341 const int bs = 4;
1342
1343 int k;
1344
1345 for(k=0; k<kmax-3; k+=4)
1346 {
1347 B[0+bs*0] += alpha * A[0+bs*0];
1348
1349 B[0+bs*1] += alpha * A[0+bs*1];
1350
1351 B[0+bs*2] += alpha * A[0+bs*2];
1352
1353 B[0+bs*3] += alpha * A[0+bs*3];
1354
1355 A += 16;
1356 B += 16;
1357
1358 }
1359 for(; k<kmax; k++)
1360 {
1361
1362 B[0+bs*0] += alpha * A[0+bs*0];
1363
1364 A += 4;
1365 B += 4;
1366
1367 }
1368
1369 }
1370