1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 #include "../../include/blasfeo_d_kernel.h"
39
40
41
42 #if defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_dpack_nn_12_vs_lib4(int kmax,double * A,int lda,double * C,int sdc,int m1)43 void kernel_dpack_nn_12_vs_lib4(int kmax, double *A, int lda, double *C, int sdc, int m1)
44 {
45
46 const int ps = 4;
47
48 kernel_dpack_nn_8_lib4(kmax, A+0, lda, C+0*sdc, sdc);
49 kernel_dpack_nn_4_vs_lib4(kmax, A+8, lda, C+8*sdc, m1-8);
50
51 return;
52
53 }
54 #endif
55
56
57
58 #if defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_dpack_nn_8_vs_lib4(int kmax,double * A,int lda,double * C,int sdc,int m1)59 void kernel_dpack_nn_8_vs_lib4(int kmax, double *A, int lda, double *C, int sdc, int m1)
60 {
61
62 kernel_dpack_nn_4_lib4(kmax, A+0, lda, C+0*sdc);
63 kernel_dpack_nn_4_vs_lib4(kmax, A+4, lda, C+4*sdc, m1-4);
64
65 return;
66
67 }
68 #endif
69
70
71
72 #if ! ( defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53) )
kernel_dpack_nn_4_lib4(int kmax,double * A,int lda,double * C)73 void kernel_dpack_nn_4_lib4(int kmax, double *A, int lda, double *C)
74 {
75
76 const int ps = 4;
77
78 int ii;
79 ii = 0;
80
81 for(; ii<kmax-3; ii+=4)
82 {
83 C[0+ps*0] = A[0+lda*0];
84 C[1+ps*0] = A[1+lda*0];
85 C[2+ps*0] = A[2+lda*0];
86 C[3+ps*0] = A[3+lda*0];
87
88 C[0+ps*1] = A[0+lda*1];
89 C[1+ps*1] = A[1+lda*1];
90 C[2+ps*1] = A[2+lda*1];
91 C[3+ps*1] = A[3+lda*1];
92
93 C[0+ps*2] = A[0+lda*2];
94 C[1+ps*2] = A[1+lda*2];
95 C[2+ps*2] = A[2+lda*2];
96 C[3+ps*2] = A[3+lda*2];
97
98 C[0+ps*3] = A[0+lda*3];
99 C[1+ps*3] = A[1+lda*3];
100 C[2+ps*3] = A[2+lda*3];
101 C[3+ps*3] = A[3+lda*3];
102
103 A += 4*lda;
104 C += 4*ps;
105 }
106 for(; ii<kmax; ii++)
107 {
108 C[0+ps*0] = A[0+lda*0];
109 C[1+ps*0] = A[1+lda*0];
110 C[2+ps*0] = A[2+lda*0];
111 C[3+ps*0] = A[3+lda*0];
112
113 A += 1*lda;
114 C += 1*ps;
115 }
116
117 return;
118
119 }
120 #endif
121
122
123
kernel_dpack_nn_4_vs_lib4(int kmax,double * A,int lda,double * C,int m1)124 void kernel_dpack_nn_4_vs_lib4(int kmax, double *A, int lda, double *C, int m1)
125 {
126
127 if(m1<=0)
128 return;
129
130 const int ps = 4;
131
132 int ii;
133 ii = 0;
134
135 if(m1>=4)
136 {
137 kernel_dpack_nn_4_lib4(kmax, A, lda, C);
138 return;
139 }
140 else if(m1==1)
141 {
142 goto l1;
143 }
144 else if(m1==2)
145 {
146 goto l2;
147 }
148 else //if(m1==3)
149 {
150 goto l3;
151 }
152 return;
153
154 l1:
155 ii = 0;
156 for(; ii<kmax; ii++)
157 {
158 C[0+ps*0] = A[0+lda*0];
159
160 A += 1*lda;
161 C += 1*ps;
162 }
163 return;
164
165 l2:
166 ii = 0;
167 for(; ii<kmax; ii++)
168 {
169 C[0+ps*0] = A[0+lda*0];
170 C[1+ps*0] = A[1+lda*0];
171
172 A += 1*lda;
173 C += 1*ps;
174 }
175 return;
176
177 l3:
178 ii = 0;
179 for(; ii<kmax; ii++)
180 {
181 C[0+ps*0] = A[0+lda*0];
182 C[1+ps*0] = A[1+lda*0];
183 C[2+ps*0] = A[2+lda*0];
184
185 A += 1*lda;
186 C += 1*ps;
187 }
188 return;
189
190 }
191
192
193
kernel_dpack_tn_4_lib4(int kmax,double * A,int lda,double * C)194 void kernel_dpack_tn_4_lib4(int kmax, double *A, int lda, double *C)
195 {
196
197 const int ps = 4;
198
199 int ii;
200 ii = 0;
201
202 for(; ii<kmax-3; ii+=4)
203 {
204 C[0+ps*0] = A[0+lda*0];
205 C[1+ps*0] = A[0+lda*1];
206 C[2+ps*0] = A[0+lda*2];
207 C[3+ps*0] = A[0+lda*3];
208
209 C[0+ps*1] = A[1+lda*0];
210 C[1+ps*1] = A[1+lda*1];
211 C[2+ps*1] = A[1+lda*2];
212 C[3+ps*1] = A[1+lda*3];
213
214 C[0+ps*2] = A[2+lda*0];
215 C[1+ps*2] = A[2+lda*1];
216 C[2+ps*2] = A[2+lda*2];
217 C[3+ps*2] = A[2+lda*3];
218
219 C[0+ps*3] = A[3+lda*0];
220 C[1+ps*3] = A[3+lda*1];
221 C[2+ps*3] = A[3+lda*2];
222 C[3+ps*3] = A[3+lda*3];
223
224 A += 4;
225 C += 4*ps;
226 }
227 for(; ii<kmax; ii++)
228 {
229 C[0+ps*0] = A[0+lda*0];
230 C[1+ps*0] = A[0+lda*1];
231 C[2+ps*0] = A[0+lda*2];
232 C[3+ps*0] = A[0+lda*3];
233
234 A += 1;
235 C += 1*ps;
236 }
237
238 return;
239
240 }
241
242
243
kernel_dpack_tn_4_vs_lib4(int kmax,double * A,int lda,double * C,int m1)244 void kernel_dpack_tn_4_vs_lib4(int kmax, double *A, int lda, double *C, int m1)
245 {
246
247 if(m1<=0)
248 return;
249
250 const int ps = 4;
251
252 int ii;
253 ii = 0;
254
255 if(m1>=4)
256 {
257 kernel_dpack_tn_4_lib4(kmax, A, lda, C);
258 return;
259 }
260 else if(m1==1)
261 {
262 goto l1;
263 }
264 else if(m1==2)
265 {
266 goto l2;
267 }
268 else //if(m1==3)
269 {
270 goto l3;
271 }
272 return;
273
274 l1:
275 ii = 0;
276 for(; ii<kmax; ii++)
277 {
278 C[0+ps*0] = A[0+lda*0];
279
280 A += 1;
281 C += 1*ps;
282 }
283 return;
284
285 l2:
286 ii = 0;
287 for(; ii<kmax; ii++)
288 {
289 C[0+ps*0] = A[0+lda*0];
290 C[1+ps*0] = A[0+lda*1];
291
292 A += 1;
293 C += 1*ps;
294 }
295 return;
296
297 l3:
298 ii = 0;
299 for(; ii<kmax; ii++)
300 {
301 C[0+ps*0] = A[0+lda*0];
302 C[1+ps*0] = A[0+lda*1];
303 C[2+ps*0] = A[0+lda*2];
304
305 A += 1;
306 C += 1*ps;
307 }
308 return;
309
310 }
311
312
313
314 #if ! ( defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53) )
kernel_dpack_tt_4_lib4(int kmax,double * A,int lda,double * C,int sdc)315 void kernel_dpack_tt_4_lib4(int kmax, double *A, int lda, double *C, int sdc)
316 {
317
318 const int ps = 4;
319
320 int ii;
321
322 ii = 0;
323 for(; ii<kmax-3; ii+=4)
324 {
325 C[0+ps*0] = A[0+lda*0];
326 C[1+ps*0] = A[1+lda*0];
327 C[2+ps*0] = A[2+lda*0];
328 C[3+ps*0] = A[3+lda*0];
329
330 C[0+ps*1] = A[0+lda*1];
331 C[1+ps*1] = A[1+lda*1];
332 C[2+ps*1] = A[2+lda*1];
333 C[3+ps*1] = A[3+lda*1];
334
335 C[0+ps*2] = A[0+lda*2];
336 C[1+ps*2] = A[1+lda*2];
337 C[2+ps*2] = A[2+lda*2];
338 C[3+ps*2] = A[3+lda*2];
339
340 C[0+ps*3] = A[0+lda*3];
341 C[1+ps*3] = A[1+lda*3];
342 C[2+ps*3] = A[2+lda*3];
343 C[3+ps*3] = A[3+lda*3];
344
345 A += 4;
346 C += 4*sdc;
347 }
348 for(; ii<kmax; ii++)
349 {
350 C[0+ps*0] = A[0+lda*0];
351 C[0+ps*1] = A[0+lda*1];
352 C[0+ps*2] = A[0+lda*2];
353 C[0+ps*3] = A[0+lda*3];
354
355 A += 1;
356 C += 1;
357 }
358
359 return;
360
361 }
362 #endif
363
364
365
kernel_dpack_tt_4_vs_lib4(int kmax,double * A,int lda,double * C,int sdc,int m1)366 void kernel_dpack_tt_4_vs_lib4(int kmax, double *A, int lda, double *C, int sdc, int m1)
367 {
368
369 const int ps = 4;
370
371 int ii;
372 ii = 0;
373
374 if(m1>=4)
375 {
376 kernel_dpack_tt_4_lib4(kmax, A, lda, C, sdc);
377 return;
378 }
379 else if(m1==1)
380 {
381 goto l1;
382 }
383 else if(m1==2)
384 {
385 goto l2;
386 }
387 else //if(m1==3)
388 {
389 goto l3;
390 }
391 return;
392
393 l1:
394 ii = 0;
395 for(; ii<kmax-3; ii+=4)
396 {
397 C[0+ps*0] = A[0+lda*0];
398 C[1+ps*0] = A[1+lda*0];
399 C[2+ps*0] = A[2+lda*0];
400 C[3+ps*0] = A[3+lda*0];
401
402 A += 4;
403 C += 4*sdc;
404 }
405 for(; ii<kmax; ii++)
406 {
407 C[0+ps*0] = A[0+lda*0];
408
409 A += 1;
410 C += 1;
411 }
412 return;
413
414 l2:
415 ii = 0;
416 for(; ii<kmax-3; ii+=4)
417 {
418 C[0+ps*0] = A[0+lda*0];
419 C[1+ps*0] = A[1+lda*0];
420 C[2+ps*0] = A[2+lda*0];
421 C[3+ps*0] = A[3+lda*0];
422
423 C[0+ps*1] = A[0+lda*1];
424 C[1+ps*1] = A[1+lda*1];
425 C[2+ps*1] = A[2+lda*1];
426 C[3+ps*1] = A[3+lda*1];
427
428 A += 4;
429 C += 4*sdc;
430 }
431 for(; ii<kmax; ii++)
432 {
433 C[0+ps*0] = A[0+lda*0];
434 C[0+ps*1] = A[0+lda*1];
435
436 A += 1;
437 C += 1;
438 }
439 return;
440
441 l3:
442 ii = 0;
443 for(; ii<kmax-3; ii+=4)
444 {
445 C[0+ps*0] = A[0+lda*0];
446 C[1+ps*0] = A[1+lda*0];
447 C[2+ps*0] = A[2+lda*0];
448 C[3+ps*0] = A[3+lda*0];
449
450 C[0+ps*1] = A[0+lda*1];
451 C[1+ps*1] = A[1+lda*1];
452 C[2+ps*1] = A[2+lda*1];
453 C[3+ps*1] = A[3+lda*1];
454
455 C[0+ps*2] = A[0+lda*2];
456 C[1+ps*2] = A[1+lda*2];
457 C[2+ps*2] = A[2+lda*2];
458 C[3+ps*2] = A[3+lda*2];
459
460 A += 4;
461 C += 4*sdc;
462 }
463 for(; ii<kmax; ii++)
464 {
465 C[0+ps*0] = A[0+lda*0];
466 C[0+ps*1] = A[0+lda*1];
467 C[0+ps*2] = A[0+lda*2];
468
469 A += 1;
470 C += 1;
471 }
472 return;
473
474 }
475
476
477
478 #if defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_dunpack_nn_12_vs_lib4(int kmax,double * A,int sda,double * C,int ldc,int m1)479 void kernel_dunpack_nn_12_vs_lib4(int kmax, double *A, int sda, double *C, int ldc, int m1)
480 {
481
482 kernel_dunpack_nn_8_lib4(kmax, A+0*sda, sda, C+0, ldc);
483 kernel_dunpack_nn_4_vs_lib4(kmax, A+8*sda, C+8, ldc, m1-8);
484
485 return;
486
487 }
488 #endif
489
490
491
492 #if defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_dunpack_nn_8_vs_lib4(int kmax,double * A,int sda,double * C,int ldc,int m1)493 void kernel_dunpack_nn_8_vs_lib4(int kmax, double *A, int sda, double *C, int ldc, int m1)
494 {
495
496 kernel_dunpack_nn_4_lib4(kmax, A+0*sda, C+0, ldc);
497 kernel_dunpack_nn_4_vs_lib4(kmax, A+4*sda, C+4, ldc, m1-4);
498
499 return;
500
501 }
502 #endif
503
504
505
506 #if ! ( defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53) )
kernel_dunpack_nn_4_lib4(int kmax,double * A,double * C,int ldc)507 void kernel_dunpack_nn_4_lib4(int kmax, double *A, double *C, int ldc)
508 {
509
510 const int ps = 4;
511
512 int ii;
513 ii = 0;
514
515 for(; ii<kmax-3; ii+=4)
516 {
517 C[0+ldc*0] = A[0+ps*0];
518 C[1+ldc*0] = A[1+ps*0];
519 C[2+ldc*0] = A[2+ps*0];
520 C[3+ldc*0] = A[3+ps*0];
521
522 C[0+ldc*1] = A[0+ps*1];
523 C[1+ldc*1] = A[1+ps*1];
524 C[2+ldc*1] = A[2+ps*1];
525 C[3+ldc*1] = A[3+ps*1];
526
527 C[0+ldc*2] = A[0+ps*2];
528 C[1+ldc*2] = A[1+ps*2];
529 C[2+ldc*2] = A[2+ps*2];
530 C[3+ldc*2] = A[3+ps*2];
531
532 C[0+ldc*3] = A[0+ps*3];
533 C[1+ldc*3] = A[1+ps*3];
534 C[2+ldc*3] = A[2+ps*3];
535 C[3+ldc*3] = A[3+ps*3];
536
537 A += 4*ps;
538 C += 4*ldc;
539 }
540 for(; ii<kmax; ii++)
541 {
542 C[0+ldc*0] = A[0+ps*0];
543 C[1+ldc*0] = A[1+ps*0];
544 C[2+ldc*0] = A[2+ps*0];
545 C[3+ldc*0] = A[3+ps*0];
546
547 A += 1*ps;
548 C += 1*ldc;
549 }
550
551 return;
552
553 }
554 #endif
555
556
557
kernel_dunpack_nn_4_vs_lib4(int kmax,double * A,double * C,int ldc,int m1)558 void kernel_dunpack_nn_4_vs_lib4(int kmax, double *A, double *C, int ldc, int m1)
559 {
560
561 if(m1<=0)
562 return;
563
564 const int ps = 4;
565
566 int ii;
567 ii = 0;
568
569 if(m1>=4)
570 {
571 kernel_dunpack_nn_4_lib4(kmax, A, C, ldc);
572 return;
573 }
574 else if(m1==1)
575 {
576 goto l1;
577 }
578 else if(m1==2)
579 {
580 goto l2;
581 }
582 else //if(m1==3)
583 {
584 goto l3;
585 }
586 return;
587
588 l1:
589 ii = 0;
590 for(; ii<kmax; ii++)
591 {
592 C[0+ldc*0] = A[0+ps*0];
593
594 A += 1*ps;
595 C += 1*ldc;
596 }
597 return;
598
599 l2:
600 ii = 0;
601 for(; ii<kmax; ii++)
602 {
603 C[0+ldc*0] = A[0+ps*0];
604 C[1+ldc*0] = A[1+ps*0];
605
606 A += 1*ps;
607 C += 1*ldc;
608 }
609 return;
610
611 l3:
612 ii = 0;
613 for(; ii<kmax; ii++)
614 {
615 C[0+ldc*0] = A[0+ps*0];
616 C[1+ldc*0] = A[1+ps*0];
617 C[2+ldc*0] = A[2+ps*0];
618
619 A += 1*ps;
620 C += 1*ldc;
621 }
622 return;
623
624 }
625
626
627
kernel_dunpack_nt_4_lib4(int kmax,double * C,double * A,int lda)628 void kernel_dunpack_nt_4_lib4(int kmax, double *C, double *A, int lda)
629 {
630
631 const int ps = 4;
632
633 int ii;
634 ii = 0;
635
636 for(; ii<kmax-3; ii+=4)
637 {
638 A[0+lda*0] = C[0+ps*0];
639 A[0+lda*1] = C[1+ps*0];
640 A[0+lda*2] = C[2+ps*0];
641 A[0+lda*3] = C[3+ps*0];
642
643 A[1+lda*0] = C[0+ps*1];
644 A[1+lda*1] = C[1+ps*1];
645 A[1+lda*2] = C[2+ps*1];
646 A[1+lda*3] = C[3+ps*1];
647
648 A[2+lda*0] = C[0+ps*2];
649 A[2+lda*1] = C[1+ps*2];
650 A[2+lda*2] = C[2+ps*2];
651 A[2+lda*3] = C[3+ps*2];
652
653 A[3+lda*0] = C[0+ps*3];
654 A[3+lda*1] = C[1+ps*3];
655 A[3+lda*2] = C[2+ps*3];
656 A[3+lda*3] = C[3+ps*3];
657
658 A += 4;
659 C += 4*ps;
660 }
661 for(; ii<kmax; ii++)
662 {
663 A[0+lda*0] = C[0+ps*0];
664 A[0+lda*1] = C[1+ps*0];
665 A[0+lda*2] = C[2+ps*0];
666 A[0+lda*3] = C[3+ps*0];
667
668 A += 1;
669 C += 1*ps;
670 }
671
672 return;
673
674 }
675
676
677
kernel_dunpack_nt_4_vs_lib4(int kmax,double * C,double * A,int lda,int m1)678 void kernel_dunpack_nt_4_vs_lib4(int kmax, double *C, double *A, int lda, int m1)
679 {
680
681 if(m1<=0)
682 return;
683
684 const int ps = 4;
685
686 int ii;
687 ii = 0;
688
689 if(m1>=4)
690 {
691 kernel_dunpack_nt_4_lib4(kmax, C, A, lda);
692 return;
693 }
694 else if(m1==1)
695 {
696 goto l1;
697 }
698 else if(m1==2)
699 {
700 goto l2;
701 }
702 else //if(m1==3)
703 {
704 goto l3;
705 }
706 return;
707
708 l1:
709 ii = 0;
710 for(; ii<kmax; ii++)
711 {
712 A[0+lda*0] = C[0+ps*0];
713
714 A += 1;
715 C += 1*ps;
716 }
717 return;
718
719 l2:
720 ii = 0;
721 for(; ii<kmax; ii++)
722 {
723 A[0+lda*0] = C[0+ps*0];
724 A[0+lda*1] = C[1+ps*0];
725
726 A += 1;
727 C += 1*ps;
728 }
729 return;
730
731 l3:
732 ii = 0;
733 for(; ii<kmax; ii++)
734 {
735 A[0+lda*0] = C[0+ps*0];
736 A[0+lda*1] = C[1+ps*0];
737 A[0+lda*2] = C[2+ps*0];
738
739 A += 1;
740 C += 1*ps;
741 }
742 return;
743
744 }
745
746
747
kernel_dunpack_tt_4_lib4(int kmax,double * A,int sda,double * C,int ldc)748 void kernel_dunpack_tt_4_lib4(int kmax, double *A, int sda, double *C, int ldc)
749 {
750
751 const int ps = 4;
752
753 int ii;
754 ii = 0;
755
756 for(; ii<kmax-3; ii+=4)
757 {
758 C[0+ldc*0] = A[0+ps*0];
759 C[1+ldc*0] = A[1+ps*0];
760 C[2+ldc*0] = A[2+ps*0];
761 C[3+ldc*0] = A[3+ps*0];
762
763 C[0+ldc*1] = A[0+ps*1];
764 C[1+ldc*1] = A[1+ps*1];
765 C[2+ldc*1] = A[2+ps*1];
766 C[3+ldc*1] = A[3+ps*1];
767
768 C[0+ldc*2] = A[0+ps*2];
769 C[1+ldc*2] = A[1+ps*2];
770 C[2+ldc*2] = A[2+ps*2];
771 C[3+ldc*2] = A[3+ps*2];
772
773 C[0+ldc*3] = A[0+ps*3];
774 C[1+ldc*3] = A[1+ps*3];
775 C[2+ldc*3] = A[2+ps*3];
776 C[3+ldc*3] = A[3+ps*3];
777
778 A += 4*sda;
779 C += 4;
780 }
781 for(; ii<kmax; ii++)
782 {
783 C[0+ldc*0] = A[0+ps*0];
784 C[1+ldc*0] = A[1+ps*0];
785 C[2+ldc*0] = A[2+ps*0];
786 C[3+ldc*0] = A[3+ps*0];
787
788 A += 1;
789 C += 1;
790 }
791
792 return;
793
794 }
795
796
797
798 // copy transposed panel into normal panel
kernel_dpacp_tn_4_lib4(int kmax,int offsetA,double * A,int sda,double * B)799 void kernel_dpacp_tn_4_lib4(int kmax, int offsetA, double *A, int sda, double *B)
800 {
801
802 const int ps = 4;
803
804 int k;
805
806 int kna = (ps-offsetA)%ps;
807 kna = kmax<kna ? kmax : kna;
808
809 k = 0;
810 if(kna>0)
811 {
812 A += offsetA;
813 for( ; k<kna; k++)
814 {
815 //
816 B[0+ps*0] = A[0+ps*0];
817 B[1+ps*0] = A[0+ps*1];
818 B[2+ps*0] = A[0+ps*2];
819 B[3+ps*0] = A[0+ps*3];
820
821 A += 1;
822 B += ps;
823 }
824 A += ps*(sda-1);
825 }
826 for(; k<kmax-3; k+=4)
827 {
828 //
829 B[0+ps*0] = A[0+ps*0];
830 B[0+ps*1] = A[1+ps*0];
831 B[0+ps*2] = A[2+ps*0];
832 B[0+ps*3] = A[3+ps*0];
833 //
834 B[1+ps*0] = A[0+ps*1];
835 B[1+ps*1] = A[1+ps*1];
836 B[1+ps*2] = A[2+ps*1];
837 B[1+ps*3] = A[3+ps*1];
838 //
839 B[2+ps*0] = A[0+ps*2];
840 B[2+ps*1] = A[1+ps*2];
841 B[2+ps*2] = A[2+ps*2];
842 B[2+ps*3] = A[3+ps*2];
843 //
844 B[3+ps*0] = A[0+ps*3];
845 B[3+ps*1] = A[1+ps*3];
846 B[3+ps*2] = A[2+ps*3];
847 B[3+ps*3] = A[3+ps*3];
848
849 A += ps*sda;
850 B += ps*ps;
851 }
852 for( ; k<kmax; k++)
853 {
854 //
855 B[0+ps*0] = A[0+ps*0];
856 B[1+ps*0] = A[0+ps*1];
857 B[2+ps*0] = A[0+ps*2];
858 B[3+ps*0] = A[0+ps*3];
859
860 A += 1;
861 B += ps;
862 }
863 return;
864 }
865
866
867
868 // copy transposed panel into normal panel
kernel_dpacp_nt_4_lib4(int kmax,double * A,int offsetB,double * B,int sdb)869 void kernel_dpacp_nt_4_lib4(int kmax, double *A, int offsetB, double *B, int sdb)
870 {
871
872 const int ps = 4;
873
874 int k;
875
876 int kna = (ps-offsetB)%ps;
877 kna = kmax<kna ? kmax : kna;
878
879 k = 0;
880 if(kna>0)
881 {
882 B += offsetB;
883 for( ; k<kna; k++)
884 {
885 //
886 B[0+ps*0] = A[0+ps*0];
887 B[0+ps*1] = A[1+ps*0];
888 B[0+ps*2] = A[2+ps*0];
889 B[0+ps*3] = A[3+ps*0];
890
891 B += 1;
892 A += ps;
893 }
894 B += ps*(sdb-1);
895 }
896 for(; k<kmax-3; k+=4)
897 {
898 //
899 B[0+ps*0] = A[0+ps*0];
900 B[1+ps*0] = A[0+ps*1];
901 B[2+ps*0] = A[0+ps*2];
902 B[3+ps*0] = A[0+ps*3];
903 //
904 B[0+ps*1] = A[1+ps*0];
905 B[1+ps*1] = A[1+ps*1];
906 B[2+ps*1] = A[1+ps*2];
907 B[3+ps*1] = A[1+ps*3];
908 //
909 B[0+ps*2] = A[2+ps*0];
910 B[1+ps*2] = A[2+ps*1];
911 B[2+ps*2] = A[2+ps*2];
912 B[3+ps*2] = A[2+ps*3];
913 //
914 B[0+ps*3] = A[3+ps*0];
915 B[1+ps*3] = A[3+ps*1];
916 B[2+ps*3] = A[3+ps*2];
917 B[3+ps*3] = A[3+ps*3];
918
919 B += ps*sdb;
920 A += ps*ps;
921 }
922 for( ; k<kmax; k++)
923 {
924 //
925 B[0+ps*0] = A[0+ps*0];
926 B[0+ps*1] = A[1+ps*0];
927 B[0+ps*2] = A[2+ps*0];
928 B[0+ps*3] = A[3+ps*0];
929
930 B += 1;
931 A += ps;
932 }
933 return;
934 }
935
936
937
kernel_dpacp_nn_4_lib4(int kmax,int offsetA,double * A,int sda,double * B)938 void kernel_dpacp_nn_4_lib4(int kmax, int offsetA, double *A, int sda, double *B)
939 {
940
941 const int ps = 4;
942
943 int k;
944
945 int air = offsetA%ps;
946
947 double *A0 = A;
948 double *A1 = A0 + ps*sda;
949
950 if(offsetA==0)
951 goto air_0;
952 else if(offsetA==1)
953 goto air_1;
954 else if(offsetA==2)
955 goto air_2;
956 else //if(offsetA==3)
957 goto air_3;
958
959 air_0:
960 k = 0;
961 for(k=0; k<kmax-3; k+=4)
962 {
963 B[0+ps*0] = A0[0+ps*0];
964 B[1+ps*0] = A0[1+ps*0];
965 B[2+ps*0] = A0[2+ps*0];
966 B[3+ps*0] = A0[3+ps*0];
967
968 B[0+ps*1] = A0[0+ps*1];
969 B[1+ps*1] = A0[1+ps*1];
970 B[2+ps*1] = A0[2+ps*1];
971 B[3+ps*1] = A0[3+ps*1];
972
973 B[0+ps*2] = A0[0+ps*2];
974 B[1+ps*2] = A0[1+ps*2];
975 B[2+ps*2] = A0[2+ps*2];
976 B[3+ps*2] = A0[3+ps*2];
977
978 B[0+ps*3] = A0[0+ps*3];
979 B[1+ps*3] = A0[1+ps*3];
980 B[2+ps*3] = A0[2+ps*3];
981 B[3+ps*3] = A0[3+ps*3];
982
983 A0 += 16;
984 B += 16;
985 }
986 for(; k<kmax; k++)
987 {
988 B[0+ps*0] = A0[0+ps*0];
989 B[1+ps*0] = A0[1+ps*0];
990 B[2+ps*0] = A0[2+ps*0];
991 B[3+ps*0] = A0[3+ps*0];
992
993 A0 += 4;
994 B += 4;
995 }
996 goto end;
997
998
999
1000 air_1:
1001 k = 0;
1002 for(k=0; k<kmax-3; k+=4)
1003 {
1004 B[0+ps*0] = A0[1+ps*0];
1005 B[1+ps*0] = A0[2+ps*0];
1006 B[2+ps*0] = A0[3+ps*0];
1007 B[3+ps*0] = A1[0+ps*0];
1008
1009 B[0+ps*1] = A0[1+ps*1];
1010 B[1+ps*1] = A0[2+ps*1];
1011 B[2+ps*1] = A0[3+ps*1];
1012 B[3+ps*1] = A1[0+ps*1];
1013
1014 B[0+ps*2] = A0[1+ps*2];
1015 B[1+ps*2] = A0[2+ps*2];
1016 B[2+ps*2] = A0[3+ps*2];
1017 B[3+ps*2] = A1[0+ps*2];
1018
1019 B[0+ps*3] = A0[1+ps*3];
1020 B[1+ps*3] = A0[2+ps*3];
1021 B[2+ps*3] = A0[3+ps*3];
1022 B[3+ps*3] = A1[0+ps*3];
1023
1024 A0 += 16;
1025 A1 += 16;
1026 B += 16;
1027 }
1028 for(; k<kmax; k++)
1029 {
1030 B[0+ps*0] = A0[1+ps*0];
1031 B[1+ps*0] = A0[2+ps*0];
1032 B[2+ps*0] = A0[3+ps*0];
1033 B[3+ps*0] = A1[0+ps*0];
1034
1035 A0 += 4;
1036 A1 += 4;
1037 B += 4;
1038 }
1039 goto end;
1040
1041
1042
1043 air_2:
1044 k = 0;
1045 for(k=0; k<kmax-3; k+=4)
1046 {
1047 B[0+ps*0] = A0[2+ps*0];
1048 B[1+ps*0] = A0[3+ps*0];
1049 B[2+ps*0] = A1[0+ps*0];
1050 B[3+ps*0] = A1[1+ps*0];
1051
1052 B[0+ps*1] = A0[2+ps*1];
1053 B[1+ps*1] = A0[3+ps*1];
1054 B[2+ps*1] = A1[0+ps*1];
1055 B[3+ps*1] = A1[1+ps*1];
1056
1057 B[0+ps*2] = A0[2+ps*2];
1058 B[1+ps*2] = A0[3+ps*2];
1059 B[2+ps*2] = A1[0+ps*2];
1060 B[3+ps*2] = A1[1+ps*2];
1061
1062 B[0+ps*3] = A0[2+ps*3];
1063 B[1+ps*3] = A0[3+ps*3];
1064 B[2+ps*3] = A1[0+ps*3];
1065 B[3+ps*3] = A1[1+ps*3];
1066
1067 A0 += 16;
1068 A1 += 16;
1069 B += 16;
1070 }
1071 for(; k<kmax; k++)
1072 {
1073 B[0+ps*0] = A0[2+ps*0];
1074 B[1+ps*0] = A0[3+ps*0];
1075 B[2+ps*0] = A1[0+ps*0];
1076 B[3+ps*0] = A1[1+ps*0];
1077
1078 A0 += 4;
1079 A1 += 4;
1080 B += 4;
1081 }
1082 goto end;
1083
1084
1085
1086 air_3:
1087 k = 0;
1088 for(k=0; k<kmax-3; k+=4)
1089 {
1090 B[0+ps*0] = A0[3+ps*0];
1091 B[1+ps*0] = A1[0+ps*0];
1092 B[2+ps*0] = A1[1+ps*0];
1093 B[3+ps*0] = A1[2+ps*0];
1094
1095 B[0+ps*1] = A0[3+ps*1];
1096 B[1+ps*1] = A1[0+ps*1];
1097 B[2+ps*1] = A1[1+ps*1];
1098 B[3+ps*1] = A1[2+ps*1];
1099
1100 B[0+ps*2] = A0[3+ps*2];
1101 B[1+ps*2] = A1[0+ps*2];
1102 B[2+ps*2] = A1[1+ps*2];
1103 B[3+ps*2] = A1[2+ps*2];
1104
1105 B[0+ps*3] = A0[3+ps*3];
1106 B[1+ps*3] = A1[0+ps*3];
1107 B[2+ps*3] = A1[1+ps*3];
1108 B[3+ps*3] = A1[2+ps*3];
1109
1110 A0 += 16;
1111 A1 += 16;
1112 B += 16;
1113 }
1114 for(; k<kmax; k++)
1115 {
1116 B[0+ps*0] = A0[3+ps*0];
1117 B[1+ps*0] = A1[0+ps*0];
1118 B[2+ps*0] = A1[1+ps*0];
1119 B[3+ps*0] = A1[2+ps*0];
1120
1121 A0 += 4;
1122 A1 += 4;
1123 B += 4;
1124 }
1125 goto end;
1126
1127 end:
1128 return;
1129
1130 }
1131
1132
1133
kernel_dpacp_nn_4_vs_lib4(int kmax,int offsetA,double * A,int sda,double * B,int m1)1134 void kernel_dpacp_nn_4_vs_lib4(int kmax, int offsetA, double *A, int sda, double *B, int m1)
1135 {
1136
1137 if(m1<=0)
1138 {
1139 return;
1140 }
1141
1142 if(m1>=4)
1143 {
1144 kernel_dpacp_nn_4_lib4(kmax, offsetA, A, sda, B);
1145 return;
1146 }
1147
1148
1149 const int ps = 4;
1150
1151 int k;
1152
1153 int air = offsetA%ps;
1154
1155 double *A0 = A;
1156 double *A1 = A0 + ps*sda;
1157
1158 if(offsetA==0)
1159 goto air_0;
1160 else if(offsetA==1)
1161 goto air_1;
1162 else if(offsetA==2)
1163 goto air_2;
1164 else //if(offsetA==3)
1165 goto air_3;
1166
1167 air_0:
1168
1169 if(m1==1)
1170 {
1171 goto air_0_1;
1172 }
1173 else if(m1==2)
1174 {
1175 goto air_0_2;
1176 }
1177 else //if(m1==3)
1178 {
1179 goto air_0_3;
1180 }
1181
1182 air_1:
1183
1184 if(m1==1)
1185 {
1186 A0 += air;
1187 goto air_0_1;
1188 }
1189 else if(m1==2)
1190 {
1191 A0 += air;
1192 goto air_0_2;
1193 }
1194 else //if(m1==3)
1195 {
1196 A0 += air;
1197 goto air_0_3;
1198 }
1199
1200 air_2:
1201
1202 if(m1==1)
1203 {
1204 A0 += air;
1205 goto air_0_1;
1206 }
1207 else if(m1==2)
1208 {
1209 A0 += air;
1210 goto air_0_2;
1211 }
1212 else //if(m1==3)
1213 {
1214 goto air_2_3;
1215 }
1216
1217 air_3:
1218
1219 if(m1==1)
1220 {
1221 A0 += air;
1222 goto air_0_1;
1223 }
1224 else if(m1==2)
1225 {
1226 goto air_3_2;
1227 }
1228 else //if(m1==3)
1229 {
1230 goto air_3_3;
1231 }
1232
1233
1234
1235 air_0_3:
1236 k = 0;
1237 for(k=0; k<kmax-3; k+=4)
1238 {
1239 B[0+ps*0] = A0[0+ps*0];
1240 B[1+ps*0] = A0[1+ps*0];
1241 B[2+ps*0] = A0[2+ps*0];
1242
1243 B[0+ps*1] = A0[0+ps*1];
1244 B[1+ps*1] = A0[1+ps*1];
1245 B[2+ps*1] = A0[2+ps*1];
1246
1247 B[0+ps*2] = A0[0+ps*2];
1248 B[1+ps*2] = A0[1+ps*2];
1249 B[2+ps*2] = A0[2+ps*2];
1250
1251 B[0+ps*3] = A0[0+ps*3];
1252 B[1+ps*3] = A0[1+ps*3];
1253 B[2+ps*3] = A0[2+ps*3];
1254
1255 A0 += 16;
1256 B += 16;
1257 }
1258 for(; k<kmax; k++)
1259 {
1260 B[0+ps*0] = A0[0+ps*0];
1261 B[1+ps*0] = A0[1+ps*0];
1262 B[2+ps*0] = A0[2+ps*0];
1263
1264 A0 += 4;
1265 B += 4;
1266 }
1267 goto end;
1268
1269 air_0_2:
1270 k = 0;
1271 for(k=0; k<kmax-3; k+=4)
1272 {
1273 B[0+ps*0] = A0[0+ps*0];
1274 B[1+ps*0] = A0[1+ps*0];
1275
1276 B[0+ps*1] = A0[0+ps*1];
1277 B[1+ps*1] = A0[1+ps*1];
1278
1279 B[0+ps*2] = A0[0+ps*2];
1280 B[1+ps*2] = A0[1+ps*2];
1281
1282 B[0+ps*3] = A0[0+ps*3];
1283 B[1+ps*3] = A0[1+ps*3];
1284
1285 A0 += 16;
1286 B += 16;
1287 }
1288 for(; k<kmax; k++)
1289 {
1290 B[0+ps*0] = A0[0+ps*0];
1291 B[1+ps*0] = A0[1+ps*0];
1292
1293 A0 += 4;
1294 B += 4;
1295 }
1296 goto end;
1297
1298 air_0_1:
1299 k = 0;
1300 for(k=0; k<kmax-3; k+=4)
1301 {
1302 B[0+ps*0] = A0[0+ps*0];
1303
1304 B[0+ps*1] = A0[0+ps*1];
1305
1306 B[0+ps*2] = A0[0+ps*2];
1307
1308 B[0+ps*3] = A0[0+ps*3];
1309
1310 A0 += 16;
1311 B += 16;
1312 }
1313 for(; k<kmax; k++)
1314 {
1315 B[0+ps*0] = A0[0+ps*0];
1316
1317 A0 += 4;
1318 B += 4;
1319 }
1320 goto end;
1321
1322 air_2_3:
1323 k = 0;
1324 for(k=0; k<kmax-3; k+=4)
1325 {
1326 B[0+ps*0] = A0[2+ps*0];
1327 B[1+ps*0] = A0[3+ps*0];
1328 B[2+ps*0] = A1[0+ps*0];
1329
1330 B[0+ps*1] = A0[2+ps*1];
1331 B[1+ps*1] = A0[3+ps*1];
1332 B[2+ps*1] = A1[0+ps*1];
1333
1334 B[0+ps*2] = A0[2+ps*2];
1335 B[1+ps*2] = A0[3+ps*2];
1336 B[2+ps*2] = A1[0+ps*2];
1337
1338 B[0+ps*3] = A0[2+ps*3];
1339 B[1+ps*3] = A0[3+ps*3];
1340 B[2+ps*3] = A1[0+ps*3];
1341
1342 A0 += 16;
1343 A1 += 16;
1344 B += 16;
1345 }
1346 for(; k<kmax; k++)
1347 {
1348 B[0+ps*0] = A0[2+ps*0];
1349 B[1+ps*0] = A0[3+ps*0];
1350 B[2+ps*0] = A1[0+ps*0];
1351
1352 A0 += 4;
1353 A1 += 4;
1354 B += 4;
1355 }
1356 goto end;
1357
1358 air_3_3:
1359 k = 0;
1360 for(k=0; k<kmax-3; k+=4)
1361 {
1362 B[0+ps*0] = A0[3+ps*0];
1363 B[1+ps*0] = A1[0+ps*0];
1364 B[2+ps*0] = A1[1+ps*0];
1365
1366 B[0+ps*1] = A0[3+ps*1];
1367 B[1+ps*1] = A1[0+ps*1];
1368 B[2+ps*1] = A1[1+ps*1];
1369
1370 B[0+ps*2] = A0[3+ps*2];
1371 B[1+ps*2] = A1[0+ps*2];
1372 B[2+ps*2] = A1[1+ps*2];
1373
1374 B[0+ps*3] = A0[3+ps*3];
1375 B[1+ps*3] = A1[0+ps*3];
1376 B[2+ps*3] = A1[1+ps*3];
1377
1378 A0 += 16;
1379 A1 += 16;
1380 B += 16;
1381 }
1382 for(; k<kmax; k++)
1383 {
1384 B[0+ps*0] = A0[3+ps*0];
1385 B[1+ps*0] = A1[0+ps*0];
1386 B[2+ps*0] = A1[1+ps*0];
1387
1388 A0 += 4;
1389 A1 += 4;
1390 B += 4;
1391 }
1392 goto end;
1393
1394 air_3_2:
1395 k = 0;
1396 for(k=0; k<kmax-3; k+=4)
1397 {
1398 B[0+ps*0] = A0[3+ps*0];
1399 B[1+ps*0] = A1[0+ps*0];
1400
1401 B[0+ps*1] = A0[3+ps*1];
1402 B[1+ps*1] = A1[0+ps*1];
1403
1404 B[0+ps*2] = A0[3+ps*2];
1405 B[1+ps*2] = A1[0+ps*2];
1406
1407 B[0+ps*3] = A0[3+ps*3];
1408 B[1+ps*3] = A1[0+ps*3];
1409
1410 A0 += 16;
1411 A1 += 16;
1412 B += 16;
1413 }
1414 for(; k<kmax; k++)
1415 {
1416 B[0+ps*0] = A0[3+ps*0];
1417 B[1+ps*0] = A1[0+ps*0];
1418
1419 A0 += 4;
1420 A1 += 4;
1421 B += 4;
1422 }
1423 goto end;
1424
1425 end:
1426 return;
1427
1428 }
1429
1430
1431