1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 #include "../../include/blasfeo_s_kernel.h"
39
40
41
42 #if 0//defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size 12-row packing wrapper: hands A+0 / C+0 to the 8-row kernel and
// A+8 / C+8*sdc to the variable-size 4-row kernel with m1-8 as its size.
void kernel_spack_nn_12_vs_lib4(int kmax, float *A, int lda, float *C, int sdc, int m1)
{
	float *A_tail = A + 8;      // source rows handled by the 4-row kernel
	float *C_tail = C + 8*sdc;  // destination used for those rows

	kernel_spack_nn_8_lib4(kmax, A, lda, C, sdc);
	kernel_spack_nn_4_vs_lib4(kmax, A_tail, lda, C_tail, m1-8);
}
54 #endif
55
56
57
58 #if defined(TARGET_ARMV7A_ARM_CORTEX_A15) | defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
// 8-row packing wrapper built from two 4-row calls: rows 0..3 of A go to C,
// rows 4..7 of A go to C + 4*sdc.
void kernel_spack_nn_8_lib4(int kmax, float *A, int lda, float *C, int sdc)
{
	float *A_lower = A + 4;      // second group of four source rows
	float *C_lower = C + 4*sdc;  // destination of the second group

	kernel_spack_nn_4_lib4(kmax, A, lda, C);
	kernel_spack_nn_4_lib4(kmax, A_lower, lda, C_lower);
}
68 #endif
69
70
71
72 #if defined(TARGET_ARMV7A_ARM_CORTEX_A15) | defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
// Variable-size 8-row packing wrapper: rows 0..3 of A are packed in full into C,
// the rows starting at A+4 are packed into C + 4*sdc by the variable-size
// kernel, which receives m1-4 as its size.
void kernel_spack_nn_8_vs_lib4(int kmax, float *A, int lda, float *C, int sdc, int m1)
{
	float *A_lower = A + 4;      // second group of source rows
	float *C_lower = C + 4*sdc;  // destination of the second group

	kernel_spack_nn_4_lib4(kmax, A, lda, C);
	kernel_spack_nn_4_vs_lib4(kmax, A_lower, lda, C_lower, m1-4);
}
82 #endif
83
84
85
86 //#if ! ( defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53) )
// Copy a 4 x kmax block into packed storage.
//   A   : source, element (i,k) read at A[i + lda*k]
//   C   : destination, element (i,k) written at C[i + 4*k]
// Nothing is done when kmax <= 0.
void kernel_spack_nn_4_lib4(int kmax, float *A, int lda, float *C)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int col, row;

	for (col = 0; col < kmax; col++) {
		// one column: four consecutive source entries
		for (row = 0; row < ps; row++)
			dst[row] = src[row];
		src += lda;
		dst += ps;
	}
}
134 //#endif
135
136
137
// Variable-size variant of kernel_spack_nn_4_lib4.
//   m1 <= 0 : nothing is written
//   m1 >= 4 : forwarded to kernel_spack_nn_4_lib4
//   otherwise only entries 0..m1-1 of each column are copied,
//   C[i + 4*k] = A[i + lda*k]; the remaining entries of C are left untouched.
void kernel_spack_nn_4_vs_lib4(int kmax, float *A, int lda, float *C, int m1)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int col, row;

	if (m1 <= 0)
		return;

	if (m1 >= ps) {
		kernel_spack_nn_4_lib4(kmax, A, lda, C);
		return;
	}

	// here 1 <= m1 <= 3
	for (col = 0; col < kmax; col++) {
		for (row = 0; row < m1; row++)
			dst[row] = src[row];
		src += lda;
		dst += ps;
	}
}
205
206
207
// Transposing pack of a kmax x 4 block.
//   A   : source, element (k,j) read at A[k + lda*j]
//   C   : destination, written at C[j + 4*k]
// Nothing is done when kmax <= 0.
void kernel_spack_tn_4_lib4(int kmax, float *A, int lda, float *C)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int k, j;

	for (k = 0; k < kmax; k++) {
		// source row k becomes packed column k
		for (j = 0; j < ps; j++)
			dst[j] = src[lda*j];
		src += 1;
		dst += ps;
	}
}
255
256
257
// Variable-size variant of kernel_spack_tn_4_lib4.
//   m1 <= 0 : nothing is written
//   m1 >= 4 : forwarded to kernel_spack_tn_4_lib4
//   otherwise only the first m1 source columns are used,
//   C[j + 4*k] = A[k + lda*j] for j < m1; other entries of C are untouched.
void kernel_spack_tn_4_vs_lib4(int kmax, float *A, int lda, float *C, int m1)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int k, j;

	if (m1 <= 0)
		return;

	if (m1 >= ps) {
		kernel_spack_tn_4_lib4(kmax, A, lda, C);
		return;
	}

	// here 1 <= m1 <= 3
	for (k = 0; k < kmax; k++) {
		for (j = 0; j < m1; j++)
			dst[j] = src[lda*j];
		src += 1;
		dst += ps;
	}
}
325
326
327
328 //#if ! ( defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53) )
// Pack a kmax x 4 block whose rows are spread over several destination panels.
//   A   : source, element (i,j) read at A[i + lda*j]
//   C   : destination; each group of four rows is written as C[i + 4*j] and the
//         destination then advances by 4*sdc to reach the next group
// Rows left over after the complete groups of four are written one at a time
// at consecutive positions of the last destination group.
void kernel_spack_tt_4_lib4(int kmax, float *A, int lda, float *C, int sdc)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int row = 0;
	int i, j;

	// complete groups of four rows
	while (row < kmax-3) {
		for (j = 0; j < ps; j++)
			for (i = 0; i < ps; i++)
				dst[i + ps*j] = src[i + lda*j];
		src += 4;
		dst += 4*sdc;
		row += 4;
	}

	// remaining rows, one per iteration
	while (row < kmax) {
		for (j = 0; j < ps; j++)
			dst[ps*j] = src[lda*j];
		src += 1;
		dst += 1;
		row += 1;
	}
}
376 //#endif
377
378
379
// Variable-size variant of kernel_spack_tt_4_lib4.
//   m1 <= 0 : nothing is written (same convention as the other _vs_ kernels
//             in this file; previously this case fell through to the
//             three-column path)
//   m1 >= 4 : forwarded to kernel_spack_tt_4_lib4
//   otherwise only the first m1 columns are packed: C[i + 4*j] = A[i + lda*j]
//   for j < m1, with the destination advancing by 4*sdc after every complete
//   group of four rows and by 1 for each leftover row.
void kernel_spack_tt_4_vs_lib4(int kmax, float *A, int lda, float *C, int sdc, int m1)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int row = 0;
	int i, j;

	if (m1 <= 0)
		return;

	if (m1 >= ps) {
		kernel_spack_tt_4_lib4(kmax, A, lda, C, sdc);
		return;
	}

	// here 1 <= m1 <= 3

	// complete groups of four rows
	for (; row < kmax-3; row += 4) {
		for (j = 0; j < m1; j++)
			for (i = 0; i < ps; i++)
				dst[i + ps*j] = src[i + lda*j];
		src += 4;
		dst += 4*sdc;
	}

	// remaining rows, one per iteration
	for (; row < kmax; row++) {
		for (j = 0; j < m1; j++)
			dst[ps*j] = src[lda*j];
		src += 1;
		dst += 1;
	}
}
489
490
491
492 #if 0//defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size 12-row unpacking wrapper: hands A+0 / C+0 to the 8-row kernel
// and A+8*sda / C+8 to the variable-size 4-row kernel with m1-8 as its size.
void kernel_sunpack_nn_12_vs_lib4(int kmax, float *A, int sda, float *C, int ldc, int m1)
{
	float *A_tail = A + 8*sda;  // packed source handled by the 4-row kernel
	float *C_tail = C + 8;      // destination rows 8 and up

	kernel_sunpack_nn_8_lib4(kmax, A, sda, C, ldc);
	kernel_sunpack_nn_4_vs_lib4(kmax, A_tail, C_tail, ldc, m1-8);
}
502 #endif
503
504
505
506 #if defined(TARGET_ARMV7A_ARM_CORTEX_A15) | defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
// 8-row unpacking wrapper built from two 4-row calls: the packed data at A goes
// to rows 0..3 of C, the packed data at A + 4*sda goes to rows 4..7 of C.
void kernel_sunpack_nn_8_lib4(int kmax, float *A, int sda, float *C, int ldc)
{
	float *A_lower = A + 4*sda;  // second packed group
	float *C_lower = C + 4;      // destination rows 4..7

	kernel_sunpack_nn_4_lib4(kmax, A, C, ldc);
	kernel_sunpack_nn_4_lib4(kmax, A_lower, C_lower, ldc);
}
516 #endif
517
518
519
520 #if defined(TARGET_ARMV7A_ARM_CORTEX_A15) | defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
// Variable-size 8-row unpacking wrapper: the packed data at A is unpacked in
// full to rows 0..3 of C; the packed data at A + 4*sda is unpacked to C + 4 by
// the variable-size kernel, which receives m1-4 as its size.
void kernel_sunpack_nn_8_vs_lib4(int kmax, float *A, int sda, float *C, int ldc, int m1)
{
	float *A_lower = A + 4*sda;  // second packed group
	float *C_lower = C + 4;      // destination rows 4 and up

	kernel_sunpack_nn_4_lib4(kmax, A, C, ldc);
	kernel_sunpack_nn_4_vs_lib4(kmax, A_lower, C_lower, ldc, m1-4);
}
530 #endif
531
532
533
534 //#if ! ( defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53) )
// Copy a packed 4 x kmax block back to strided storage.
//   A   : packed source, element (i,k) read at A[i + 4*k]
//   C   : destination, element (i,k) written at C[i + ldc*k]
// Nothing is done when kmax <= 0.
void kernel_sunpack_nn_4_lib4(int kmax, float *A, float *C, int ldc)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int col, row;

	for (col = 0; col < kmax; col++) {
		for (row = 0; row < ps; row++)
			dst[row] = src[row];
		src += ps;
		dst += ldc;
	}
}
582 //#endif
583
584
585
// Variable-size variant of kernel_sunpack_nn_4_lib4.
//   m1 <= 0 : nothing is written
//   m1 >= 4 : forwarded to kernel_sunpack_nn_4_lib4
//   otherwise only entries 0..m1-1 of each column are copied,
//   C[i + ldc*k] = A[i + 4*k].
void kernel_sunpack_nn_4_vs_lib4(int kmax, float *A, float *C, int ldc, int m1)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int col, row;

	if (m1 <= 0)
		return;

	if (m1 >= ps) {
		kernel_sunpack_nn_4_lib4(kmax, A, C, ldc);
		return;
	}

	// here 1 <= m1 <= 3
	for (col = 0; col < kmax; col++) {
		for (row = 0; row < m1; row++)
			dst[row] = src[row];
		src += ps;
		dst += ldc;
	}
}
653
654
655
// Transposing unpack. Note the argument roles: C is the packed SOURCE and A is
// the strided DESTINATION.
//   C   : packed source, read at C[j + 4*k]
//   A   : destination, written at A[k + lda*j]
// Nothing is done when kmax <= 0.
void kernel_sunpack_nt_4_lib4(int kmax, float *C, float *A, int lda)
{
	const int ps = 4;
	const float *src = C;
	float *dst = A;
	int k, j;

	for (k = 0; k < kmax; k++) {
		// packed column k becomes destination row k
		for (j = 0; j < ps; j++)
			dst[lda*j] = src[j];
		dst += 1;
		src += ps;
	}
}
703
704
705
// Variable-size variant of kernel_sunpack_nt_4_lib4 (C is the packed source,
// A the strided destination).
//   m1 <= 0 : nothing is written
//   m1 >= 4 : forwarded to kernel_sunpack_nt_4_lib4
//   otherwise only the first m1 destination columns are written,
//   A[k + lda*j] = C[j + 4*k] for j < m1.
void kernel_sunpack_nt_4_vs_lib4(int kmax, float *C, float *A, int lda, int m1)
{
	const int ps = 4;
	const float *src = C;
	float *dst = A;
	int k, j;

	if (m1 <= 0)
		return;

	if (m1 >= ps) {
		kernel_sunpack_nt_4_lib4(kmax, C, A, lda);
		return;
	}

	// here 1 <= m1 <= 3
	for (k = 0; k < kmax; k++) {
		for (j = 0; j < m1; j++)
			dst[lda*j] = src[j];
		dst += 1;
		src += ps;
	}
}
773
774
775
// Unpack kmax rows x 4 columns from packed storage spread over several source
// groups into strided storage.
//   A   : packed source; each group of four rows is read as A[i + 4*j] and the
//         source then advances by 4*sda to reach the next group
//   C   : destination, element (i,j) written at C[i + ldc*j]
// Rows left over after the complete groups of four are copied one row at a
// time: one element from each of the four columns, then both pointers advance
// by a single row. (The clean-up loop used to copy four rows of column 0
// instead, leaving columns 1..3 of the leftover rows unwritten and writing past
// row kmax of column 0.)
void kernel_sunpack_tt_4_lib4(int kmax, float *A, int sda, float *C, int ldc)
{
	const int ps = 4;
	const float *src = A;
	float *dst = C;
	int row = 0;
	int i, j;

	// complete groups of four rows
	for (; row < kmax-3; row += 4) {
		for (j = 0; j < ps; j++)
			for (i = 0; i < ps; i++)
				dst[i + ldc*j] = src[i + ps*j];
		src += 4*sda;
		dst += 4;
	}

	// remaining rows: one row across the four columns per iteration
	for (; row < kmax; row++) {
		for (j = 0; j < ps; j++)
			dst[ldc*j] = src[ps*j];
		src += 1;
		dst += 1;
	}
}
823
824
825
826 // copy transposed panel into normal panel
// Transposing copy from packed storage spanning several groups (A, group
// stride 4*sda) into a single packed block B: source row k becomes column k of
// B, i.e. B[j + 4*k] receives the element of row k, column j of the source.
//   offsetA : row offset of the first source row inside its group of four.
//             The first (4-offsetA)%4 rows (at most kmax) are taken from
//             A + offsetA, after which the source moves to the next group.
// NOTE(review): offsetA is presumably expected in [0,3] -- confirm with callers.
void kernel_spacp_tn_4_lib4(int kmax, int offsetA, float *A, int sda, float *B)
{
	const int ps = 4;
	const float *src = A;
	float *dst = B;
	int k = 0;
	int a, b;

	// rows needed to reach the next group boundary, capped by kmax
	int head = (ps - offsetA) % ps;
	if (kmax < head)
		head = kmax;

	if (head > 0) {
		src += offsetA;
		while (k < head) {
			for (b = 0; b < ps; b++)
				dst[b] = src[ps*b];
			src += 1;
			dst += ps;
			k++;
		}
		// jump from the end of this group to the start of the next one
		src += ps*(sda-1);
	}

	// complete groups of four rows: 4x4 transpose
	for (; k < kmax-3; k += 4) {
		for (a = 0; a < ps; a++)
			for (b = 0; b < ps; b++)
				dst[a + ps*b] = src[b + ps*a];
		src += ps*sda;
		dst += ps*ps;
	}

	// remaining rows
	for (; k < kmax; k++) {
		for (b = 0; b < ps; b++)
			dst[b] = src[ps*b];
		src += 1;
		dst += ps;
	}
}
893
894
895
// copy normal panel into transposed panel
// Transposing copy from a single packed block A into packed storage spanning
// several groups (B, group stride 4*sdb): column k of A becomes destination
// row k, i.e. the element A[j + 4*k] is written to row k, column j of B.
//   offsetB : row offset of the first destination row inside its group of four.
//             The first (4-offsetB)%4 rows (at most kmax) are written at
//             B + offsetB, after which the destination moves to the next group.
// NOTE(review): offsetB is presumably expected in [0,3] -- confirm with callers.
void kernel_spacp_nt_4_lib4(int kmax, float *A, int offsetB, float *B, int sdb)
{
	const int ps = 4;
	const float *src = A;
	float *dst = B;
	int k = 0;
	int a, b;

	// rows needed to reach the next group boundary, capped by kmax
	int head = (ps - offsetB) % ps;
	if (kmax < head)
		head = kmax;

	if (head > 0) {
		dst += offsetB;
		while (k < head) {
			for (b = 0; b < ps; b++)
				dst[ps*b] = src[b];
			dst += 1;
			src += ps;
			k++;
		}
		// jump from the end of this group to the start of the next one
		dst += ps*(sdb-1);
	}

	// complete groups of four rows: 4x4 transpose
	for (; k < kmax-3; k += 4) {
		for (b = 0; b < ps; b++)
			for (a = 0; a < ps; a++)
				dst[a + ps*b] = src[b + ps*a];
		dst += ps*sdb;
		src += ps*ps;
	}

	// remaining rows
	for (; k < kmax; k++) {
		for (b = 0; b < ps; b++)
			dst[ps*b] = src[b];
		dst += 1;
		src += ps;
	}
}
963
964
965
// Copy four rows x kmax columns between packed blocks when the source rows
// start offsetA rows into a group of four and may therefore continue in the
// following group (located 4*sda floats further).
//   B[i + 4*k] = row (offsetA + i) of column k, taken from A while
//   offsetA + i < 4 and from A + 4*sda otherwise.
// offsetA values 0, 1 and 2 are used as given; every other value is handled
// exactly like 3.
// NOTE(review): offsetA is presumably expected in [0,3] -- confirm with callers.
void kernel_spacp_nn_4_lib4(int kmax, int offsetA, float *A, int sda, float *B)
{
	const int ps = 4;
	const int shift = (offsetA == 0 || offsetA == 1 || offsetA == 2) ? offsetA : 3;
	const int from_cur = ps - shift;  // rows supplied by the current group
	const float *cur = A;
	const float *nxt = A + ps*sda;    // only dereferenced when shift > 0
	float *dst = B;
	int k, i;

	for (k = 0; k < kmax; k++) {
		for (i = 0; i < from_cur; i++)
			dst[i] = cur[shift + i];
		for (; i < ps; i++)
			dst[i] = nxt[i - from_cur];
		cur += ps;
		nxt += ps;
		dst += ps;
	}
}
1159
1160
1161
// Variable-size variant of kernel_spacp_nn_4_lib4.
//   m1 <= 0 : nothing is written
//   m1 >= 4 : forwarded to kernel_spacp_nn_4_lib4
//   otherwise only rows 0..m1-1 of every column of B are written:
//   B[i + 4*k] = row (first + i) of column k, taken from A while
//   first + i < 4 and from A + 4*sda otherwise.
// Choice of the first source row:
//   - a single row (m1 == 1) is always read at offsetA%4;
//   - for m1 == 2 or 3, offsetA values 0, 1 and 2 are used as given and every
//     other value is handled like 3.
// For offsetA in [0,3] both rules give first == offsetA.
// NOTE(review): the two rules differ for offsetA outside [0,3]; callers
// presumably never pass such values -- confirm.
void kernel_spacp_nn_4_vs_lib4(int kmax, int offsetA, float *A, int sda, float *B, int m1)
{
	const int ps = 4;

	if (m1 <= 0)
		return;

	if (m1 >= ps) {
		kernel_spacp_nn_4_lib4(kmax, offsetA, A, sda, B);
		return;
	}

	// here 1 <= m1 <= 3
	int first;
	if (m1 == 1)
		first = offsetA%ps;
	else if (offsetA == 0 || offsetA == 1 || offsetA == 2)
		first = offsetA;
	else
		first = 3;

	// rows supplied by the current group, never more than m1
	int from_cur = ps - first;
	if (from_cur > m1)
		from_cur = m1;

	const float *cur = A;
	const float *nxt = A + ps*sda;  // only dereferenced when rows spill over
	float *dst = B;
	int k, i;

	for (k = 0; k < kmax; k++) {
		for (i = 0; i < from_cur; i++)
			dst[i] = cur[first + i];
		for (; i < m1; i++)
			dst[i] = nxt[first + i - ps];
		cur += ps;
		nxt += ps;
		dst += ps;
	}
}
1457
1458
1459
1460