1 #include "common.h"
2 #include <stdbool.h>
3
/*
 * Computes one mr x nr tile and stores it, scaled by alpha, into C.
 *
 * mr, nr     tile height and width; callers only pass 4, 2 or 1.
 * bk         full depth of the packed panels.
 * alpha      factor applied to each accumulated value before it is stored.
 * pa         in/out cursor into the packed A data (mr values per k step).
 *            When the trip count is non-negative the cursor ends up bk*mr
 *            values past where the tile started.
 * bb         start of the packed B data for the current column panel
 *            (nr values per k step).
 * c          destination of tile element (0, 0); element (m, n) is written
 *            to c[n*ldc + m].  Existing contents of C are overwritten, not
 *            accumulated into.
 * off        in/out running offset; selects which part of the k range
 *            contributes.  Advanced by mr when `left` is set.
 * left       true when the file is compiled with LEFT.
 * backwards  true when exactly one of LEFT / TRANSA is defined: the first
 *            `off` k steps are skipped instead of the trailing ones.
 */
static inline void trmm4x4_tile(BLASLONG mr, BLASLONG nr, BLASLONG bk,
                                FLOAT alpha, FLOAT **pa, FLOAT *bb,
                                FLOAT *c, BLASLONG ldc, BLASLONG *off,
                                bool left, bool backwards)
{
	FLOAT acc[4][4];	/* acc[n][m]; the largest tile is 4x4 */
	FLOAT *ptrba = *pa;
	FLOAT *ptrbb = bb;
	const BLASLONG start = *off;
	const BLASLONG width = left ? mr : nr;	/* values of the side that owns `off` */
	BLASLONG depth;
	BLASLONG m, n, k;

	if (backwards) {
		/* Skip the leading `start` k steps of both packed panels. */
		ptrba += start * mr;
		ptrbb += start * nr;
		depth = bk - start;
	} else {
		depth = start + width;
	}

	for (n = 0; n < nr; n++) {
		for (m = 0; m < mr; m++) {
			acc[n][m] = 0;
		}
	}

	/* Each element is accumulated over k in increasing order. */
	for (k = 0; k < depth; k++) {
		for (n = 0; n < nr; n++) {
			const FLOAT b = ptrbb[n];

			for (m = 0; m < mr; m++) {
				acc[n][m] += ptrba[m] * b;
			}
		}
		ptrba += mr;
		ptrbb += nr;
	}

	/* Scale, then store column by column (same store order as before). */
	for (n = 0; n < nr; n++) {
		for (m = 0; m < mr; m++) {
			acc[n][m] *= alpha;
			c[n * ldc + m] = acc[n][m];
		}
	}

	if (!backwards) {
		/* Step over the k steps that were not consumed by this tile. */
		ptrba += (bk - start - width) * mr;
	}

	if (left) {
		*off = start + mr;
	}

	*pa = ptrba;
}

/*
 * Processes one column panel of width nr: walks the bm rows in tiles of
 * height 4, then an optional tile of height 2, then an optional tile of
 * height 1.
 *
 * With `left` set, *off is reset to `offset` before the first tile;
 * otherwise *off is advanced by nr after the last tile.
 */
static inline void trmm4x4_panel(BLASLONG nr, BLASLONG bm, BLASLONG bk,
                                 FLOAT alpha, FLOAT *ba, FLOAT *bb,
                                 FLOAT *C, BLASLONG ldc, BLASLONG offset,
                                 BLASLONG *off, bool left, bool backwards)
{
	FLOAT *ptrba = ba;
	FLOAT *c = C;
	BLASLONG i;

	if (left) {
		*off = offset;
	}

	for (i = 0; i < bm / 4; i++) {
		trmm4x4_tile(4, nr, bk, alpha, &ptrba, bb, c, ldc, off, left, backwards);
		c += 4;
	}

	if (bm & 2) {
		trmm4x4_tile(2, nr, bk, alpha, &ptrba, bb, c, ldc, off, left, backwards);
		c += 2;
	}

	if (bm & 1) {
		trmm4x4_tile(1, nr, bk, alpha, &ptrba, bb, c, ldc, off, left, backwards);
	}

	if (!left) {
		*off += nr;
	}
}

/*
 * Generic C TRMM micro-kernel with 4x4 blocking.
 *
 * bm, bn   number of rows / columns of the C block to produce.
 * bk       depth of the packed panels.
 * alpha    scale factor applied to every result element.
 * ba       packed A data; read in slices of 4, 2 and 1 values per k step.
 * bb       packed B data; read in panels of 4, 2 and 1 values per k step.
 * C        output; element (m, n) lives at C[n*ldc + m] and is overwritten.
 * ldc      distance in elements between consecutive columns of C.
 * offset   initial value for the running offset that limits the k range
 *          (negated when LEFT is not defined).
 *
 * Always returns 0.
 *
 * The LEFT and TRANSA compile-time macros select the variant.
 *
 * NOTE(review): the offset bookkeeping is applied unconditionally, as the
 * 4-wide panel already did.  This matches the previous behaviour whenever
 * TRMMKERNEL is defined; presumably this file is never built without it --
 * confirm against the build rules.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc, BLASLONG offset)
{
	BLASLONG j;
	BLASLONG off;
	bool left;
	bool transposed;
	bool backwards;

#ifdef LEFT
	left = true;
#else
	left = false;
#endif

#ifdef TRANSA
	transposed = true;
#else
	transposed = false;
#endif

	backwards = left != transposed;

	/* With LEFT the offset is re-seeded per panel; give it a defined value anyway. */
	off = left ? offset : -offset;

	/* Column panels of width 4. */
	for (j = 0; j < bn / 4; j++) {
		trmm4x4_panel(4, bm, bk, alpha, ba, bb, C, ldc, offset, &off, left, backwards);
		bb += bk * 4;
		C += ldc * 4;
	}

	/* Optional column panel of width 2. */
	if (bn & 2) {
		trmm4x4_panel(2, bm, bk, alpha, ba, bb, C, ldc, offset, &off, left, backwards);
		bb += bk * 2;
		C += ldc * 2;
	}

	/* Optional column panel of width 1. */
	if (bn & 1) {
		trmm4x4_panel(1, bm, bk, alpha, ba, bb, C, ldc, offset, &off, left, backwards);
	}

	return 0;
}
876