1 #include "common.h"
2 #include <stdbool.h>
3 
/*
 * Generic C TRMM micro-kernel, register-blocked in 4/2/1-row by 4/2/1-column
 * tiles.
 *
 * C is walked in column panels of 4, then 2, then 1 columns; inside a panel
 * the rows are walked in tiles of 4, then 2, then 1 rows.  For every tile the
 * kernel accumulates a[r] * b[c] over a run of consecutive k steps, multiplies
 * the sums by alpha and OVERWRITES the tile of C (C is never read).
 *
 * Data layout as consumed by this routine:
 *   ba - row tiles stored back to back; a tile with mr rows occupies bk*mr
 *        values, mr values per k step.
 *   bb - column panels stored back to back; a panel with nr columns occupies
 *        bk*nr values, nr values per k step.
 *   C  - element (r, c) of a tile is stored at C[r + c*ldc].
 *
 * `off` selects which k steps contribute to a tile:
 *   backwards (LEFT xor TRANSA): skip the first `off` steps, use bk-off steps;
 *   otherwise: use the first off+mr (LEFT) or off+nr (not LEFT) steps.
 * With LEFT, off restarts at `offset` for every column panel and grows by the
 * tile height after each tile.  Without LEFT it starts at -offset and, when
 * TRMMKERNEL is defined, grows by the panel width after each panel.
 *
 * Parameters:
 *   bm, bn  rows / columns of the C block to produce
 *   bk      length of the k dimension of the packed panels
 *   alpha   scale factor applied to every accumulated sum
 *   ba, bb  packed A and B panels (read only)
 *   C, ldc  output block and its column stride
 *   offset  initial k offset, see above
 *
 * Returns 0 always.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc, BLASLONG offset)
{
    BLASLONG i, j, k;
    FLOAT *C0, *C1, *C2, *C3, *ptrba, *ptrbb;

    /* resC_R accumulates column C, row R of the current tile. */
    FLOAT res0_0, res0_1, res0_2, res0_3;
    FLOAT res1_0, res1_1, res1_2, res1_3;
    FLOAT res2_0, res2_1, res2_2, res2_3;
    FLOAT res3_0, res3_1, res3_2, res3_3;

    FLOAT a0, a1;
    FLOAT b0, b1, b2, b3;

    BLASLONG off, temp;

    bool left;
    bool transposed;
    bool backwards;

#ifdef LEFT
    left = true;
#else
    left = false;
#endif

#ifdef TRANSA
    transposed = true;
#else
    transposed = false;
#endif

    backwards = left != transposed;

    /* Always give off a value: on the LEFT path it is reset per column panel
     * below, but it must never be read while indeterminate. */
    off = left ? 0 : -offset;

    for (j = 0; j < bn/4; j += 1) // column panels of width 4
    {
        C0 = C;
        C1 = C0 + ldc;
        C2 = C1 + ldc;
        C3 = C2 + ldc;

        if (left) {
            off = offset;
        }

        ptrba = ba;

        for (i = 0; i < bm/4; i += 1) // 4x4 tiles
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*4; // number of values in A
                ptrbb += off*4; // number of values in B
            }

            res0_0 = res0_1 = res0_2 = res0_3 = 0;
            res1_0 = res1_1 = res1_2 = res1_3 = 0;
            res2_0 = res2_1 = res2_2 = res2_3 = 0;
            res3_0 = res3_1 = res3_2 = res3_3 = 0;

            temp = backwards ? bk - off :
                        left ? off + 4 : // number of values in A
                               off + 4;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];
                b1 = ptrbb[1];
                b2 = ptrbb[2];
                b3 = ptrbb[3];

                a0 = ptrba[0];
                res0_0 += a0*b0;
                res1_0 += a0*b1;
                res2_0 += a0*b2;
                res3_0 += a0*b3;

                a1 = ptrba[1];
                res0_1 += a1*b0;
                res1_1 += a1*b1;
                res2_1 += a1*b2;
                res3_1 += a1*b3;

                a0 = ptrba[2];
                res0_2 += a0*b0;
                res1_2 += a0*b1;
                res2_2 += a0*b2;
                res3_2 += a0*b3;

                a1 = ptrba[3];
                res0_3 += a1*b0;
                res1_3 += a1*b1;
                res2_3 += a1*b2;
                res3_3 += a1*b3;

                ptrba += 4;
                ptrbb += 4;
            }

            res0_0 *= alpha;
            res0_1 *= alpha;
            res0_2 *= alpha;
            res0_3 *= alpha;

            res1_0 *= alpha;
            res1_1 *= alpha;
            res1_2 *= alpha;
            res1_3 *= alpha;

            res2_0 *= alpha;
            res2_1 *= alpha;
            res2_2 *= alpha;
            res2_3 *= alpha;

            res3_0 *= alpha;
            res3_1 *= alpha;
            res3_2 *= alpha;
            res3_3 *= alpha;

            C0[0] = res0_0;
            C0[1] = res0_1;
            C0[2] = res0_2;
            C0[3] = res0_3;

            C1[0] = res1_0;
            C1[1] = res1_1;
            C1[2] = res1_2;
            C1[3] = res1_3;

            C2[0] = res2_0;
            C2[1] = res2_1;
            C2[2] = res2_2;
            C2[3] = res2_3;

            C3[0] = res3_0;
            C3[1] = res3_1;
            C3[2] = res3_2;
            C3[3] = res3_3;

            /* Skip the unused remainder so ptrba lands on the next row tile. */
            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 4 : // number of values in A
                              temp - 4;  // number of values in B
                ptrba += temp*4;
                ptrbb += temp*4;
            }

            if (left) {
                off += 4; // number of values in A
            }

            C0 += 4;
            C1 += 4;
            C2 += 4;
            C3 += 4;
        }

        if (bm & 2) // 2x4 tile
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*2; // number of values in A
                ptrbb += off*4; // number of values in B
            }

            res0_0 = res0_1 = 0;
            res1_0 = res1_1 = 0;
            res2_0 = res2_1 = 0;
            res3_0 = res3_1 = 0;

            temp = backwards ? bk - off :
                        left ? off + 2 : // number of values in A
                               off + 4;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];
                b1 = ptrbb[1];
                b2 = ptrbb[2];
                b3 = ptrbb[3];

                a0 = ptrba[0];
                res0_0 += a0*b0;
                res1_0 += a0*b1;
                res2_0 += a0*b2;
                res3_0 += a0*b3;

                a1 = ptrba[1];
                res0_1 += a1*b0;
                res1_1 += a1*b1;
                res2_1 += a1*b2;
                res3_1 += a1*b3;

                ptrba += 2;
                ptrbb += 4;
            }

            res0_0 *= alpha;
            res0_1 *= alpha;

            res1_0 *= alpha;
            res1_1 *= alpha;

            res2_0 *= alpha;
            res2_1 *= alpha;

            res3_0 *= alpha;
            res3_1 *= alpha;

            C0[0] = res0_0;
            C0[1] = res0_1;

            C1[0] = res1_0;
            C1[1] = res1_1;

            C2[0] = res2_0;
            C2[1] = res2_1;

            C3[0] = res3_0;
            C3[1] = res3_1;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 2 : // number of values in A
                              temp - 4;  // number of values in B
                ptrba += temp*2;
                ptrbb += temp*4;
            }

            if (left) {
                off += 2; // number of values in A
            }

            C0 += 2;
            C1 += 2;
            C2 += 2;
            C3 += 2;
        }

        if (bm & 1) // 1x4 tile
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*1; // number of values in A
                ptrbb += off*4; // number of values in B
            }

            res0_0 = 0;
            res1_0 = 0;
            res2_0 = 0;
            res3_0 = 0;

            temp = backwards ? bk - off :
                        left ? off + 1 : // number of values in A
                               off + 4;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];
                b1 = ptrbb[1];
                b2 = ptrbb[2];
                b3 = ptrbb[3];

                a0 = ptrba[0];
                res0_0 += a0*b0;
                res1_0 += a0*b1;
                res2_0 += a0*b2;
                res3_0 += a0*b3;

                ptrba += 1;
                ptrbb += 4;
            }

            res0_0 *= alpha;
            res1_0 *= alpha;
            res2_0 *= alpha;
            res3_0 *= alpha;

            C0[0] = res0_0;
            C1[0] = res1_0;
            C2[0] = res2_0;
            C3[0] = res3_0;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 1 : // number of values in A
                              temp - 4;  // number of values in B
                ptrba += temp*1;
                ptrbb += temp*4;
            }

            if (left) {
                off += 1; // number of values in A
            }

            C0 += 1;
            C1 += 1;
            C2 += 1;
            C3 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 4; // number of values in B
#endif

        /* Advance to the next column panel. */
        bb += bk*4;
        C += ldc*4;
    }

    for (j = 0; j < (bn&2); j += 2) // column panel of width 2 (at most once)
    {
        C0 = C;
        C1 = C0 + ldc;

        if (left) {
            off = offset;
        }

        ptrba = ba;

        for (i = 0; i < bm/4; i += 1) // 4x2 tiles
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*4; // number of values in A
                ptrbb += off*2; // number of values in B
            }

            res0_0 = res0_1 = res0_2 = res0_3 = 0;
            res1_0 = res1_1 = res1_2 = res1_3 = 0;

            temp = backwards ? bk - off :
                        left ? off + 4 : // number of values in A
                               off + 2;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];
                b1 = ptrbb[1];

                a0 = ptrba[0];
                res0_0 += a0*b0;
                res1_0 += a0*b1;

                a1 = ptrba[1];
                res0_1 += a1*b0;
                res1_1 += a1*b1;

                a0 = ptrba[2];
                res0_2 += a0*b0;
                res1_2 += a0*b1;

                a1 = ptrba[3];
                res0_3 += a1*b0;
                res1_3 += a1*b1;

                ptrba += 4;
                ptrbb += 2;
            }

            res0_0 *= alpha;
            res0_1 *= alpha;
            res0_2 *= alpha;
            res0_3 *= alpha;

            res1_0 *= alpha;
            res1_1 *= alpha;
            res1_2 *= alpha;
            res1_3 *= alpha;

            C0[0] = res0_0;
            C0[1] = res0_1;
            C0[2] = res0_2;
            C0[3] = res0_3;

            C1[0] = res1_0;
            C1[1] = res1_1;
            C1[2] = res1_2;
            C1[3] = res1_3;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 4 : // number of values in A
                              temp - 2;  // number of values in B
                ptrba += temp*4;
                ptrbb += temp*2;
            }

            if (left) {
                off += 4; // number of values in A
            }

            C0 += 4;
            C1 += 4;
        }

        if (bm & 2) // 2x2 tile
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*2; // number of values in A
                ptrbb += off*2; // number of values in B
            }

            res0_0 = res0_1 = 0;
            res1_0 = res1_1 = 0;

            temp = backwards ? bk - off :
                        left ? off + 2 : // number of values in A
                               off + 2;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];
                b1 = ptrbb[1];

                a0 = ptrba[0];
                res0_0 += a0*b0;
                res1_0 += a0*b1;

                a1 = ptrba[1];
                res0_1 += a1*b0;
                res1_1 += a1*b1;

                ptrba += 2;
                ptrbb += 2;
            }

            res0_0 *= alpha;
            res0_1 *= alpha;

            res1_0 *= alpha;
            res1_1 *= alpha;

            C0[0] = res0_0;
            C0[1] = res0_1;

            C1[0] = res1_0;
            C1[1] = res1_1;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 2 : // number of values in A
                              temp - 2;  // number of values in B
                ptrba += temp*2;
                ptrbb += temp*2;
            }

            if (left) {
                off += 2; // number of values in A
            }

            C0 += 2;
            C1 += 2;
        }

        if (bm & 1) // 1x2 tile
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*1; // number of values in A
                ptrbb += off*2; // number of values in B
            }

            res0_0 = 0;
            res1_0 = 0;

            temp = backwards ? bk - off :
                        left ? off + 1 : // number of values in A
                               off + 2;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];
                b1 = ptrbb[1];

                a0 = ptrba[0];
                res0_0 += a0*b0;
                res1_0 += a0*b1;

                ptrba += 1;
                ptrbb += 2;
            }

            res0_0 *= alpha;
            res1_0 *= alpha;

            C0[0] = res0_0;
            C1[0] = res1_0;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 1 : // number of values in A
                              temp - 2;  // number of values in B
                ptrba += temp*1;
                ptrbb += temp*2;
            }

            if (left) {
                off += 1; // number of values in A
            }

            C0 += 1;
            C1 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2; // number of values in B
#endif

        /* Advance to the next column panel. */
        bb += bk*2;
        C += ldc*2;
    }

    for (j = 0; j < (bn&1); j += 1) // column panel of width 1 (at most once)
    {
        C0 = C;

        if (left) {
            off = offset;
        }

        ptrba = ba;

        for (i = 0; i < bm/4; i += 1) // 4x1 tiles
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*4; // number of values in A
                ptrbb += off*1; // number of values in B
            }

            res0_0 = res0_1 = res0_2 = res0_3 = 0;

            temp = backwards ? bk - off :
                        left ? off + 4 : // number of values in A
                               off + 1;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];

                a0 = ptrba[0];
                res0_0 += a0*b0;

                a1 = ptrba[1];
                res0_1 += a1*b0;

                a0 = ptrba[2];
                res0_2 += a0*b0;

                a1 = ptrba[3];
                res0_3 += a1*b0;

                ptrba += 4;
                ptrbb += 1;
            }

            res0_0 *= alpha;
            res0_1 *= alpha;
            res0_2 *= alpha;
            res0_3 *= alpha;

            C0[0] = res0_0;
            C0[1] = res0_1;
            C0[2] = res0_2;
            C0[3] = res0_3;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 4 : // number of values in A
                              temp - 1;  // number of values in B
                ptrba += temp*4;
                ptrbb += temp*1;
            }

            if (left) {
                off += 4; // number of values in A
            }

            C0 += 4;
        }

        if (bm & 2) // 2x1 tile
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*2; // number of values in A
                ptrbb += off*1; // number of values in B
            }

            res0_0 = res0_1 = 0;

            temp = backwards ? bk - off :
                        left ? off + 2 : // number of values in A
                               off + 1;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];

                a0 = ptrba[0];
                res0_0 += a0*b0;

                a1 = ptrba[1];
                res0_1 += a1*b0;

                ptrba += 2;
                ptrbb += 1;
            }

            res0_0 *= alpha;
            res0_1 *= alpha;

            C0[0] = res0_0;
            C0[1] = res0_1;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 2 : // number of values in A
                              temp - 1;  // number of values in B
                ptrba += temp*2;
                ptrbb += temp*1;
            }

            if (left) {
                off += 2; // number of values in A
            }

            C0 += 2;
        }

        if (bm & 1) // 1x1 tile
        {
            ptrbb = bb;
            if (backwards) {
                ptrba += off*1; // number of values in A
                ptrbb += off*1; // number of values in B
            }

            res0_0 = 0;

            temp = backwards ? bk - off :
                        left ? off + 1 : // number of values in A
                               off + 1;  // number of values in B

            for (k = 0; k < temp; k++)
            {
                b0 = ptrbb[0];

                a0 = ptrba[0];
                res0_0 += a0*b0;

                ptrba += 1;
                ptrbb += 1;
            }

            res0_0 *= alpha;

            C0[0] = res0_0;

            if (!backwards) {
                temp = bk - off;
                temp = left ? temp - 1 : // number of values in A
                              temp - 1;  // number of values in B
                ptrba += temp*1;
                ptrbb += temp*1;
            }

            if (left) {
                off += 1; // number of values in A
            }

            C0 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 1; // number of values in B
#endif

        /* Advance to the next column panel. */
        bb += bk;
        C += ldc;
    }

    return 0;
}
876