1 #include "common.h"
2
3 int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
4 {
5
6 BLASLONG i,j,k;
7 FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
8
9 FLOAT res0_0;
10 FLOAT res0_1;
11 FLOAT res0_2;
12 FLOAT res0_3;
13 FLOAT res0_4;
14 FLOAT res0_5;
15 FLOAT res0_6;
16 FLOAT res0_7;
17
18 FLOAT res0_8;
19 FLOAT res0_9;
20 FLOAT res0_10;
21 FLOAT res0_11;
22 FLOAT res0_12;
23 FLOAT res0_13;
24 FLOAT res0_14;
25 FLOAT res0_15;
26
27 FLOAT res1_0;
28 FLOAT res1_1;
29 FLOAT res1_2;
30 FLOAT res1_3;
31 FLOAT res1_4;
32 FLOAT res1_5;
33 FLOAT res1_6;
34 FLOAT res1_7;
35
36 FLOAT res1_8;
37 FLOAT res1_9;
38 FLOAT res1_10;
39 FLOAT res1_11;
40 FLOAT res1_12;
41 FLOAT res1_13;
42 FLOAT res1_14;
43 FLOAT res1_15;
44
45 FLOAT res2_0;
46 FLOAT res2_1;
47 FLOAT res2_2;
48 FLOAT res2_3;
49 FLOAT res2_4;
50 FLOAT res2_5;
51 FLOAT res2_6;
52 FLOAT res2_7;
53
54 FLOAT res2_8;
55 FLOAT res2_9;
56 FLOAT res2_10;
57 FLOAT res2_11;
58 FLOAT res2_12;
59 FLOAT res2_13;
60 FLOAT res2_14;
61 FLOAT res2_15;
62
63 FLOAT res3_0;
64 FLOAT res3_1;
65 FLOAT res3_2;
66 FLOAT res3_3;
67 FLOAT res3_4;
68 FLOAT res3_5;
69 FLOAT res3_6;
70 FLOAT res3_7;
71
72 FLOAT res3_8;
73 FLOAT res3_9;
74 FLOAT res3_10;
75 FLOAT res3_11;
76 FLOAT res3_12;
77 FLOAT res3_13;
78 FLOAT res3_14;
79 FLOAT res3_15;
80
81 FLOAT a0;
82 FLOAT a1;
83
84 FLOAT b0;
85 FLOAT b1;
86 FLOAT b2;
87 FLOAT b3;
88
89 BLASLONG off, temp;
90
91 #if !defined(LEFT)
92 off = -offset;
93 #else
94 off = 0;
95 #endif
96
97 for (j=0; j<bn/4; j+=1)
98 {
99 C0 = C;
100 C1 = C0+ldc;
101 C2 = C0+2*ldc;
102 C3 = C0+3*ldc;
103
104 #if defined(TRMMKERNEL) && defined(LEFT)
105 off = offset;
106 #endif
107
108
109 ptrba = ba;
110
111
112 for (i=0; i<bm/16; i+=1)
113 {
114
115 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
116 ptrbb = bb;
117 #else
118 ptrba += off*16;
119 ptrbb = bb + off*4;
120 #endif
121
122 res0_0 = 0;
123 res0_1 = 0;
124 res0_2 = 0;
125 res0_3 = 0;
126 res0_4 = 0;
127 res0_5 = 0;
128 res0_6 = 0;
129 res0_7 = 0;
130
131 res0_8 = 0;
132 res0_9 = 0;
133 res0_10 = 0;
134 res0_11 = 0;
135 res0_12 = 0;
136 res0_13 = 0;
137 res0_14 = 0;
138 res0_15 = 0;
139
140 res1_0 = 0;
141 res1_1 = 0;
142 res1_2 = 0;
143 res1_3 = 0;
144 res1_4 = 0;
145 res1_5 = 0;
146 res1_6 = 0;
147 res1_7 = 0;
148
149 res1_8 = 0;
150 res1_9 = 0;
151 res1_10 = 0;
152 res1_11 = 0;
153 res1_12 = 0;
154 res1_13 = 0;
155 res1_14 = 0;
156 res1_15 = 0;
157
158 res2_0 = 0;
159 res2_1 = 0;
160 res2_2 = 0;
161 res2_3 = 0;
162 res2_4 = 0;
163 res2_5 = 0;
164 res2_6 = 0;
165 res2_7 = 0;
166
167 res2_8 = 0;
168 res2_9 = 0;
169 res2_10 = 0;
170 res2_11 = 0;
171 res2_12 = 0;
172 res2_13 = 0;
173 res2_14 = 0;
174 res2_15 = 0;
175
176 res3_0 = 0;
177 res3_1 = 0;
178 res3_2 = 0;
179 res3_3 = 0;
180 res3_4 = 0;
181 res3_5 = 0;
182 res3_6 = 0;
183 res3_7 = 0;
184
185 res3_8 = 0;
186 res3_9 = 0;
187 res3_10 = 0;
188 res3_11 = 0;
189 res3_12 = 0;
190 res3_13 = 0;
191 res3_14 = 0;
192 res3_15 = 0;
193
194
195 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
196 temp = bk-off;
197 #elif defined(LEFT)
198 temp = off+16; // number of values in A
199 #else
200 temp = off+4; // number of values in B
201 #endif
202
203 for (k=0; k<temp; k++)
204 {
205 b0 = ptrbb[0];
206 b1 = ptrbb[1];
207 b2 = ptrbb[2];
208 b3 = ptrbb[3];
209
210 a0 = ptrba[0];
211 res0_0 += a0*b0;
212 res1_0 += a0*b1;
213 res2_0 += a0*b2;
214 res3_0 += a0*b3;
215
216 a1 = ptrba[1];
217 res0_1 += a1*b0;
218 res1_1 += a1*b1;
219 res2_1 += a1*b2;
220 res3_1 += a1*b3;
221
222 a0 = ptrba[2];
223 res0_2 += a0*b0;
224 res1_2 += a0*b1;
225 res2_2 += a0*b2;
226 res3_2 += a0*b3;
227
228 a1 = ptrba[3];
229 res0_3 += a1*b0;
230 res1_3 += a1*b1;
231 res2_3 += a1*b2;
232 res3_3 += a1*b3;
233
234 a0 = ptrba[4];
235 res0_4 += a0*b0;
236 res1_4 += a0*b1;
237 res2_4 += a0*b2;
238 res3_4 += a0*b3;
239
240 a1 = ptrba[5];
241 res0_5 += a1*b0;
242 res1_5 += a1*b1;
243 res2_5 += a1*b2;
244 res3_5 += a1*b3;
245
246 a0 = ptrba[6];
247 res0_6 += a0*b0;
248 res1_6 += a0*b1;
249 res2_6 += a0*b2;
250 res3_6 += a0*b3;
251
252 a1 = ptrba[7];
253 res0_7 += a1*b0;
254 res1_7 += a1*b1;
255 res2_7 += a1*b2;
256 res3_7 += a1*b3;
257
258 a0 = ptrba[8];
259 res0_8 += a0*b0;
260 res1_8 += a0*b1;
261 res2_8 += a0*b2;
262 res3_8 += a0*b3;
263
264 a1 = ptrba[9];
265 res0_9 += a1*b0;
266 res1_9 += a1*b1;
267 res2_9 += a1*b2;
268 res3_9 += a1*b3;
269
270 a0 = ptrba[10];
271 res0_10 += a0*b0;
272 res1_10 += a0*b1;
273 res2_10 += a0*b2;
274 res3_10 += a0*b3;
275
276 a1 = ptrba[11];
277 res0_11 += a1*b0;
278 res1_11 += a1*b1;
279 res2_11 += a1*b2;
280 res3_11 += a1*b3;
281
282 a0 = ptrba[12];
283 res0_12 += a0*b0;
284 res1_12 += a0*b1;
285 res2_12 += a0*b2;
286 res3_12 += a0*b3;
287
288 a1 = ptrba[13];
289 res0_13 += a1*b0;
290 res1_13 += a1*b1;
291 res2_13 += a1*b2;
292 res3_13 += a1*b3;
293
294 a0 = ptrba[14];
295 res0_14 += a0*b0;
296 res1_14 += a0*b1;
297 res2_14 += a0*b2;
298 res3_14 += a0*b3;
299
300 a1 = ptrba[15];
301 res0_15 += a1*b0;
302 res1_15 += a1*b1;
303 res2_15 += a1*b2;
304 res3_15 += a1*b3;
305
306
307 ptrba = ptrba+16;
308 ptrbb = ptrbb+4;
309 }
310
311 res0_0 *= alpha;
312 res0_1 *= alpha;
313 res0_2 *= alpha;
314 res0_3 *= alpha;
315 res0_4 *= alpha;
316 res0_5 *= alpha;
317 res0_6 *= alpha;
318 res0_7 *= alpha;
319
320 res0_8 *= alpha;
321 res0_9 *= alpha;
322 res0_10 *= alpha;
323 res0_11 *= alpha;
324 res0_12 *= alpha;
325 res0_13 *= alpha;
326 res0_14 *= alpha;
327 res0_15 *= alpha;
328
329 res1_0 *= alpha;
330 res1_1 *= alpha;
331 res1_2 *= alpha;
332 res1_3 *= alpha;
333 res1_4 *= alpha;
334 res1_5 *= alpha;
335 res1_6 *= alpha;
336 res1_7 *= alpha;
337
338 res1_8 *= alpha;
339 res1_9 *= alpha;
340 res1_10 *= alpha;
341 res1_11 *= alpha;
342 res1_12 *= alpha;
343 res1_13 *= alpha;
344 res1_14 *= alpha;
345 res1_15 *= alpha;
346
347 res2_0 *= alpha;
348 res2_1 *= alpha;
349 res2_2 *= alpha;
350 res2_3 *= alpha;
351 res2_4 *= alpha;
352 res2_5 *= alpha;
353 res2_6 *= alpha;
354 res2_7 *= alpha;
355
356 res2_8 *= alpha;
357 res2_9 *= alpha;
358 res2_10 *= alpha;
359 res2_11 *= alpha;
360 res2_12 *= alpha;
361 res2_13 *= alpha;
362 res2_14 *= alpha;
363 res2_15 *= alpha;
364
365 res3_0 *= alpha;
366 res3_1 *= alpha;
367 res3_2 *= alpha;
368 res3_3 *= alpha;
369 res3_4 *= alpha;
370 res3_5 *= alpha;
371 res3_6 *= alpha;
372 res3_7 *= alpha;
373
374 res3_8 *= alpha;
375 res3_9 *= alpha;
376 res3_10 *= alpha;
377 res3_11 *= alpha;
378 res3_12 *= alpha;
379 res3_13 *= alpha;
380 res3_14 *= alpha;
381 res3_15 *= alpha;
382
383 C0[0] = res0_0;
384 C0[1] = res0_1;
385 C0[2] = res0_2;
386 C0[3] = res0_3;
387 C0[4] = res0_4;
388 C0[5] = res0_5;
389 C0[6] = res0_6;
390 C0[7] = res0_7;
391
392 C0[8] = res0_8;
393 C0[9] = res0_9;
394 C0[10] = res0_10;
395 C0[11] = res0_11;
396 C0[12] = res0_12;
397 C0[13] = res0_13;
398 C0[14] = res0_14;
399 C0[15] = res0_15;
400
401 C1[0] = res1_0;
402 C1[1] = res1_1;
403 C1[2] = res1_2;
404 C1[3] = res1_3;
405 C1[4] = res1_4;
406 C1[5] = res1_5;
407 C1[6] = res1_6;
408 C1[7] = res1_7;
409
410 C1[8] = res1_8;
411 C1[9] = res1_9;
412 C1[10] = res1_10;
413 C1[11] = res1_11;
414 C1[12] = res1_12;
415 C1[13] = res1_13;
416 C1[14] = res1_14;
417 C1[15] = res1_15;
418
419 C2[0] = res2_0;
420 C2[1] = res2_1;
421 C2[2] = res2_2;
422 C2[3] = res2_3;
423 C2[4] = res2_4;
424 C2[5] = res2_5;
425 C2[6] = res2_6;
426 C2[7] = res2_7;
427
428 C2[8] = res2_8;
429 C2[9] = res2_9;
430 C2[10] = res2_10;
431 C2[11] = res2_11;
432 C2[12] = res2_12;
433 C2[13] = res2_13;
434 C2[14] = res2_14;
435 C2[15] = res2_15;
436
437 C3[0] = res3_0;
438 C3[1] = res3_1;
439 C3[2] = res3_2;
440 C3[3] = res3_3;
441 C3[4] = res3_4;
442 C3[5] = res3_5;
443 C3[6] = res3_6;
444 C3[7] = res3_7;
445
446 C3[8] = res3_8;
447 C3[9] = res3_9;
448 C3[10] = res3_10;
449 C3[11] = res3_11;
450 C3[12] = res3_12;
451 C3[13] = res3_13;
452 C3[14] = res3_14;
453 C3[15] = res3_15;
454
455
456 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
457 temp = bk - off;
458 #ifdef LEFT
459 temp -= 16; // number of values in A
460 #else
461 temp -= 4; // number of values in B
462 #endif
463 ptrba += temp*16;
464 ptrbb += temp*4;
465 #endif
466
467 #ifdef LEFT
468 off += 16; // number of values in A
469 #endif
470
471 C0 = C0+16;
472 C1 = C1+16;
473 C2 = C2+16;
474 C3 = C3+16;
475 }
476
477
478 if ( bm & 8)
479 {
480
481 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
482 ptrbb = bb;
483 #else
484 ptrba += off*8;
485 ptrbb = bb + off*4;
486 #endif
487
488 res0_0 = 0;
489 res0_1 = 0;
490 res0_2 = 0;
491 res0_3 = 0;
492 res0_4 = 0;
493 res0_5 = 0;
494 res0_6 = 0;
495 res0_7 = 0;
496
497 res1_0 = 0;
498 res1_1 = 0;
499 res1_2 = 0;
500 res1_3 = 0;
501 res1_4 = 0;
502 res1_5 = 0;
503 res1_6 = 0;
504 res1_7 = 0;
505
506 res2_0 = 0;
507 res2_1 = 0;
508 res2_2 = 0;
509 res2_3 = 0;
510 res2_4 = 0;
511 res2_5 = 0;
512 res2_6 = 0;
513 res2_7 = 0;
514
515 res3_0 = 0;
516 res3_1 = 0;
517 res3_2 = 0;
518 res3_3 = 0;
519 res3_4 = 0;
520 res3_5 = 0;
521 res3_6 = 0;
522 res3_7 = 0;
523
524 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
525 temp = bk-off;
526 #elif defined(LEFT)
527 temp = off+8; // number of values in A
528 #else
529 temp = off+4; // number of values in B
530 #endif
531
532 for (k=0; k<temp; k++)
533 {
534 b0 = ptrbb[0];
535 b1 = ptrbb[1];
536 b2 = ptrbb[2];
537 b3 = ptrbb[3];
538
539 a0 = ptrba[0];
540 res0_0 += a0*b0;
541 res1_0 += a0*b1;
542 res2_0 += a0*b2;
543 res3_0 += a0*b3;
544
545 a1 = ptrba[1];
546 res0_1 += a1*b0;
547 res1_1 += a1*b1;
548 res2_1 += a1*b2;
549 res3_1 += a1*b3;
550
551 a0 = ptrba[2];
552 res0_2 += a0*b0;
553 res1_2 += a0*b1;
554 res2_2 += a0*b2;
555 res3_2 += a0*b3;
556
557 a1 = ptrba[3];
558 res0_3 += a1*b0;
559 res1_3 += a1*b1;
560 res2_3 += a1*b2;
561 res3_3 += a1*b3;
562
563 a0 = ptrba[4];
564 res0_4 += a0*b0;
565 res1_4 += a0*b1;
566 res2_4 += a0*b2;
567 res3_4 += a0*b3;
568
569 a1 = ptrba[5];
570 res0_5 += a1*b0;
571 res1_5 += a1*b1;
572 res2_5 += a1*b2;
573 res3_5 += a1*b3;
574
575 a0 = ptrba[6];
576 res0_6 += a0*b0;
577 res1_6 += a0*b1;
578 res2_6 += a0*b2;
579 res3_6 += a0*b3;
580
581 a1 = ptrba[7];
582 res0_7 += a1*b0;
583 res1_7 += a1*b1;
584 res2_7 += a1*b2;
585 res3_7 += a1*b3;
586
587 ptrba = ptrba+8;
588 ptrbb = ptrbb+4;
589
590 }
591
592 res0_0 *= alpha;
593 res0_1 *= alpha;
594 res0_2 *= alpha;
595 res0_3 *= alpha;
596 res0_4 *= alpha;
597 res0_5 *= alpha;
598 res0_6 *= alpha;
599 res0_7 *= alpha;
600
601 res1_0 *= alpha;
602 res1_1 *= alpha;
603 res1_2 *= alpha;
604 res1_3 *= alpha;
605 res1_4 *= alpha;
606 res1_5 *= alpha;
607 res1_6 *= alpha;
608 res1_7 *= alpha;
609
610 res2_0 *= alpha;
611 res2_1 *= alpha;
612 res2_2 *= alpha;
613 res2_3 *= alpha;
614 res2_4 *= alpha;
615 res2_5 *= alpha;
616 res2_6 *= alpha;
617 res2_7 *= alpha;
618
619 res3_0 *= alpha;
620 res3_1 *= alpha;
621 res3_2 *= alpha;
622 res3_3 *= alpha;
623 res3_4 *= alpha;
624 res3_5 *= alpha;
625 res3_6 *= alpha;
626 res3_7 *= alpha;
627
628 C0[0] = res0_0;
629 C0[1] = res0_1;
630 C0[2] = res0_2;
631 C0[3] = res0_3;
632 C0[4] = res0_4;
633 C0[5] = res0_5;
634 C0[6] = res0_6;
635 C0[7] = res0_7;
636
637 C1[0] = res1_0;
638 C1[1] = res1_1;
639 C1[2] = res1_2;
640 C1[3] = res1_3;
641 C1[4] = res1_4;
642 C1[5] = res1_5;
643 C1[6] = res1_6;
644 C1[7] = res1_7;
645
646 C2[0] = res2_0;
647 C2[1] = res2_1;
648 C2[2] = res2_2;
649 C2[3] = res2_3;
650 C2[4] = res2_4;
651 C2[5] = res2_5;
652 C2[6] = res2_6;
653 C2[7] = res2_7;
654
655 C3[0] = res3_0;
656 C3[1] = res3_1;
657 C3[2] = res3_2;
658 C3[3] = res3_3;
659 C3[4] = res3_4;
660 C3[5] = res3_5;
661 C3[6] = res3_6;
662 C3[7] = res3_7;
663
664 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
665 temp = bk - off;
666 #ifdef LEFT
667 temp -= 8; // number of values in A
668 #else
669 temp -= 4; // number of values in B
670 #endif
671 ptrba += temp*8;
672 ptrbb += temp*4;
673 #endif
674
675 #ifdef LEFT
676 off += 8; // number of values in A
677 #endif
678
679 C0 = C0+8;
680 C1 = C1+8;
681 C2 = C2+8;
682 C3 = C3+8;
683 }
684
685 if ( bm & 4 )
686 {
687
688 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
689 ptrbb = bb;
690 #else
691 ptrba += off*4;
692 ptrbb = bb + off*4;
693 #endif
694
695 res0_0 = 0;
696 res0_1 = 0;
697 res0_2 = 0;
698 res0_3 = 0;
699
700 res1_0 = 0;
701 res1_1 = 0;
702 res1_2 = 0;
703 res1_3 = 0;
704
705 res2_0 = 0;
706 res2_1 = 0;
707 res2_2 = 0;
708 res2_3 = 0;
709
710 res3_0 = 0;
711 res3_1 = 0;
712 res3_2 = 0;
713 res3_3 = 0;
714
715
716 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
717 temp = bk-off;
718 #elif defined(LEFT)
719 temp = off+4; // number of values in A
720 #else
721 temp = off+4; // number of values in B
722 #endif
723
724 for (k=0; k<temp; k++)
725 {
726 b0 = ptrbb[0];
727 b1 = ptrbb[1];
728 b2 = ptrbb[2];
729 b3 = ptrbb[3];
730
731 a0 = ptrba[0];
732 res0_0 += a0*b0;
733 res1_0 += a0*b1;
734 res2_0 += a0*b2;
735 res3_0 += a0*b3;
736
737 a1 = ptrba[1];
738 res0_1 += a1*b0;
739 res1_1 += a1*b1;
740 res2_1 += a1*b2;
741 res3_1 += a1*b3;
742
743 a0 = ptrba[2];
744 res0_2 += a0*b0;
745 res1_2 += a0*b1;
746 res2_2 += a0*b2;
747 res3_2 += a0*b3;
748
749 a1 = ptrba[3];
750 res0_3 += a1*b0;
751 res1_3 += a1*b1;
752 res2_3 += a1*b2;
753 res3_3 += a1*b3;
754
755 ptrba = ptrba+4;
756 ptrbb = ptrbb+4;
757 }
758
759 res0_0 *= alpha;
760 res0_1 *= alpha;
761 res0_2 *= alpha;
762 res0_3 *= alpha;
763
764 res1_0 *= alpha;
765 res1_1 *= alpha;
766 res1_2 *= alpha;
767 res1_3 *= alpha;
768
769 res2_0 *= alpha;
770 res2_1 *= alpha;
771 res2_2 *= alpha;
772 res2_3 *= alpha;
773
774 res3_0 *= alpha;
775 res3_1 *= alpha;
776 res3_2 *= alpha;
777 res3_3 *= alpha;
778
779 C0[0] = res0_0;
780 C0[1] = res0_1;
781 C0[2] = res0_2;
782 C0[3] = res0_3;
783
784 C1[0] = res1_0;
785 C1[1] = res1_1;
786 C1[2] = res1_2;
787 C1[3] = res1_3;
788
789
790 C2[0] = res2_0;
791 C2[1] = res2_1;
792 C2[2] = res2_2;
793 C2[3] = res2_3;
794
795 C3[0] = res3_0;
796 C3[1] = res3_1;
797 C3[2] = res3_2;
798 C3[3] = res3_3;
799
800 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
801 temp = bk - off;
802 #ifdef LEFT
803 temp -= 4; // number of values in A
804 #else
805 temp -= 4; // number of values in B
806 #endif
807 ptrba += temp*4;
808 ptrbb += temp*4;
809 #endif
810
811 #ifdef LEFT
812 off += 4; // number of values in A
813 #endif
814
815 C0 = C0+4;
816 C1 = C1+4;
817 C2 = C2+4;
818 C3 = C3+4;
819 }
820
821 if ( bm & 2 )
822 {
823
824 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
825 ptrbb = bb;
826 #else
827 ptrba += off*2;
828 ptrbb = bb + off*4;
829 #endif
830
831 res0_0 = 0;
832 res0_1 = 0;
833
834 res1_0 = 0;
835 res1_1 = 0;
836
837 res2_0 = 0;
838 res2_1 = 0;
839
840 res3_0 = 0;
841 res3_1 = 0;
842
843
844 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
845 temp = bk-off;
846 #elif defined(LEFT)
847 temp = off+2; // number of values in A
848 #else
849 temp = off+4; // number of values in B
850 #endif
851
852 for (k=0; k<temp; k++)
853 {
854 b0 = ptrbb[0];
855 b1 = ptrbb[1];
856 b2 = ptrbb[2];
857 b3 = ptrbb[3];
858
859 a0 = ptrba[0];
860 res0_0 += a0*b0;
861 res1_0 += a0*b1;
862 res2_0 += a0*b2;
863 res3_0 += a0*b3;
864
865 a1 = ptrba[1];
866 res0_1 += a1*b0;
867 res1_1 += a1*b1;
868 res2_1 += a1*b2;
869 res3_1 += a1*b3;
870
871 ptrba = ptrba+2;
872 ptrbb = ptrbb+4;
873 }
874
875 res0_0 *= alpha;
876 res0_1 *= alpha;
877
878 res1_0 *= alpha;
879 res1_1 *= alpha;
880
881 res2_0 *= alpha;
882 res2_1 *= alpha;
883
884 res3_0 *= alpha;
885 res3_1 *= alpha;
886
887 C0[0] = res0_0;
888 C0[1] = res0_1;
889
890 C1[0] = res1_0;
891 C1[1] = res1_1;
892
893 C2[0] = res2_0;
894 C2[1] = res2_1;
895
896 C3[0] = res3_0;
897 C3[1] = res3_1;
898
899
900 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
901 temp = bk - off;
902 #ifdef LEFT
903 temp -= 2; // number of values in A
904 #else
905 temp -= 4; // number of values in B
906 #endif
907 ptrba += temp*2;
908 ptrbb += temp*4;
909 #endif
910
911 #ifdef LEFT
912 off += 2; // number of values in A
913 #endif
914
915 C0 = C0+2;
916 C1 = C1+2;
917 C2 = C2+2;
918 C3 = C3+2;
919 }
920
921 if ( bm & 1 )
922 {
923
924 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
925 ptrbb = bb;
926 #else
927 ptrba += off*1;
928 ptrbb = bb + off*4;
929 #endif
930
931 res0_0 = 0;
932 res1_0 = 0;
933 res2_0 = 0;
934 res3_0 = 0;
935
936
937 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
938 temp = bk-off;
939 #elif defined(LEFT)
940 temp = off+1; // number of values in A
941 #else
942 temp = off+4; // number of values in B
943 #endif
944
945 for (k=0; k<temp; k++)
946 {
947 b0 = ptrbb[0];
948 b1 = ptrbb[1];
949 b2 = ptrbb[2];
950 b3 = ptrbb[3];
951
952 a0 = ptrba[0];
953 res0_0 += a0*b0;
954 res1_0 += a0*b1;
955 res2_0 += a0*b2;
956 res3_0 += a0*b3;
957
958 ptrba = ptrba+1;
959 ptrbb = ptrbb+4;
960 }
961 res0_0 *= alpha;
962 res1_0 *= alpha;
963 res2_0 *= alpha;
964 res3_0 *= alpha;
965
966 C0[0] = res0_0;
967 C1[0] = res1_0;
968 C2[0] = res2_0;
969 C3[0] = res3_0;
970
971 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
972 temp = bk - off;
973 #ifdef LEFT
974 temp -= 1; // number of values in A
975 #else
976 temp -= 4; // number of values in B
977 #endif
978 ptrba += temp*1;
979 ptrbb += temp*4;
980 #endif
981
982 #ifdef LEFT
983 off += 1; // number of values in A
984 #endif
985
986 C0 = C0+1;
987 C1 = C1+1;
988 C2 = C2+1;
989 C3 = C3+1;
990
991 }
992
993
994 #if defined(TRMMKERNEL) && !defined(LEFT)
995 off += 4;
996 #endif
997
998 k = (bk<<2);
999 bb = bb+k;
1000 i = (ldc<<2);
1001 C = C+i;
1002 }
1003
1004
1005 if(bn&2)
1006 {
1007 C0 = C;
1008 C1 = C0+ldc;
1009
1010 #if defined(TRMMKERNEL) && defined(LEFT)
1011 off = offset;
1012 #endif
1013
1014
1015 ptrba = ba;
1016
1017
1018 for (i=0; i<bm/16; i+=1)
1019 {
1020
1021 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1022 ptrbb = bb;
1023 #else
1024 ptrba += off*16;
1025 ptrbb = bb + off*2;
1026 #endif
1027
1028 res0_0 = 0;
1029 res0_1 = 0;
1030 res0_2 = 0;
1031 res0_3 = 0;
1032 res0_4 = 0;
1033 res0_5 = 0;
1034 res0_6 = 0;
1035 res0_7 = 0;
1036
1037 res0_8 = 0;
1038 res0_9 = 0;
1039 res0_10 = 0;
1040 res0_11 = 0;
1041 res0_12 = 0;
1042 res0_13 = 0;
1043 res0_14 = 0;
1044 res0_15 = 0;
1045
1046 res1_0 = 0;
1047 res1_1 = 0;
1048 res1_2 = 0;
1049 res1_3 = 0;
1050 res1_4 = 0;
1051 res1_5 = 0;
1052 res1_6 = 0;
1053 res1_7 = 0;
1054
1055 res1_8 = 0;
1056 res1_9 = 0;
1057 res1_10 = 0;
1058 res1_11 = 0;
1059 res1_12 = 0;
1060 res1_13 = 0;
1061 res1_14 = 0;
1062 res1_15 = 0;
1063
1064
1065
1066
1067 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1068 temp = bk-off;
1069 #elif defined(LEFT)
1070 temp = off+16; // number of values in A
1071 #else
1072 temp = off+2; // number of values in B
1073 #endif
1074
1075 for (k=0; k<temp; k++)
1076 {
1077 b0 = ptrbb[0];
1078 b1 = ptrbb[1];
1079
1080 a0 = ptrba[0];
1081 res0_0 += a0*b0;
1082 res1_0 += a0*b1;
1083
1084 a1 = ptrba[1];
1085 res0_1 += a1*b0;
1086 res1_1 += a1*b1;
1087
1088 a0 = ptrba[2];
1089 res0_2 += a0*b0;
1090 res1_2 += a0*b1;
1091
1092 a1 = ptrba[3];
1093 res0_3 += a1*b0;
1094 res1_3 += a1*b1;
1095
1096 a0 = ptrba[4];
1097 res0_4 += a0*b0;
1098 res1_4 += a0*b1;
1099
1100 a1 = ptrba[5];
1101 res0_5 += a1*b0;
1102 res1_5 += a1*b1;
1103
1104 a0 = ptrba[6];
1105 res0_6 += a0*b0;
1106 res1_6 += a0*b1;
1107
1108 a1 = ptrba[7];
1109 res0_7 += a1*b0;
1110 res1_7 += a1*b1;
1111
1112 a0 = ptrba[8];
1113 res0_8 += a0*b0;
1114 res1_8 += a0*b1;
1115
1116 a1 = ptrba[9];
1117 res0_9 += a1*b0;
1118 res1_9 += a1*b1;
1119
1120 a0 = ptrba[10];
1121 res0_10 += a0*b0;
1122 res1_10 += a0*b1;
1123
1124 a1 = ptrba[11];
1125 res0_11 += a1*b0;
1126 res1_11 += a1*b1;
1127
1128 a0 = ptrba[12];
1129 res0_12 += a0*b0;
1130 res1_12 += a0*b1;
1131
1132 a1 = ptrba[13];
1133 res0_13 += a1*b0;
1134 res1_13 += a1*b1;
1135
1136 a0 = ptrba[14];
1137 res0_14 += a0*b0;
1138 res1_14 += a0*b1;
1139
1140 a1 = ptrba[15];
1141 res0_15 += a1*b0;
1142 res1_15 += a1*b1;
1143
1144
1145 ptrba = ptrba+16;
1146 ptrbb = ptrbb+2;
1147 }
1148
1149 res0_0 *= alpha;
1150 res0_1 *= alpha;
1151 res0_2 *= alpha;
1152 res0_3 *= alpha;
1153 res0_4 *= alpha;
1154 res0_5 *= alpha;
1155 res0_6 *= alpha;
1156 res0_7 *= alpha;
1157
1158 res0_8 *= alpha;
1159 res0_9 *= alpha;
1160 res0_10 *= alpha;
1161 res0_11 *= alpha;
1162 res0_12 *= alpha;
1163 res0_13 *= alpha;
1164 res0_14 *= alpha;
1165 res0_15 *= alpha;
1166
1167 res1_0 *= alpha;
1168 res1_1 *= alpha;
1169 res1_2 *= alpha;
1170 res1_3 *= alpha;
1171 res1_4 *= alpha;
1172 res1_5 *= alpha;
1173 res1_6 *= alpha;
1174 res1_7 *= alpha;
1175
1176 res1_8 *= alpha;
1177 res1_9 *= alpha;
1178 res1_10 *= alpha;
1179 res1_11 *= alpha;
1180 res1_12 *= alpha;
1181 res1_13 *= alpha;
1182 res1_14 *= alpha;
1183 res1_15 *= alpha;
1184
1185 C0[0] = res0_0;
1186 C0[1] = res0_1;
1187 C0[2] = res0_2;
1188 C0[3] = res0_3;
1189 C0[4] = res0_4;
1190 C0[5] = res0_5;
1191 C0[6] = res0_6;
1192 C0[7] = res0_7;
1193
1194 C0[8] = res0_8;
1195 C0[9] = res0_9;
1196 C0[10] = res0_10;
1197 C0[11] = res0_11;
1198 C0[12] = res0_12;
1199 C0[13] = res0_13;
1200 C0[14] = res0_14;
1201 C0[15] = res0_15;
1202
1203 C1[0] = res1_0;
1204 C1[1] = res1_1;
1205 C1[2] = res1_2;
1206 C1[3] = res1_3;
1207 C1[4] = res1_4;
1208 C1[5] = res1_5;
1209 C1[6] = res1_6;
1210 C1[7] = res1_7;
1211
1212 C1[8] = res1_8;
1213 C1[9] = res1_9;
1214 C1[10] = res1_10;
1215 C1[11] = res1_11;
1216 C1[12] = res1_12;
1217 C1[13] = res1_13;
1218 C1[14] = res1_14;
1219 C1[15] = res1_15;
1220
1221
1222
1223 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1224 temp = bk - off;
1225 #ifdef LEFT
1226 temp -= 16; // number of values in A
1227 #else
1228 temp -= 2; // number of values in B
1229 #endif
1230 ptrba += temp*16;
1231 ptrbb += temp*2;
1232 #endif
1233
1234 #ifdef LEFT
1235 off += 16; // number of values in A
1236 #endif
1237
1238 C0 = C0+16;
1239 C1 = C1+16;
1240 }
1241
1242
1243
1244
1245 if ( bm & 8)
1246 {
1247
1248 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1249 ptrbb = bb;
1250 #else
1251 ptrba += off*8;
1252 ptrbb = bb + off*2;
1253 #endif
1254
1255 res0_0 = 0;
1256 res0_1 = 0;
1257 res0_2 = 0;
1258 res0_3 = 0;
1259 res0_4 = 0;
1260 res0_5 = 0;
1261 res0_6 = 0;
1262 res0_7 = 0;
1263
1264 res1_0 = 0;
1265 res1_1 = 0;
1266 res1_2 = 0;
1267 res1_3 = 0;
1268 res1_4 = 0;
1269 res1_5 = 0;
1270 res1_6 = 0;
1271 res1_7 = 0;
1272
1273
1274
1275 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1276 temp = bk-off;
1277 #elif defined(LEFT)
1278 temp = off+8; // number of values in A
1279 #else
1280 temp = off+2; // number of values in B
1281 #endif
1282
1283 for (k=0; k<temp; k++)
1284 {
1285 b0 = ptrbb[0];
1286 b1 = ptrbb[1];
1287
1288 a0 = ptrba[0];
1289 res0_0 += a0*b0;
1290 res1_0 += a0*b1;
1291
1292 a1 = ptrba[1];
1293 res0_1 += a1*b0;
1294 res1_1 += a1*b1;
1295
1296 a0 = ptrba[2];
1297 res0_2 += a0*b0;
1298 res1_2 += a0*b1;
1299
1300 a1 = ptrba[3];
1301 res0_3 += a1*b0;
1302 res1_3 += a1*b1;
1303
1304 a0 = ptrba[4];
1305 res0_4 += a0*b0;
1306 res1_4 += a0*b1;
1307
1308 a1 = ptrba[5];
1309 res0_5 += a1*b0;
1310 res1_5 += a1*b1;
1311
1312 a0 = ptrba[6];
1313 res0_6 += a0*b0;
1314 res1_6 += a0*b1;
1315
1316 a1 = ptrba[7];
1317 res0_7 += a1*b0;
1318 res1_7 += a1*b1;
1319
1320 ptrba = ptrba+8;
1321 ptrbb = ptrbb+2;
1322 }
1323
1324 res0_0 *= alpha;
1325 res0_1 *= alpha;
1326 res0_2 *= alpha;
1327 res0_3 *= alpha;
1328 res0_4 *= alpha;
1329 res0_5 *= alpha;
1330 res0_6 *= alpha;
1331 res0_7 *= alpha;
1332
1333 res1_0 *= alpha;
1334 res1_1 *= alpha;
1335 res1_2 *= alpha;
1336 res1_3 *= alpha;
1337 res1_4 *= alpha;
1338 res1_5 *= alpha;
1339 res1_6 *= alpha;
1340 res1_7 *= alpha;
1341
1342 C0[0] = res0_0;
1343 C0[1] = res0_1;
1344 C0[2] = res0_2;
1345 C0[3] = res0_3;
1346 C0[4] = res0_4;
1347 C0[5] = res0_5;
1348 C0[6] = res0_6;
1349 C0[7] = res0_7;
1350
1351 C1[0] = res1_0;
1352 C1[1] = res1_1;
1353 C1[2] = res1_2;
1354 C1[3] = res1_3;
1355 C1[4] = res1_4;
1356 C1[5] = res1_5;
1357 C1[6] = res1_6;
1358 C1[7] = res1_7;
1359
1360
1361 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1362 temp = bk - off;
1363 #ifdef LEFT
1364 temp -= 8; // number of values in A
1365 #else
1366 temp -= 2; // number of values in B
1367 #endif
1368 ptrba += temp*8;
1369 ptrbb += temp*2;
1370 #endif
1371
1372 #ifdef LEFT
1373 off += 8; // number of values in A
1374 #endif
1375
1376 C0 = C0+8;
1377 C1 = C1+8;
1378 }
1379
1380 if ( bm & 4 )
1381 {
1382
1383 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1384 ptrbb = bb;
1385 #else
1386 ptrba += off*4;
1387 ptrbb = bb + off*2;
1388 #endif
1389
1390 res0_0 = 0;
1391 res0_1 = 0;
1392 res0_2 = 0;
1393 res0_3 = 0;
1394
1395 res1_0 = 0;
1396 res1_1 = 0;
1397 res1_2 = 0;
1398 res1_3 = 0;
1399
1400
1401 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1402 temp = bk-off;
1403 #elif defined(LEFT)
1404 temp = off+4; // number of values in A
1405 #else
1406 temp = off+2; // number of values in B
1407 #endif
1408
1409 for (k=0; k<temp; k++)
1410 {
1411 b0 = ptrbb[0];
1412 b1 = ptrbb[1];
1413
1414 a0 = ptrba[0];
1415 res0_0 += a0*b0;
1416 res1_0 += a0*b1;
1417
1418 a1 = ptrba[1];
1419 res0_1 += a1*b0;
1420 res1_1 += a1*b1;
1421
1422 a0 = ptrba[2];
1423 res0_2 += a0*b0;
1424 res1_2 += a0*b1;
1425
1426 a1 = ptrba[3];
1427 res0_3 += a1*b0;
1428 res1_3 += a1*b1;
1429
1430 ptrba = ptrba+4;
1431 ptrbb = ptrbb+2;
1432 }
1433
1434 res0_0 *= alpha;
1435 res0_1 *= alpha;
1436 res0_2 *= alpha;
1437 res0_3 *= alpha;
1438
1439 res1_0 *= alpha;
1440 res1_1 *= alpha;
1441 res1_2 *= alpha;
1442 res1_3 *= alpha;
1443
1444 C0[0] = res0_0;
1445 C0[1] = res0_1;
1446 C0[2] = res0_2;
1447 C0[3] = res0_3;
1448
1449 C1[0] = res1_0;
1450 C1[1] = res1_1;
1451 C1[2] = res1_2;
1452 C1[3] = res1_3;
1453
1454
1455 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1456 temp = bk - off;
1457 #ifdef LEFT
1458 temp -= 4; // number of values in A
1459 #else
1460 temp -= 2; // number of values in B
1461 #endif
1462 ptrba += temp*4;
1463 ptrbb += temp*2;
1464 #endif
1465
1466 #ifdef LEFT
1467 off += 4; // number of values in A
1468 #endif
1469
1470 C0 = C0+4;
1471 C1 = C1+4;
1472
1473 }
1474
1475 if ( bm & 2 )
1476 {
1477
1478 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1479 ptrbb = bb;
1480 #else
1481 ptrba += off*2;
1482 ptrbb = bb + off*2;
1483 #endif
1484
1485 res0_0 = 0;
1486 res0_1 = 0;
1487
1488 res1_0 = 0;
1489 res1_1 = 0;
1490
1491
1492 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1493 temp = bk-off;
1494 #elif defined(LEFT)
1495 temp = off+2; // number of values in A
1496 #else
1497 temp = off+2; // number of values in B
1498 #endif
1499
1500 for (k=0; k<temp; k++)
1501 {
1502 b0 = ptrbb[0];
1503 b1 = ptrbb[1];
1504
1505 a0 = ptrba[0];
1506 res0_0 += a0*b0;
1507 res1_0 += a0*b1;
1508
1509 a1 = ptrba[1];
1510 res0_1 += a1*b0;
1511 res1_1 += a1*b1;
1512
1513 ptrba = ptrba+2;
1514 ptrbb = ptrbb+2;
1515 }
1516
1517 res0_0 *= alpha;
1518 res0_1 *= alpha;
1519
1520 res1_0 *= alpha;
1521 res1_1 *= alpha;
1522
1523 C0[0] = res0_0;
1524 C0[1] = res0_1;
1525
1526 C1[0] = res1_0;
1527 C1[1] = res1_1;
1528
1529
1530 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1531 temp = bk - off;
1532 #ifdef LEFT
1533 temp -= 2; // number of values in A
1534 #else
1535 temp -= 2; // number of values in B
1536 #endif
1537 ptrba += temp*2;
1538 ptrbb += temp*2;
1539 #endif
1540
1541 #ifdef LEFT
1542 off += 2; // number of values in A
1543 #endif
1544
1545 C0 = C0+2;
1546 C1 = C1+2;
1547
1548 }
1549
1550 if ( bm & 1 )
1551 {
1552
1553 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1554 ptrbb = bb;
1555 #else
1556 ptrba += off*1;
1557 ptrbb = bb + off*2;
1558 #endif
1559
1560 res0_0 = 0;
1561
1562 res1_0 = 0;
1563
1564
1565 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1566 temp = bk-off;
1567 #elif defined(LEFT)
1568 temp = off+1; // number of values in A
1569 #else
1570 temp = off+2; // number of values in B
1571 #endif
1572
1573 for (k=0; k<temp; k++)
1574 {
1575 b0 = ptrbb[0];
1576 b1 = ptrbb[1];
1577
1578 a0 = ptrba[0];
1579 res0_0 += a0*b0;
1580 res1_0 += a0*b1;
1581
1582 ptrba = ptrba+1;
1583 ptrbb = ptrbb+2;
1584 }
1585
1586 res0_0 *= alpha;
1587
1588 res1_0 *= alpha;
1589
1590 C0[0] = res0_0;
1591
1592 C1[0] = res1_0;
1593
1594
1595 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1596 temp = bk - off;
1597 #ifdef LEFT
1598 temp -= 1; // number of values in A
1599 #else
1600 temp -= 2; // number of values in B
1601 #endif
1602 ptrba += temp*1;
1603 ptrbb += temp*2;
1604 #endif
1605
1606 #ifdef LEFT
1607 off += 1; // number of values in A
1608 #endif
1609
1610 C0 = C0+1;
1611 C1 = C1+1;
1612
1613 }
1614
1615
1616 #if defined(TRMMKERNEL) && !defined(LEFT)
1617 off += 2;
1618 #endif
1619
1620 k = (bk<<1);
1621 bb = bb+k;
1622 i = (ldc<<1);
1623 C = C+i;
1624 }
1625
1626
1627 for (j=0; j<(bn&1); j+=1)
1628 {
1629 C0 = C;
1630
1631 #if defined(TRMMKERNEL) && defined(LEFT)
1632 off = offset;
1633 #endif
1634
1635 ptrba = ba;
1636
1637
1638 for (i=0; i<bm/16; i+=1)
1639 {
1640
1641 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1642 ptrbb = bb;
1643 #else
1644 ptrba += off*16;
1645 ptrbb = bb + off*1;
1646 #endif
1647
1648 res0_0 = 0;
1649 res0_1 = 0;
1650 res0_2 = 0;
1651 res0_3 = 0;
1652 res0_4 = 0;
1653 res0_5 = 0;
1654 res0_6 = 0;
1655 res0_7 = 0;
1656
1657 res0_8 = 0;
1658 res0_9 = 0;
1659 res0_10 = 0;
1660 res0_11 = 0;
1661 res0_12 = 0;
1662 res0_13 = 0;
1663 res0_14 = 0;
1664 res0_15 = 0;
1665
1666
1667
1668
1669 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1670 temp = bk-off;
1671 #elif defined(LEFT)
1672 temp = off+16; // number of values in A
1673 #else
1674 temp = off+1; // number of values in B
1675 #endif
1676
1677 for (k=0; k<temp; k++)
1678 {
1679 b0 = ptrbb[0];
1680
1681 a0 = ptrba[0];
1682 res0_0 += a0*b0;
1683
1684 a1 = ptrba[1];
1685 res0_1 += a1*b0;
1686
1687 a0 = ptrba[2];
1688 res0_2 += a0*b0;
1689
1690 a1 = ptrba[3];
1691 res0_3 += a1*b0;
1692
1693 a0 = ptrba[4];
1694 res0_4 += a0*b0;
1695
1696 a1 = ptrba[5];
1697 res0_5 += a1*b0;
1698
1699 a0 = ptrba[6];
1700 res0_6 += a0*b0;
1701
1702 a1 = ptrba[7];
1703 res0_7 += a1*b0;
1704
1705 a0 = ptrba[8];
1706 res0_8 += a0*b0;
1707
1708 a1 = ptrba[9];
1709 res0_9 += a1*b0;
1710
1711 a0 = ptrba[10];
1712 res0_10 += a0*b0;
1713
1714 a1 = ptrba[11];
1715 res0_11 += a1*b0;
1716
1717 a0 = ptrba[12];
1718 res0_12 += a0*b0;
1719
1720 a1 = ptrba[13];
1721 res0_13 += a1*b0;
1722
1723 a0 = ptrba[14];
1724 res0_14 += a0*b0;
1725
1726 a1 = ptrba[15];
1727 res0_15 += a1*b0;
1728
1729
1730 ptrba = ptrba+16;
1731 ptrbb = ptrbb+1;
1732 }
1733
1734 res0_0 *= alpha;
1735 res0_1 *= alpha;
1736 res0_2 *= alpha;
1737 res0_3 *= alpha;
1738 res0_4 *= alpha;
1739 res0_5 *= alpha;
1740 res0_6 *= alpha;
1741 res0_7 *= alpha;
1742
1743 res0_8 *= alpha;
1744 res0_9 *= alpha;
1745 res0_10 *= alpha;
1746 res0_11 *= alpha;
1747 res0_12 *= alpha;
1748 res0_13 *= alpha;
1749 res0_14 *= alpha;
1750 res0_15 *= alpha;
1751
1752 C0[0] = res0_0;
1753 C0[1] = res0_1;
1754 C0[2] = res0_2;
1755 C0[3] = res0_3;
1756 C0[4] = res0_4;
1757 C0[5] = res0_5;
1758 C0[6] = res0_6;
1759 C0[7] = res0_7;
1760
1761 C0[8] = res0_8;
1762 C0[9] = res0_9;
1763 C0[10] = res0_10;
1764 C0[11] = res0_11;
1765 C0[12] = res0_12;
1766 C0[13] = res0_13;
1767 C0[14] = res0_14;
1768 C0[15] = res0_15;
1769
1770
1771 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1772 temp = bk - off;
1773 #ifdef LEFT
1774 temp -= 16; // number of values in A
1775 #else
1776 temp -= 1; // number of values in B
1777 #endif
1778 ptrba += temp*16;
1779 ptrbb += temp*1;
1780 #endif
1781
1782 #ifdef LEFT
1783 off += 16; // number of values in A
1784 #endif
1785
1786 C0 = C0+16;
1787 }
1788
1789
1790
1791
1792 if ( bm & 8 )
1793 {
1794
1795 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1796 ptrbb = bb;
1797 #else
1798 ptrba += off*8;
1799 ptrbb = bb + off*1;
1800 #endif
1801
1802 res0_0 = 0;
1803 res0_1 = 0;
1804 res0_2 = 0;
1805 res0_3 = 0;
1806 res0_4 = 0;
1807 res0_5 = 0;
1808 res0_6 = 0;
1809 res0_7 = 0;
1810
1811
1812 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1813 temp = bk-off;
1814 #elif defined(LEFT)
1815 temp = off+8; // number of values in A
1816 #else
1817 temp = off+1; // number of values in B
1818 #endif
1819
1820 for (k=0; k<temp; k++)
1821 {
1822 b0 = ptrbb[0];
1823
1824 a0 = ptrba[0];
1825 res0_0 += a0*b0;
1826
1827 a1 = ptrba[1];
1828 res0_1 += a1*b0;
1829
1830 a0 = ptrba[2];
1831 res0_2 += a0*b0;
1832
1833 a1 = ptrba[3];
1834 res0_3 += a1*b0;
1835
1836 a0 = ptrba[4];
1837 res0_4 += a0*b0;
1838
1839 a1 = ptrba[5];
1840 res0_5 += a1*b0;
1841
1842 a0 = ptrba[6];
1843 res0_6 += a0*b0;
1844
1845 a1 = ptrba[7];
1846 res0_7 += a1*b0;
1847
1848 ptrba = ptrba+8;
1849 ptrbb = ptrbb+1;
1850 }
1851
1852 res0_0 *= alpha;
1853 res0_1 *= alpha;
1854 res0_2 *= alpha;
1855 res0_3 *= alpha;
1856 res0_4 *= alpha;
1857 res0_5 *= alpha;
1858 res0_6 *= alpha;
1859 res0_7 *= alpha;
1860
1861 C0[0] = res0_0;
1862 C0[1] = res0_1;
1863 C0[2] = res0_2;
1864 C0[3] = res0_3;
1865 C0[4] = res0_4;
1866 C0[5] = res0_5;
1867 C0[6] = res0_6;
1868 C0[7] = res0_7;
1869
1870 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1871 temp = bk - off;
1872 #ifdef LEFT
1873 temp -= 8; // number of values in A
1874 #else
1875 temp -= 1; // number of values in B
1876 #endif
1877 ptrba += temp*8;
1878 ptrbb += temp*1;
1879 #endif
1880
1881 #ifdef LEFT
1882 off += 8; // number of values in A
1883 #endif
1884
1885 C0 = C0+8;
1886 }
1887
1888 if ( bm & 4 )
1889 {
1890
1891 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1892 ptrbb = bb;
1893 #else
1894 ptrba += off*4;
1895 ptrbb = bb + off*1;
1896 #endif
1897
1898 res0_0 = 0;
1899 res0_1 = 0;
1900 res0_2 = 0;
1901 res0_3 = 0;
1902
1903
1904 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1905 temp = bk-off;
1906 #elif defined(LEFT)
1907 temp = off+4; // number of values in A
1908 #else
1909 temp = off+1; // number of values in B
1910 #endif
1911
1912 for (k=0; k<temp; k++)
1913 {
1914 b0 = ptrbb[0];
1915
1916 a0 = ptrba[0];
1917 res0_0 += a0*b0;
1918
1919 a1 = ptrba[1];
1920 res0_1 += a1*b0;
1921
1922 a0 = ptrba[2];
1923 res0_2 += a0*b0;
1924
1925 a1 = ptrba[3];
1926 res0_3 += a1*b0;
1927
1928 ptrba = ptrba+4;
1929 ptrbb = ptrbb+1;
1930 }
1931
1932 res0_0 *= alpha;
1933 res0_1 *= alpha;
1934 res0_2 *= alpha;
1935 res0_3 *= alpha;
1936
1937 C0[0] = res0_0;
1938 C0[1] = res0_1;
1939 C0[2] = res0_2;
1940 C0[3] = res0_3;
1941
1942
1943 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1944 temp = bk - off;
1945 #ifdef LEFT
1946 temp -= 4; // number of values in A
1947 #else
1948 temp -= 1; // number of values in B
1949 #endif
1950 ptrba += temp*4;
1951 ptrbb += temp*1;
1952 #endif
1953
1954 #ifdef LEFT
1955 off += 4; // number of values in A
1956 #endif
1957
1958 C0 = C0+4;
1959
1960 }
1961
1962 if ( bm & 2 )
1963 {
1964
1965 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1966 ptrbb = bb;
1967 #else
1968 ptrba += off*2;
1969 ptrbb = bb + off*1;
1970 #endif
1971
1972 res0_0 = 0;
1973 res0_1 = 0;
1974
1975
1976
1977 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1978 temp = bk-off;
1979 #elif defined(LEFT)
1980 temp = off+2; // number of values in A
1981 #else
1982 temp = off+1; // number of values in B
1983 #endif
1984
1985 for (k=0; k<temp; k++)
1986 {
1987 b0 = ptrbb[0];
1988
1989 a0 = ptrba[0];
1990 res0_0 += a0*b0;
1991
1992 a1 = ptrba[1];
1993 res0_1 += a1*b0;
1994
1995 ptrba = ptrba+2;
1996 ptrbb = ptrbb+1;
1997 }
1998
1999 res0_0 *= alpha;
2000 res0_1 *= alpha;
2001
2002 C0[0] = res0_0;
2003 C0[1] = res0_1;
2004
2005
2006 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2007 temp = bk - off;
2008 #ifdef LEFT
2009 temp -= 2; // number of values in A
2010 #else
2011 temp -= 1; // number of values in B
2012 #endif
2013 ptrba += temp*2;
2014 ptrbb += temp*1;
2015 #endif
2016
2017 #ifdef LEFT
2018 off += 2; // number of values in A
2019 #endif
2020
2021 C0 = C0+2;
2022
2023 }
2024
2025 if ( bm & 1 )
2026 {
2027
2028 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2029 ptrbb = bb;
2030 #else
2031 ptrba += off*1;
2032 ptrbb = bb + off*1;
2033 #endif
2034
2035 res0_0 = 0;
2036
2037
2038 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2039 temp = bk-off;
2040 #elif defined(LEFT)
2041 temp = off+1; // number of values in A
2042 #else
2043 temp = off+1; // number of values in B
2044 #endif
2045
2046 for (k=0; k<temp; k++)
2047 {
2048 b0 = ptrbb[0];
2049
2050 a0 = ptrba[0];
2051 res0_0 += a0*b0;
2052
2053 ptrba = ptrba+1;
2054 ptrbb = ptrbb+1;
2055 }
2056
2057 res0_0 *= alpha;
2058
2059 C0[0] = res0_0;
2060
2061
2062 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2063 temp = bk - off;
2064 #ifdef LEFT
2065 temp -= 1; // number of values in A
2066 #else
2067 temp -= 1; // number of values in B
2068 #endif
2069 ptrba += temp*1;
2070 ptrbb += temp*1;
2071 #endif
2072
2073 #ifdef LEFT
2074 off += 1; // number of values in A
2075 #endif
2076
2077 C0 = C0+1;
2078
2079 }
2080
2081
2082
2083 #if defined(TRMMKERNEL) && !defined(LEFT)
2084 off += 1;
2085 #endif
2086
2087 k = (bk<<0);
2088 bb = bb+k;
2089 C = C+ldc;
2090 }
2091 return 0;
2092 }
2093