1 #include "common.h"
2 
CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT * ba,FLOAT * bb,FLOAT * C,BLASLONG ldc,BLASLONG offset)3 int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
4 {
5 
6    BLASLONG i,j,k;
7    FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
8 
9    FLOAT res0_0;
10    FLOAT res0_1;
11    FLOAT res0_2;
12    FLOAT res0_3;
13    FLOAT res0_4;
14    FLOAT res0_5;
15    FLOAT res0_6;
16    FLOAT res0_7;
17 
18    FLOAT res0_8;
19    FLOAT res0_9;
20    FLOAT res0_10;
21    FLOAT res0_11;
22    FLOAT res0_12;
23    FLOAT res0_13;
24    FLOAT res0_14;
25    FLOAT res0_15;
26 
27    FLOAT res1_0;
28    FLOAT res1_1;
29    FLOAT res1_2;
30    FLOAT res1_3;
31    FLOAT res1_4;
32    FLOAT res1_5;
33    FLOAT res1_6;
34    FLOAT res1_7;
35 
36    FLOAT res1_8;
37    FLOAT res1_9;
38    FLOAT res1_10;
39    FLOAT res1_11;
40    FLOAT res1_12;
41    FLOAT res1_13;
42    FLOAT res1_14;
43    FLOAT res1_15;
44 
45    FLOAT res2_0;
46    FLOAT res2_1;
47    FLOAT res2_2;
48    FLOAT res2_3;
49    FLOAT res2_4;
50    FLOAT res2_5;
51    FLOAT res2_6;
52    FLOAT res2_7;
53 
54    FLOAT res2_8;
55    FLOAT res2_9;
56    FLOAT res2_10;
57    FLOAT res2_11;
58    FLOAT res2_12;
59    FLOAT res2_13;
60    FLOAT res2_14;
61    FLOAT res2_15;
62 
63    FLOAT res3_0;
64    FLOAT res3_1;
65    FLOAT res3_2;
66    FLOAT res3_3;
67    FLOAT res3_4;
68    FLOAT res3_5;
69    FLOAT res3_6;
70    FLOAT res3_7;
71 
72    FLOAT res3_8;
73    FLOAT res3_9;
74    FLOAT res3_10;
75    FLOAT res3_11;
76    FLOAT res3_12;
77    FLOAT res3_13;
78    FLOAT res3_14;
79    FLOAT res3_15;
80 
81    FLOAT a0;
82    FLOAT a1;
83 
84    FLOAT b0;
85    FLOAT b1;
86    FLOAT b2;
87    FLOAT b3;
88 
89    BLASLONG off, temp;
90 
91 #if !defined(LEFT)
92    off = -offset;
93 #else
94    off = 0;
95 #endif
96 
97    for (j=0; j<bn/4; j+=1)
98    {
99         C0 = C;
100         C1 = C0+ldc;
101         C2 = C0+2*ldc;
102         C3 = C0+3*ldc;
103 
104 #if defined(TRMMKERNEL) && defined(LEFT)
105 	off = offset;
106 #endif
107 
108 
109         ptrba = ba;
110 
111 
112         for (i=0; i<bm/16; i+=1)
113         {
114 
115 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
116 		ptrbb = bb;
117 #else
118 		ptrba += off*16;
119 		ptrbb = bb + off*4;
120 #endif
121 
122 		res0_0 = 0;
123 		res0_1 = 0;
124 		res0_2 = 0;
125 		res0_3 = 0;
126 		res0_4 = 0;
127 		res0_5 = 0;
128 		res0_6 = 0;
129 		res0_7 = 0;
130 
131 		res0_8  = 0;
132 		res0_9  = 0;
133 		res0_10 = 0;
134 		res0_11 = 0;
135 		res0_12 = 0;
136 		res0_13 = 0;
137 		res0_14 = 0;
138 		res0_15 = 0;
139 
140 		res1_0 = 0;
141 		res1_1 = 0;
142 		res1_2 = 0;
143 		res1_3 = 0;
144 		res1_4 = 0;
145 		res1_5 = 0;
146 		res1_6 = 0;
147 		res1_7 = 0;
148 
149 		res1_8  = 0;
150 		res1_9  = 0;
151 		res1_10 = 0;
152 		res1_11 = 0;
153 		res1_12 = 0;
154 		res1_13 = 0;
155 		res1_14 = 0;
156 		res1_15 = 0;
157 
158 		res2_0 = 0;
159 		res2_1 = 0;
160 		res2_2 = 0;
161 		res2_3 = 0;
162 		res2_4 = 0;
163 		res2_5 = 0;
164 		res2_6 = 0;
165 		res2_7 = 0;
166 
167 		res2_8  = 0;
168 		res2_9  = 0;
169 		res2_10 = 0;
170 		res2_11 = 0;
171 		res2_12 = 0;
172 		res2_13 = 0;
173 		res2_14 = 0;
174 		res2_15 = 0;
175 
176 		res3_0 = 0;
177 		res3_1 = 0;
178 		res3_2 = 0;
179 		res3_3 = 0;
180 		res3_4 = 0;
181 		res3_5 = 0;
182 		res3_6 = 0;
183 		res3_7 = 0;
184 
185 		res3_8  = 0;
186 		res3_9  = 0;
187 		res3_10 = 0;
188 		res3_11 = 0;
189 		res3_12 = 0;
190 		res3_13 = 0;
191 		res3_14 = 0;
192 		res3_15 = 0;
193 
194 
195 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
196 		temp = bk-off;
197 #elif defined(LEFT)
198 		temp = off+16;	// number of values in A
199 #else
200 		temp = off+4;	// number of values in B
201 #endif
202 
203 		for (k=0; k<temp; k++)
204                 {
205 			b0 = ptrbb[0];
206 			b1 = ptrbb[1];
207 			b2 = ptrbb[2];
208 			b3 = ptrbb[3];
209 
210 			a0 = ptrba[0];
211 			res0_0 += a0*b0;
212 			res1_0 += a0*b1;
213 			res2_0 += a0*b2;
214 			res3_0 += a0*b3;
215 
216 			a1 = ptrba[1];
217 			res0_1 += a1*b0;
218 			res1_1 += a1*b1;
219 			res2_1 += a1*b2;
220 			res3_1 += a1*b3;
221 
222 			a0 = ptrba[2];
223 			res0_2 += a0*b0;
224 			res1_2 += a0*b1;
225 			res2_2 += a0*b2;
226 			res3_2 += a0*b3;
227 
228 			a1 = ptrba[3];
229 			res0_3 += a1*b0;
230 			res1_3 += a1*b1;
231 			res2_3 += a1*b2;
232 			res3_3 += a1*b3;
233 
234 			a0 = ptrba[4];
235 			res0_4 += a0*b0;
236 			res1_4 += a0*b1;
237 			res2_4 += a0*b2;
238 			res3_4 += a0*b3;
239 
240 			a1 = ptrba[5];
241 			res0_5 += a1*b0;
242 			res1_5 += a1*b1;
243 			res2_5 += a1*b2;
244 			res3_5 += a1*b3;
245 
246 			a0 = ptrba[6];
247 			res0_6 += a0*b0;
248 			res1_6 += a0*b1;
249 			res2_6 += a0*b2;
250 			res3_6 += a0*b3;
251 
252 			a1 = ptrba[7];
253 			res0_7 += a1*b0;
254 			res1_7 += a1*b1;
255 			res2_7 += a1*b2;
256 			res3_7 += a1*b3;
257 
258 			a0 = ptrba[8];
259 			res0_8 += a0*b0;
260 			res1_8 += a0*b1;
261 			res2_8 += a0*b2;
262 			res3_8 += a0*b3;
263 
264 			a1 = ptrba[9];
265 			res0_9 += a1*b0;
266 			res1_9 += a1*b1;
267 			res2_9 += a1*b2;
268 			res3_9 += a1*b3;
269 
270 			a0 = ptrba[10];
271 			res0_10 += a0*b0;
272 			res1_10 += a0*b1;
273 			res2_10 += a0*b2;
274 			res3_10 += a0*b3;
275 
276 			a1 = ptrba[11];
277 			res0_11 += a1*b0;
278 			res1_11 += a1*b1;
279 			res2_11 += a1*b2;
280 			res3_11 += a1*b3;
281 
282 			a0 = ptrba[12];
283 			res0_12 += a0*b0;
284 			res1_12 += a0*b1;
285 			res2_12 += a0*b2;
286 			res3_12 += a0*b3;
287 
288 			a1 = ptrba[13];
289 			res0_13 += a1*b0;
290 			res1_13 += a1*b1;
291 			res2_13 += a1*b2;
292 			res3_13 += a1*b3;
293 
294 			a0 = ptrba[14];
295 			res0_14 += a0*b0;
296 			res1_14 += a0*b1;
297 			res2_14 += a0*b2;
298 			res3_14 += a0*b3;
299 
300 			a1 = ptrba[15];
301 			res0_15 += a1*b0;
302 			res1_15 += a1*b1;
303 			res2_15 += a1*b2;
304 			res3_15 += a1*b3;
305 
306 
307 			ptrba = ptrba+16;
308 			ptrbb = ptrbb+4;
309                 }
310 
311 		res0_0 *= alpha;
312 		res0_1 *= alpha;
313 		res0_2 *= alpha;
314 		res0_3 *= alpha;
315 		res0_4 *= alpha;
316 		res0_5 *= alpha;
317 		res0_6 *= alpha;
318 		res0_7 *= alpha;
319 
320 		res0_8  *= alpha;
321 		res0_9  *= alpha;
322 		res0_10 *= alpha;
323 		res0_11 *= alpha;
324 		res0_12 *= alpha;
325 		res0_13 *= alpha;
326 		res0_14 *= alpha;
327 		res0_15 *= alpha;
328 
329 		res1_0 *= alpha;
330 		res1_1 *= alpha;
331 		res1_2 *= alpha;
332 		res1_3 *= alpha;
333 		res1_4 *= alpha;
334 		res1_5 *= alpha;
335 		res1_6 *= alpha;
336 		res1_7 *= alpha;
337 
338 		res1_8  *= alpha;
339 		res1_9  *= alpha;
340 		res1_10 *= alpha;
341 		res1_11 *= alpha;
342 		res1_12 *= alpha;
343 		res1_13 *= alpha;
344 		res1_14 *= alpha;
345 		res1_15 *= alpha;
346 
347 		res2_0 *= alpha;
348 		res2_1 *= alpha;
349 		res2_2 *= alpha;
350 		res2_3 *= alpha;
351 		res2_4 *= alpha;
352 		res2_5 *= alpha;
353 		res2_6 *= alpha;
354 		res2_7 *= alpha;
355 
356 		res2_8  *= alpha;
357 		res2_9  *= alpha;
358 		res2_10 *= alpha;
359 		res2_11 *= alpha;
360 		res2_12 *= alpha;
361 		res2_13 *= alpha;
362 		res2_14 *= alpha;
363 		res2_15 *= alpha;
364 
365 		res3_0 *= alpha;
366 		res3_1 *= alpha;
367 		res3_2 *= alpha;
368 		res3_3 *= alpha;
369 		res3_4 *= alpha;
370 		res3_5 *= alpha;
371 		res3_6 *= alpha;
372 		res3_7 *= alpha;
373 
374 		res3_8  *= alpha;
375 		res3_9  *= alpha;
376 		res3_10 *= alpha;
377 		res3_11 *= alpha;
378 		res3_12 *= alpha;
379 		res3_13 *= alpha;
380 		res3_14 *= alpha;
381 		res3_15 *= alpha;
382 
383 		C0[0] = res0_0;
384 		C0[1] = res0_1;
385 		C0[2] = res0_2;
386 		C0[3] = res0_3;
387 		C0[4] = res0_4;
388 		C0[5] = res0_5;
389 		C0[6] = res0_6;
390 		C0[7] = res0_7;
391 
392 		C0[8]  = res0_8;
393 		C0[9]  = res0_9;
394 		C0[10] = res0_10;
395 		C0[11] = res0_11;
396 		C0[12] = res0_12;
397 		C0[13] = res0_13;
398 		C0[14] = res0_14;
399 		C0[15] = res0_15;
400 
401 		C1[0] = res1_0;
402 		C1[1] = res1_1;
403 		C1[2] = res1_2;
404 		C1[3] = res1_3;
405 		C1[4] = res1_4;
406 		C1[5] = res1_5;
407 		C1[6] = res1_6;
408 		C1[7] = res1_7;
409 
410 		C1[8]  = res1_8;
411 		C1[9]  = res1_9;
412 		C1[10] = res1_10;
413 		C1[11] = res1_11;
414 		C1[12] = res1_12;
415 		C1[13] = res1_13;
416 		C1[14] = res1_14;
417 		C1[15] = res1_15;
418 
419 		C2[0] = res2_0;
420 		C2[1] = res2_1;
421 		C2[2] = res2_2;
422 		C2[3] = res2_3;
423 		C2[4] = res2_4;
424 		C2[5] = res2_5;
425 		C2[6] = res2_6;
426 		C2[7] = res2_7;
427 
428 		C2[8]  = res2_8;
429 		C2[9]  = res2_9;
430 		C2[10] = res2_10;
431 		C2[11] = res2_11;
432 		C2[12] = res2_12;
433 		C2[13] = res2_13;
434 		C2[14] = res2_14;
435 		C2[15] = res2_15;
436 
437 		C3[0] = res3_0;
438 		C3[1] = res3_1;
439 		C3[2] = res3_2;
440 		C3[3] = res3_3;
441 		C3[4] = res3_4;
442 		C3[5] = res3_5;
443 		C3[6] = res3_6;
444 		C3[7] = res3_7;
445 
446 		C3[8]  = res3_8;
447 		C3[9]  = res3_9;
448 		C3[10] = res3_10;
449 		C3[11] = res3_11;
450 		C3[12] = res3_12;
451 		C3[13] = res3_13;
452 		C3[14] = res3_14;
453 		C3[15] = res3_15;
454 
455 
456 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
457 		temp = bk - off;
458 #ifdef LEFT
459 		temp -= 16; // number of values in A
460 #else
461 		temp -= 4; // number of values in B
462 #endif
463 		ptrba += temp*16;
464 		ptrbb += temp*4;
465 #endif
466 
467 #ifdef LEFT
468 		off += 16; // number of values in A
469 #endif
470 
471 		C0 = C0+16;
472 		C1 = C1+16;
473 		C2 = C2+16;
474 		C3 = C3+16;
475 	}
476 
477 
478         if ( bm & 8)
479         {
480 
481 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
482 		ptrbb = bb;
483 #else
484 		ptrba += off*8;
485 		ptrbb = bb + off*4;
486 #endif
487 
488 		res0_0 = 0;
489 		res0_1 = 0;
490 		res0_2 = 0;
491 		res0_3 = 0;
492 		res0_4 = 0;
493 		res0_5 = 0;
494 		res0_6 = 0;
495 		res0_7 = 0;
496 
497 		res1_0 = 0;
498 		res1_1 = 0;
499 		res1_2 = 0;
500 		res1_3 = 0;
501 		res1_4 = 0;
502 		res1_5 = 0;
503 		res1_6 = 0;
504 		res1_7 = 0;
505 
506 		res2_0 = 0;
507 		res2_1 = 0;
508 		res2_2 = 0;
509 		res2_3 = 0;
510 		res2_4 = 0;
511 		res2_5 = 0;
512 		res2_6 = 0;
513 		res2_7 = 0;
514 
515 		res3_0 = 0;
516 		res3_1 = 0;
517 		res3_2 = 0;
518 		res3_3 = 0;
519 		res3_4 = 0;
520 		res3_5 = 0;
521 		res3_6 = 0;
522 		res3_7 = 0;
523 
524 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
525 		temp = bk-off;
526 #elif defined(LEFT)
527 		temp = off+8;	// number of values in A
528 #else
529 		temp = off+4;	// number of values in B
530 #endif
531 
532 		for (k=0; k<temp; k++)
533                 {
534 			b0 = ptrbb[0];
535 			b1 = ptrbb[1];
536 			b2 = ptrbb[2];
537 			b3 = ptrbb[3];
538 
539 			a0 = ptrba[0];
540 			res0_0 += a0*b0;
541 			res1_0 += a0*b1;
542 			res2_0 += a0*b2;
543 			res3_0 += a0*b3;
544 
545 			a1 = ptrba[1];
546 			res0_1 += a1*b0;
547 			res1_1 += a1*b1;
548 			res2_1 += a1*b2;
549 			res3_1 += a1*b3;
550 
551 			a0 = ptrba[2];
552 			res0_2 += a0*b0;
553 			res1_2 += a0*b1;
554 			res2_2 += a0*b2;
555 			res3_2 += a0*b3;
556 
557 			a1 = ptrba[3];
558 			res0_3 += a1*b0;
559 			res1_3 += a1*b1;
560 			res2_3 += a1*b2;
561 			res3_3 += a1*b3;
562 
563 			a0 = ptrba[4];
564 			res0_4 += a0*b0;
565 			res1_4 += a0*b1;
566 			res2_4 += a0*b2;
567 			res3_4 += a0*b3;
568 
569 			a1 = ptrba[5];
570 			res0_5 += a1*b0;
571 			res1_5 += a1*b1;
572 			res2_5 += a1*b2;
573 			res3_5 += a1*b3;
574 
575 			a0 = ptrba[6];
576 			res0_6 += a0*b0;
577 			res1_6 += a0*b1;
578 			res2_6 += a0*b2;
579 			res3_6 += a0*b3;
580 
581 			a1 = ptrba[7];
582 			res0_7 += a1*b0;
583 			res1_7 += a1*b1;
584 			res2_7 += a1*b2;
585 			res3_7 += a1*b3;
586 
587 			ptrba = ptrba+8;
588 			ptrbb = ptrbb+4;
589 
590                 }
591 
592 		res0_0 *= alpha;
593 		res0_1 *= alpha;
594 		res0_2 *= alpha;
595 		res0_3 *= alpha;
596 		res0_4 *= alpha;
597 		res0_5 *= alpha;
598 		res0_6 *= alpha;
599 		res0_7 *= alpha;
600 
601 		res1_0 *= alpha;
602 		res1_1 *= alpha;
603 		res1_2 *= alpha;
604 		res1_3 *= alpha;
605 		res1_4 *= alpha;
606 		res1_5 *= alpha;
607 		res1_6 *= alpha;
608 		res1_7 *= alpha;
609 
610 		res2_0 *= alpha;
611 		res2_1 *= alpha;
612 		res2_2 *= alpha;
613 		res2_3 *= alpha;
614 		res2_4 *= alpha;
615 		res2_5 *= alpha;
616 		res2_6 *= alpha;
617 		res2_7 *= alpha;
618 
619 		res3_0 *= alpha;
620 		res3_1 *= alpha;
621 		res3_2 *= alpha;
622 		res3_3 *= alpha;
623 		res3_4 *= alpha;
624 		res3_5 *= alpha;
625 		res3_6 *= alpha;
626 		res3_7 *= alpha;
627 
628 		C0[0] = res0_0;
629 		C0[1] = res0_1;
630 		C0[2] = res0_2;
631 		C0[3] = res0_3;
632 		C0[4] = res0_4;
633 		C0[5] = res0_5;
634 		C0[6] = res0_6;
635 		C0[7] = res0_7;
636 
637 		C1[0] = res1_0;
638 		C1[1] = res1_1;
639 		C1[2] = res1_2;
640 		C1[3] = res1_3;
641 		C1[4] = res1_4;
642 		C1[5] = res1_5;
643 		C1[6] = res1_6;
644 		C1[7] = res1_7;
645 
646 		C2[0] = res2_0;
647 		C2[1] = res2_1;
648 		C2[2] = res2_2;
649 		C2[3] = res2_3;
650 		C2[4] = res2_4;
651 		C2[5] = res2_5;
652 		C2[6] = res2_6;
653 		C2[7] = res2_7;
654 
655 		C3[0] = res3_0;
656 		C3[1] = res3_1;
657 		C3[2] = res3_2;
658 		C3[3] = res3_3;
659 		C3[4] = res3_4;
660 		C3[5] = res3_5;
661 		C3[6] = res3_6;
662 		C3[7] = res3_7;
663 
664 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
665 		temp = bk - off;
666 #ifdef LEFT
667 		temp -= 8; // number of values in A
668 #else
669 		temp -= 4; // number of values in B
670 #endif
671 		ptrba += temp*8;
672 		ptrbb += temp*4;
673 #endif
674 
675 #ifdef LEFT
676 		off += 8; // number of values in A
677 #endif
678 
679 		C0 = C0+8;
680 		C1 = C1+8;
681 		C2 = C2+8;
682 		C3 = C3+8;
683 	}
684 
685 	if ( bm & 4 )
686 	{
687 
688 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
689 		ptrbb = bb;
690 #else
691 		ptrba += off*4;
692 		ptrbb = bb + off*4;
693 #endif
694 
695 		res0_0 = 0;
696 		res0_1 = 0;
697 		res0_2 = 0;
698 		res0_3 = 0;
699 
700 		res1_0 = 0;
701 		res1_1 = 0;
702 		res1_2 = 0;
703 		res1_3 = 0;
704 
705 		res2_0 = 0;
706 		res2_1 = 0;
707 		res2_2 = 0;
708 		res2_3 = 0;
709 
710 		res3_0 = 0;
711 		res3_1 = 0;
712 		res3_2 = 0;
713 		res3_3 = 0;
714 
715 
716 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
717 		temp = bk-off;
718 #elif defined(LEFT)
719 		temp = off+4;	// number of values in A
720 #else
721 		temp = off+4;	// number of values in B
722 #endif
723 
724 		for (k=0; k<temp; k++)
725                 {
726 			b0 = ptrbb[0];
727 			b1 = ptrbb[1];
728 			b2 = ptrbb[2];
729 			b3 = ptrbb[3];
730 
731 			a0 = ptrba[0];
732 			res0_0 += a0*b0;
733 			res1_0 += a0*b1;
734 			res2_0 += a0*b2;
735 			res3_0 += a0*b3;
736 
737 			a1 = ptrba[1];
738 			res0_1 += a1*b0;
739 			res1_1 += a1*b1;
740 			res2_1 += a1*b2;
741 			res3_1 += a1*b3;
742 
743 			a0 = ptrba[2];
744 			res0_2 += a0*b0;
745 			res1_2 += a0*b1;
746 			res2_2 += a0*b2;
747 			res3_2 += a0*b3;
748 
749 			a1 = ptrba[3];
750 			res0_3 += a1*b0;
751 			res1_3 += a1*b1;
752 			res2_3 += a1*b2;
753 			res3_3 += a1*b3;
754 
755 			ptrba = ptrba+4;
756 			ptrbb = ptrbb+4;
757                 }
758 
759 		res0_0 *= alpha;
760 		res0_1 *= alpha;
761 		res0_2 *= alpha;
762 		res0_3 *= alpha;
763 
764 		res1_0 *= alpha;
765 		res1_1 *= alpha;
766 		res1_2 *= alpha;
767 		res1_3 *= alpha;
768 
769 		res2_0 *= alpha;
770 		res2_1 *= alpha;
771 		res2_2 *= alpha;
772 		res2_3 *= alpha;
773 
774 		res3_0 *= alpha;
775 		res3_1 *= alpha;
776 		res3_2 *= alpha;
777 		res3_3 *= alpha;
778 
779 		C0[0] = res0_0;
780 		C0[1] = res0_1;
781 		C0[2] = res0_2;
782 		C0[3] = res0_3;
783 
784 		C1[0] = res1_0;
785 		C1[1] = res1_1;
786 		C1[2] = res1_2;
787 		C1[3] = res1_3;
788 
789 
790 		C2[0] = res2_0;
791 		C2[1] = res2_1;
792 		C2[2] = res2_2;
793 		C2[3] = res2_3;
794 
795 		C3[0] = res3_0;
796 		C3[1] = res3_1;
797 		C3[2] = res3_2;
798 		C3[3] = res3_3;
799 
800 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
801 		temp = bk - off;
802 #ifdef LEFT
803 		temp -= 4; // number of values in A
804 #else
805 		temp -= 4; // number of values in B
806 #endif
807 		ptrba += temp*4;
808 		ptrbb += temp*4;
809 #endif
810 
811 #ifdef LEFT
812 		off += 4; // number of values in A
813 #endif
814 
815 		C0 = C0+4;
816 		C1 = C1+4;
817 		C2 = C2+4;
818 		C3 = C3+4;
819 	}
820 
821 	if ( bm & 2 )
822 	{
823 
824 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
825 		ptrbb = bb;
826 #else
827 		ptrba += off*2;
828 		ptrbb = bb + off*4;
829 #endif
830 
831 		res0_0 = 0;
832 		res0_1 = 0;
833 
834 		res1_0 = 0;
835 		res1_1 = 0;
836 
837 		res2_0 = 0;
838 		res2_1 = 0;
839 
840 		res3_0 = 0;
841 		res3_1 = 0;
842 
843 
844 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
845 		temp = bk-off;
846 #elif defined(LEFT)
847 		temp = off+2;	// number of values in A
848 #else
849 		temp = off+4;	// number of values in B
850 #endif
851 
852 		for (k=0; k<temp; k++)
853                 {
854 			b0 = ptrbb[0];
855 			b1 = ptrbb[1];
856 			b2 = ptrbb[2];
857 			b3 = ptrbb[3];
858 
859 			a0 = ptrba[0];
860 			res0_0 += a0*b0;
861 			res1_0 += a0*b1;
862 			res2_0 += a0*b2;
863 			res3_0 += a0*b3;
864 
865 			a1 = ptrba[1];
866 			res0_1 += a1*b0;
867 			res1_1 += a1*b1;
868 			res2_1 += a1*b2;
869 			res3_1 += a1*b3;
870 
871 			ptrba = ptrba+2;
872 			ptrbb = ptrbb+4;
873                 }
874 
875 		res0_0 *= alpha;
876 		res0_1 *= alpha;
877 
878 		res1_0 *= alpha;
879 		res1_1 *= alpha;
880 
881 		res2_0 *= alpha;
882 		res2_1 *= alpha;
883 
884 		res3_0 *= alpha;
885 		res3_1 *= alpha;
886 
887 		C0[0] = res0_0;
888 		C0[1] = res0_1;
889 
890 		C1[0] = res1_0;
891 		C1[1] = res1_1;
892 
893 		C2[0] = res2_0;
894 		C2[1] = res2_1;
895 
896 		C3[0] = res3_0;
897 		C3[1] = res3_1;
898 
899 
900 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
901 		temp = bk - off;
902 #ifdef LEFT
903 		temp -= 2; // number of values in A
904 #else
905 		temp -= 4; // number of values in B
906 #endif
907 		ptrba += temp*2;
908 		ptrbb += temp*4;
909 #endif
910 
911 #ifdef LEFT
912 		off += 2; // number of values in A
913 #endif
914 
915 		C0 = C0+2;
916 		C1 = C1+2;
917 		C2 = C2+2;
918 		C3 = C3+2;
919 	}
920 
921 	if ( bm & 1 )
922 	{
923 
924 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
925 		ptrbb = bb;
926 #else
927 		ptrba += off*1;
928 		ptrbb = bb + off*4;
929 #endif
930 
931 		res0_0 = 0;
932 		res1_0 = 0;
933 		res2_0 = 0;
934 		res3_0 = 0;
935 
936 
937 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
938 		temp = bk-off;
939 #elif defined(LEFT)
940 		temp = off+1;	// number of values in A
941 #else
942 		temp = off+4;	// number of values in B
943 #endif
944 
945 		for (k=0; k<temp; k++)
946                 {
947 			b0 = ptrbb[0];
948 			b1 = ptrbb[1];
949 			b2 = ptrbb[2];
950 			b3 = ptrbb[3];
951 
952 			a0 = ptrba[0];
953 			res0_0 += a0*b0;
954 			res1_0 += a0*b1;
955 			res2_0 += a0*b2;
956 			res3_0 += a0*b3;
957 
958 			ptrba = ptrba+1;
959 			ptrbb = ptrbb+4;
960                 }
961 		res0_0 *= alpha;
962 		res1_0 *= alpha;
963 		res2_0 *= alpha;
964 		res3_0 *= alpha;
965 
966 		C0[0] = res0_0;
967 		C1[0] = res1_0;
968 		C2[0] = res2_0;
969 		C3[0] = res3_0;
970 
971 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
972 		temp = bk - off;
973 #ifdef LEFT
974 		temp -= 1; // number of values in A
975 #else
976 		temp -= 4; // number of values in B
977 #endif
978 		ptrba += temp*1;
979 		ptrbb += temp*4;
980 #endif
981 
982 #ifdef LEFT
983 		off += 1; // number of values in A
984 #endif
985 
986 		C0 = C0+1;
987 		C1 = C1+1;
988 		C2 = C2+1;
989 		C3 = C3+1;
990 
991 	}
992 
993 
994 #if defined(TRMMKERNEL) && !defined(LEFT)
995 		off += 4;
996 #endif
997 
998         k = (bk<<2);
999         bb = bb+k;
1000         i = (ldc<<2);
1001         C = C+i;
1002     }
1003 
1004 
1005    if(bn&2)
1006    {
1007         C0 = C;
1008         C1 = C0+ldc;
1009 
1010 #if defined(TRMMKERNEL) && defined(LEFT)
1011 	off = offset;
1012 #endif
1013 
1014 
1015         ptrba = ba;
1016 
1017 
1018         for (i=0; i<bm/16; i+=1)
1019         {
1020 
1021 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1022 		ptrbb = bb;
1023 #else
1024 		ptrba += off*16;
1025 		ptrbb = bb + off*2;
1026 #endif
1027 
1028 		res0_0 = 0;
1029 		res0_1 = 0;
1030 		res0_2 = 0;
1031 		res0_3 = 0;
1032 		res0_4 = 0;
1033 		res0_5 = 0;
1034 		res0_6 = 0;
1035 		res0_7 = 0;
1036 
1037 		res0_8  = 0;
1038 		res0_9  = 0;
1039 		res0_10 = 0;
1040 		res0_11 = 0;
1041 		res0_12 = 0;
1042 		res0_13 = 0;
1043 		res0_14 = 0;
1044 		res0_15 = 0;
1045 
1046 		res1_0 = 0;
1047 		res1_1 = 0;
1048 		res1_2 = 0;
1049 		res1_3 = 0;
1050 		res1_4 = 0;
1051 		res1_5 = 0;
1052 		res1_6 = 0;
1053 		res1_7 = 0;
1054 
1055 		res1_8  = 0;
1056 		res1_9  = 0;
1057 		res1_10 = 0;
1058 		res1_11 = 0;
1059 		res1_12 = 0;
1060 		res1_13 = 0;
1061 		res1_14 = 0;
1062 		res1_15 = 0;
1063 
1064 
1065 
1066 
1067 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1068 		temp = bk-off;
1069 #elif defined(LEFT)
1070 		temp = off+16;	// number of values in A
1071 #else
1072 		temp = off+2;	// number of values in B
1073 #endif
1074 
1075 		for (k=0; k<temp; k++)
1076                 {
1077 			b0 = ptrbb[0];
1078 			b1 = ptrbb[1];
1079 
1080 			a0 = ptrba[0];
1081 			res0_0 += a0*b0;
1082 			res1_0 += a0*b1;
1083 
1084 			a1 = ptrba[1];
1085 			res0_1 += a1*b0;
1086 			res1_1 += a1*b1;
1087 
1088 			a0 = ptrba[2];
1089 			res0_2 += a0*b0;
1090 			res1_2 += a0*b1;
1091 
1092 			a1 = ptrba[3];
1093 			res0_3 += a1*b0;
1094 			res1_3 += a1*b1;
1095 
1096 			a0 = ptrba[4];
1097 			res0_4 += a0*b0;
1098 			res1_4 += a0*b1;
1099 
1100 			a1 = ptrba[5];
1101 			res0_5 += a1*b0;
1102 			res1_5 += a1*b1;
1103 
1104 			a0 = ptrba[6];
1105 			res0_6 += a0*b0;
1106 			res1_6 += a0*b1;
1107 
1108 			a1 = ptrba[7];
1109 			res0_7 += a1*b0;
1110 			res1_7 += a1*b1;
1111 
1112 			a0 = ptrba[8];
1113 			res0_8 += a0*b0;
1114 			res1_8 += a0*b1;
1115 
1116 			a1 = ptrba[9];
1117 			res0_9 += a1*b0;
1118 			res1_9 += a1*b1;
1119 
1120 			a0 = ptrba[10];
1121 			res0_10 += a0*b0;
1122 			res1_10 += a0*b1;
1123 
1124 			a1 = ptrba[11];
1125 			res0_11 += a1*b0;
1126 			res1_11 += a1*b1;
1127 
1128 			a0 = ptrba[12];
1129 			res0_12 += a0*b0;
1130 			res1_12 += a0*b1;
1131 
1132 			a1 = ptrba[13];
1133 			res0_13 += a1*b0;
1134 			res1_13 += a1*b1;
1135 
1136 			a0 = ptrba[14];
1137 			res0_14 += a0*b0;
1138 			res1_14 += a0*b1;
1139 
1140 			a1 = ptrba[15];
1141 			res0_15 += a1*b0;
1142 			res1_15 += a1*b1;
1143 
1144 
1145 			ptrba = ptrba+16;
1146 			ptrbb = ptrbb+2;
1147                 }
1148 
1149 		res0_0 *= alpha;
1150 		res0_1 *= alpha;
1151 		res0_2 *= alpha;
1152 		res0_3 *= alpha;
1153 		res0_4 *= alpha;
1154 		res0_5 *= alpha;
1155 		res0_6 *= alpha;
1156 		res0_7 *= alpha;
1157 
1158 		res0_8  *= alpha;
1159 		res0_9  *= alpha;
1160 		res0_10 *= alpha;
1161 		res0_11 *= alpha;
1162 		res0_12 *= alpha;
1163 		res0_13 *= alpha;
1164 		res0_14 *= alpha;
1165 		res0_15 *= alpha;
1166 
1167 		res1_0 *= alpha;
1168 		res1_1 *= alpha;
1169 		res1_2 *= alpha;
1170 		res1_3 *= alpha;
1171 		res1_4 *= alpha;
1172 		res1_5 *= alpha;
1173 		res1_6 *= alpha;
1174 		res1_7 *= alpha;
1175 
1176 		res1_8  *= alpha;
1177 		res1_9  *= alpha;
1178 		res1_10 *= alpha;
1179 		res1_11 *= alpha;
1180 		res1_12 *= alpha;
1181 		res1_13 *= alpha;
1182 		res1_14 *= alpha;
1183 		res1_15 *= alpha;
1184 
1185 		C0[0] = res0_0;
1186 		C0[1] = res0_1;
1187 		C0[2] = res0_2;
1188 		C0[3] = res0_3;
1189 		C0[4] = res0_4;
1190 		C0[5] = res0_5;
1191 		C0[6] = res0_6;
1192 		C0[7] = res0_7;
1193 
1194 		C0[8]  = res0_8;
1195 		C0[9]  = res0_9;
1196 		C0[10] = res0_10;
1197 		C0[11] = res0_11;
1198 		C0[12] = res0_12;
1199 		C0[13] = res0_13;
1200 		C0[14] = res0_14;
1201 		C0[15] = res0_15;
1202 
1203 		C1[0] = res1_0;
1204 		C1[1] = res1_1;
1205 		C1[2] = res1_2;
1206 		C1[3] = res1_3;
1207 		C1[4] = res1_4;
1208 		C1[5] = res1_5;
1209 		C1[6] = res1_6;
1210 		C1[7] = res1_7;
1211 
1212 		C1[8]  = res1_8;
1213 		C1[9]  = res1_9;
1214 		C1[10] = res1_10;
1215 		C1[11] = res1_11;
1216 		C1[12] = res1_12;
1217 		C1[13] = res1_13;
1218 		C1[14] = res1_14;
1219 		C1[15] = res1_15;
1220 
1221 
1222 
1223 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1224 		temp = bk - off;
1225 #ifdef LEFT
1226 		temp -= 16; // number of values in A
1227 #else
1228 		temp -= 2; // number of values in B
1229 #endif
1230 		ptrba += temp*16;
1231 		ptrbb += temp*2;
1232 #endif
1233 
1234 #ifdef LEFT
1235 		off += 16; // number of values in A
1236 #endif
1237 
1238 		C0 = C0+16;
1239 		C1 = C1+16;
1240 	}
1241 
1242 
1243 
1244 
1245         if ( bm & 8)
1246         {
1247 
1248 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1249 		ptrbb = bb;
1250 #else
1251 		ptrba += off*8;
1252 		ptrbb = bb + off*2;
1253 #endif
1254 
1255 		res0_0 = 0;
1256 		res0_1 = 0;
1257 		res0_2 = 0;
1258 		res0_3 = 0;
1259 		res0_4 = 0;
1260 		res0_5 = 0;
1261 		res0_6 = 0;
1262 		res0_7 = 0;
1263 
1264 		res1_0 = 0;
1265 		res1_1 = 0;
1266 		res1_2 = 0;
1267 		res1_3 = 0;
1268 		res1_4 = 0;
1269 		res1_5 = 0;
1270 		res1_6 = 0;
1271 		res1_7 = 0;
1272 
1273 
1274 
1275 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1276 		temp = bk-off;
1277 #elif defined(LEFT)
1278 		temp = off+8;	// number of values in A
1279 #else
1280 		temp = off+2;	// number of values in B
1281 #endif
1282 
1283 		for (k=0; k<temp; k++)
1284                 {
1285 			b0 = ptrbb[0];
1286 			b1 = ptrbb[1];
1287 
1288 			a0 = ptrba[0];
1289 			res0_0 += a0*b0;
1290 			res1_0 += a0*b1;
1291 
1292 			a1 = ptrba[1];
1293 			res0_1 += a1*b0;
1294 			res1_1 += a1*b1;
1295 
1296 			a0 = ptrba[2];
1297 			res0_2 += a0*b0;
1298 			res1_2 += a0*b1;
1299 
1300 			a1 = ptrba[3];
1301 			res0_3 += a1*b0;
1302 			res1_3 += a1*b1;
1303 
1304 			a0 = ptrba[4];
1305 			res0_4 += a0*b0;
1306 			res1_4 += a0*b1;
1307 
1308 			a1 = ptrba[5];
1309 			res0_5 += a1*b0;
1310 			res1_5 += a1*b1;
1311 
1312 			a0 = ptrba[6];
1313 			res0_6 += a0*b0;
1314 			res1_6 += a0*b1;
1315 
1316 			a1 = ptrba[7];
1317 			res0_7 += a1*b0;
1318 			res1_7 += a1*b1;
1319 
1320 			ptrba = ptrba+8;
1321 			ptrbb = ptrbb+2;
1322                 }
1323 
1324 		res0_0 *= alpha;
1325 		res0_1 *= alpha;
1326 		res0_2 *= alpha;
1327 		res0_3 *= alpha;
1328 		res0_4 *= alpha;
1329 		res0_5 *= alpha;
1330 		res0_6 *= alpha;
1331 		res0_7 *= alpha;
1332 
1333 		res1_0 *= alpha;
1334 		res1_1 *= alpha;
1335 		res1_2 *= alpha;
1336 		res1_3 *= alpha;
1337 		res1_4 *= alpha;
1338 		res1_5 *= alpha;
1339 		res1_6 *= alpha;
1340 		res1_7 *= alpha;
1341 
1342 		C0[0] = res0_0;
1343 		C0[1] = res0_1;
1344 		C0[2] = res0_2;
1345 		C0[3] = res0_3;
1346 		C0[4] = res0_4;
1347 		C0[5] = res0_5;
1348 		C0[6] = res0_6;
1349 		C0[7] = res0_7;
1350 
1351 		C1[0] = res1_0;
1352 		C1[1] = res1_1;
1353 		C1[2] = res1_2;
1354 		C1[3] = res1_3;
1355 		C1[4] = res1_4;
1356 		C1[5] = res1_5;
1357 		C1[6] = res1_6;
1358 		C1[7] = res1_7;
1359 
1360 
1361 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1362 		temp = bk - off;
1363 #ifdef LEFT
1364 		temp -= 8; // number of values in A
1365 #else
1366 		temp -= 2; // number of values in B
1367 #endif
1368 		ptrba += temp*8;
1369 		ptrbb += temp*2;
1370 #endif
1371 
1372 #ifdef LEFT
1373 		off += 8; // number of values in A
1374 #endif
1375 
1376 		C0 = C0+8;
1377 		C1 = C1+8;
1378 	}
1379 
1380 	if ( bm & 4 )
1381 	{
1382 
1383 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1384 		ptrbb = bb;
1385 #else
1386 		ptrba += off*4;
1387 		ptrbb = bb + off*2;
1388 #endif
1389 
1390 		res0_0 = 0;
1391 		res0_1 = 0;
1392 		res0_2 = 0;
1393 		res0_3 = 0;
1394 
1395 		res1_0 = 0;
1396 		res1_1 = 0;
1397 		res1_2 = 0;
1398 		res1_3 = 0;
1399 
1400 
1401 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1402 		temp = bk-off;
1403 #elif defined(LEFT)
1404 		temp = off+4;	// number of values in A
1405 #else
1406 		temp = off+2;	// number of values in B
1407 #endif
1408 
1409 		for (k=0; k<temp; k++)
1410                 {
1411 			b0 = ptrbb[0];
1412 			b1 = ptrbb[1];
1413 
1414 			a0 = ptrba[0];
1415 			res0_0 += a0*b0;
1416 			res1_0 += a0*b1;
1417 
1418 			a1 = ptrba[1];
1419 			res0_1 += a1*b0;
1420 			res1_1 += a1*b1;
1421 
1422 			a0 = ptrba[2];
1423 			res0_2 += a0*b0;
1424 			res1_2 += a0*b1;
1425 
1426 			a1 = ptrba[3];
1427 			res0_3 += a1*b0;
1428 			res1_3 += a1*b1;
1429 
1430 			ptrba = ptrba+4;
1431 			ptrbb = ptrbb+2;
1432                 }
1433 
1434 		res0_0 *= alpha;
1435 		res0_1 *= alpha;
1436 		res0_2 *= alpha;
1437 		res0_3 *= alpha;
1438 
1439 		res1_0 *= alpha;
1440 		res1_1 *= alpha;
1441 		res1_2 *= alpha;
1442 		res1_3 *= alpha;
1443 
1444 		C0[0] = res0_0;
1445 		C0[1] = res0_1;
1446 		C0[2] = res0_2;
1447 		C0[3] = res0_3;
1448 
1449 		C1[0] = res1_0;
1450 		C1[1] = res1_1;
1451 		C1[2] = res1_2;
1452 		C1[3] = res1_3;
1453 
1454 
1455 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1456 		temp = bk - off;
1457 #ifdef LEFT
1458 		temp -= 4; // number of values in A
1459 #else
1460 		temp -= 2; // number of values in B
1461 #endif
1462 		ptrba += temp*4;
1463 		ptrbb += temp*2;
1464 #endif
1465 
1466 #ifdef LEFT
1467 		off += 4; // number of values in A
1468 #endif
1469 
1470 		C0 = C0+4;
1471 		C1 = C1+4;
1472 
1473 	}
1474 
1475 	if ( bm & 2 )
1476 	{
1477 
1478 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1479 		ptrbb = bb;
1480 #else
1481 		ptrba += off*2;
1482 		ptrbb = bb + off*2;
1483 #endif
1484 
1485 		res0_0 = 0;
1486 		res0_1 = 0;
1487 
1488 		res1_0 = 0;
1489 		res1_1 = 0;
1490 
1491 
1492 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1493 		temp = bk-off;
1494 #elif defined(LEFT)
1495 		temp = off+2;	// number of values in A
1496 #else
1497 		temp = off+2;	// number of values in B
1498 #endif
1499 
1500 		for (k=0; k<temp; k++)
1501                 {
1502 			b0 = ptrbb[0];
1503 			b1 = ptrbb[1];
1504 
1505 			a0 = ptrba[0];
1506 			res0_0 += a0*b0;
1507 			res1_0 += a0*b1;
1508 
1509 			a1 = ptrba[1];
1510 			res0_1 += a1*b0;
1511 			res1_1 += a1*b1;
1512 
1513 			ptrba = ptrba+2;
1514 			ptrbb = ptrbb+2;
1515                 }
1516 
1517 		res0_0 *= alpha;
1518 		res0_1 *= alpha;
1519 
1520 		res1_0 *= alpha;
1521 		res1_1 *= alpha;
1522 
1523 		C0[0] = res0_0;
1524 		C0[1] = res0_1;
1525 
1526 		C1[0] = res1_0;
1527 		C1[1] = res1_1;
1528 
1529 
1530 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1531 		temp = bk - off;
1532 #ifdef LEFT
1533 		temp -= 2; // number of values in A
1534 #else
1535 		temp -= 2; // number of values in B
1536 #endif
1537 		ptrba += temp*2;
1538 		ptrbb += temp*2;
1539 #endif
1540 
1541 #ifdef LEFT
1542 		off += 2; // number of values in A
1543 #endif
1544 
1545 		C0 = C0+2;
1546 		C1 = C1+2;
1547 
1548 	}
1549 
1550 	if ( bm & 1 )
1551 	{
1552 
1553 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1554 		ptrbb = bb;
1555 #else
1556 		ptrba += off*1;
1557 		ptrbb = bb + off*2;
1558 #endif
1559 
1560 		res0_0 = 0;
1561 
1562 		res1_0 = 0;
1563 
1564 
1565 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1566 		temp = bk-off;
1567 #elif defined(LEFT)
1568 		temp = off+1;	// number of values in A
1569 #else
1570 		temp = off+2;	// number of values in B
1571 #endif
1572 
1573 		for (k=0; k<temp; k++)
1574                 {
1575 			b0 = ptrbb[0];
1576 			b1 = ptrbb[1];
1577 
1578 			a0 = ptrba[0];
1579 			res0_0 += a0*b0;
1580 			res1_0 += a0*b1;
1581 
1582 			ptrba = ptrba+1;
1583 			ptrbb = ptrbb+2;
1584                 }
1585 
1586 		res0_0 *= alpha;
1587 
1588 		res1_0 *= alpha;
1589 
1590 		C0[0] = res0_0;
1591 
1592 		C1[0] = res1_0;
1593 
1594 
1595 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1596 		temp = bk - off;
1597 #ifdef LEFT
1598 		temp -= 1; // number of values in A
1599 #else
1600 		temp -= 2; // number of values in B
1601 #endif
1602 		ptrba += temp*1;
1603 		ptrbb += temp*2;
1604 #endif
1605 
1606 #ifdef LEFT
1607 		off += 1; // number of values in A
1608 #endif
1609 
1610 		C0 = C0+1;
1611 		C1 = C1+1;
1612 
1613 	}
1614 
1615 
1616 #if defined(TRMMKERNEL) && !defined(LEFT)
1617 		off += 2;
1618 #endif
1619 
1620         k = (bk<<1);
1621         bb = bb+k;
1622         i = (ldc<<1);
1623         C = C+i;
1624     }
1625 
1626 
1627    for (j=0; j<(bn&1); j+=1)
1628    {
1629         C0 = C;
1630 
1631 #if defined(TRMMKERNEL) &&  defined(LEFT)
1632 	off = offset;
1633 #endif
1634 
1635         ptrba = ba;
1636 
1637 
1638         for (i=0; i<bm/16; i+=1)
1639         {
1640 
1641 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1642 		ptrbb = bb;
1643 #else
1644 		ptrba += off*16;
1645 		ptrbb = bb + off*1;
1646 #endif
1647 
1648 		res0_0 = 0;
1649 		res0_1 = 0;
1650 		res0_2 = 0;
1651 		res0_3 = 0;
1652 		res0_4 = 0;
1653 		res0_5 = 0;
1654 		res0_6 = 0;
1655 		res0_7 = 0;
1656 
1657 		res0_8  = 0;
1658 		res0_9  = 0;
1659 		res0_10 = 0;
1660 		res0_11 = 0;
1661 		res0_12 = 0;
1662 		res0_13 = 0;
1663 		res0_14 = 0;
1664 		res0_15 = 0;
1665 
1666 
1667 
1668 
1669 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1670 		temp = bk-off;
1671 #elif defined(LEFT)
1672 		temp = off+16;	// number of values in A
1673 #else
1674 		temp = off+1;	// number of values in B
1675 #endif
1676 
1677 		for (k=0; k<temp; k++)
1678                 {
1679 			b0 = ptrbb[0];
1680 
1681 			a0 = ptrba[0];
1682 			res0_0 += a0*b0;
1683 
1684 			a1 = ptrba[1];
1685 			res0_1 += a1*b0;
1686 
1687 			a0 = ptrba[2];
1688 			res0_2 += a0*b0;
1689 
1690 			a1 = ptrba[3];
1691 			res0_3 += a1*b0;
1692 
1693 			a0 = ptrba[4];
1694 			res0_4 += a0*b0;
1695 
1696 			a1 = ptrba[5];
1697 			res0_5 += a1*b0;
1698 
1699 			a0 = ptrba[6];
1700 			res0_6 += a0*b0;
1701 
1702 			a1 = ptrba[7];
1703 			res0_7 += a1*b0;
1704 
1705 			a0 = ptrba[8];
1706 			res0_8 += a0*b0;
1707 
1708 			a1 = ptrba[9];
1709 			res0_9 += a1*b0;
1710 
1711 			a0 = ptrba[10];
1712 			res0_10 += a0*b0;
1713 
1714 			a1 = ptrba[11];
1715 			res0_11 += a1*b0;
1716 
1717 			a0 = ptrba[12];
1718 			res0_12 += a0*b0;
1719 
1720 			a1 = ptrba[13];
1721 			res0_13 += a1*b0;
1722 
1723 			a0 = ptrba[14];
1724 			res0_14 += a0*b0;
1725 
1726 			a1 = ptrba[15];
1727 			res0_15 += a1*b0;
1728 
1729 
1730 			ptrba = ptrba+16;
1731 			ptrbb = ptrbb+1;
1732                 }
1733 
1734 		res0_0 *= alpha;
1735 		res0_1 *= alpha;
1736 		res0_2 *= alpha;
1737 		res0_3 *= alpha;
1738 		res0_4 *= alpha;
1739 		res0_5 *= alpha;
1740 		res0_6 *= alpha;
1741 		res0_7 *= alpha;
1742 
1743 		res0_8  *= alpha;
1744 		res0_9  *= alpha;
1745 		res0_10 *= alpha;
1746 		res0_11 *= alpha;
1747 		res0_12 *= alpha;
1748 		res0_13 *= alpha;
1749 		res0_14 *= alpha;
1750 		res0_15 *= alpha;
1751 
1752 		C0[0] = res0_0;
1753 		C0[1] = res0_1;
1754 		C0[2] = res0_2;
1755 		C0[3] = res0_3;
1756 		C0[4] = res0_4;
1757 		C0[5] = res0_5;
1758 		C0[6] = res0_6;
1759 		C0[7] = res0_7;
1760 
1761 		C0[8]  = res0_8;
1762 		C0[9]  = res0_9;
1763 		C0[10] = res0_10;
1764 		C0[11] = res0_11;
1765 		C0[12] = res0_12;
1766 		C0[13] = res0_13;
1767 		C0[14] = res0_14;
1768 		C0[15] = res0_15;
1769 
1770 
1771 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1772 		temp = bk - off;
1773 #ifdef LEFT
1774 		temp -= 16; // number of values in A
1775 #else
1776 		temp -= 1; // number of values in B
1777 #endif
1778 		ptrba += temp*16;
1779 		ptrbb += temp*1;
1780 #endif
1781 
1782 #ifdef LEFT
1783 		off += 16; // number of values in A
1784 #endif
1785 
1786 		C0 = C0+16;
1787 	}
1788 
1789 
1790 
1791 
1792         if ( bm & 8 )
1793         {
1794 
1795 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1796 		ptrbb = bb;
1797 #else
1798 		ptrba += off*8;
1799 		ptrbb = bb + off*1;
1800 #endif
1801 
1802 		res0_0 = 0;
1803 		res0_1 = 0;
1804 		res0_2 = 0;
1805 		res0_3 = 0;
1806 		res0_4 = 0;
1807 		res0_5 = 0;
1808 		res0_6 = 0;
1809 		res0_7 = 0;
1810 
1811 
1812 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1813 		temp = bk-off;
1814 #elif defined(LEFT)
1815 		temp = off+8;	// number of values in A
1816 #else
1817 		temp = off+1;	// number of values in B
1818 #endif
1819 
1820 		for (k=0; k<temp; k++)
1821                 {
1822 			b0 = ptrbb[0];
1823 
1824 			a0 = ptrba[0];
1825 			res0_0 += a0*b0;
1826 
1827 			a1 = ptrba[1];
1828 			res0_1 += a1*b0;
1829 
1830 			a0 = ptrba[2];
1831 			res0_2 += a0*b0;
1832 
1833 			a1 = ptrba[3];
1834 			res0_3 += a1*b0;
1835 
1836 			a0 = ptrba[4];
1837 			res0_4 += a0*b0;
1838 
1839 			a1 = ptrba[5];
1840 			res0_5 += a1*b0;
1841 
1842 			a0 = ptrba[6];
1843 			res0_6 += a0*b0;
1844 
1845 			a1 = ptrba[7];
1846 			res0_7 += a1*b0;
1847 
1848 			ptrba = ptrba+8;
1849 			ptrbb = ptrbb+1;
1850                 }
1851 
1852 		res0_0 *= alpha;
1853 		res0_1 *= alpha;
1854 		res0_2 *= alpha;
1855 		res0_3 *= alpha;
1856 		res0_4 *= alpha;
1857 		res0_5 *= alpha;
1858 		res0_6 *= alpha;
1859 		res0_7 *= alpha;
1860 
1861 		C0[0] = res0_0;
1862 		C0[1] = res0_1;
1863 		C0[2] = res0_2;
1864 		C0[3] = res0_3;
1865 		C0[4] = res0_4;
1866 		C0[5] = res0_5;
1867 		C0[6] = res0_6;
1868 		C0[7] = res0_7;
1869 
1870 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1871 		temp = bk - off;
1872 #ifdef LEFT
1873 		temp -= 8; // number of values in A
1874 #else
1875 		temp -= 1; // number of values in B
1876 #endif
1877 		ptrba += temp*8;
1878 		ptrbb += temp*1;
1879 #endif
1880 
1881 #ifdef LEFT
1882 		off += 8; // number of values in A
1883 #endif
1884 
1885 		C0 = C0+8;
1886 	}
1887 
1888 	if ( bm & 4 )
1889 	{
1890 
1891 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1892 		ptrbb = bb;
1893 #else
1894 		ptrba += off*4;
1895 		ptrbb = bb + off*1;
1896 #endif
1897 
1898 		res0_0 = 0;
1899 		res0_1 = 0;
1900 		res0_2 = 0;
1901 		res0_3 = 0;
1902 
1903 
1904 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1905 		temp = bk-off;
1906 #elif defined(LEFT)
1907 		temp = off+4;	// number of values in A
1908 #else
1909 		temp = off+1;	// number of values in B
1910 #endif
1911 
1912 		for (k=0; k<temp; k++)
1913                 {
1914 			b0 = ptrbb[0];
1915 
1916 			a0 = ptrba[0];
1917 			res0_0 += a0*b0;
1918 
1919 			a1 = ptrba[1];
1920 			res0_1 += a1*b0;
1921 
1922 			a0 = ptrba[2];
1923 			res0_2 += a0*b0;
1924 
1925 			a1 = ptrba[3];
1926 			res0_3 += a1*b0;
1927 
1928 			ptrba = ptrba+4;
1929 			ptrbb = ptrbb+1;
1930                 }
1931 
1932 		res0_0 *= alpha;
1933 		res0_1 *= alpha;
1934 		res0_2 *= alpha;
1935 		res0_3 *= alpha;
1936 
1937 		C0[0] = res0_0;
1938 		C0[1] = res0_1;
1939 		C0[2] = res0_2;
1940 		C0[3] = res0_3;
1941 
1942 
1943 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1944 		temp = bk - off;
1945 #ifdef LEFT
1946 		temp -= 4; // number of values in A
1947 #else
1948 		temp -= 1; // number of values in B
1949 #endif
1950 		ptrba += temp*4;
1951 		ptrbb += temp*1;
1952 #endif
1953 
1954 #ifdef LEFT
1955 		off += 4; // number of values in A
1956 #endif
1957 
1958 		C0 = C0+4;
1959 
1960 	}
1961 
1962 	if ( bm & 2 )
1963 	{
1964 
1965 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1966 		ptrbb = bb;
1967 #else
1968 		ptrba += off*2;
1969 		ptrbb = bb + off*1;
1970 #endif
1971 
1972 		res0_0 = 0;
1973 		res0_1 = 0;
1974 
1975 
1976 
1977 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1978 		temp = bk-off;
1979 #elif defined(LEFT)
1980 		temp = off+2;	// number of values in A
1981 #else
1982 		temp = off+1;	// number of values in B
1983 #endif
1984 
1985 		for (k=0; k<temp; k++)
1986                 {
1987 			b0 = ptrbb[0];
1988 
1989 			a0 = ptrba[0];
1990 			res0_0 += a0*b0;
1991 
1992 			a1 = ptrba[1];
1993 			res0_1 += a1*b0;
1994 
1995 			ptrba = ptrba+2;
1996 			ptrbb = ptrbb+1;
1997                 }
1998 
1999 		res0_0 *= alpha;
2000 		res0_1 *= alpha;
2001 
2002 		C0[0] = res0_0;
2003 		C0[1] = res0_1;
2004 
2005 
2006 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2007 		temp = bk - off;
2008 #ifdef LEFT
2009 		temp -= 2; // number of values in A
2010 #else
2011 		temp -= 1; // number of values in B
2012 #endif
2013 		ptrba += temp*2;
2014 		ptrbb += temp*1;
2015 #endif
2016 
2017 #ifdef LEFT
2018 		off += 2; // number of values in A
2019 #endif
2020 
2021 		C0 = C0+2;
2022 
2023 	}
2024 
2025 	if ( bm & 1 )
2026 	{
2027 
2028 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2029 		ptrbb = bb;
2030 #else
2031 		ptrba += off*1;
2032 		ptrbb = bb + off*1;
2033 #endif
2034 
2035 		res0_0 = 0;
2036 
2037 
2038 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2039 		temp = bk-off;
2040 #elif defined(LEFT)
2041 		temp = off+1;	// number of values in A
2042 #else
2043 		temp = off+1;	// number of values in B
2044 #endif
2045 
2046 		for (k=0; k<temp; k++)
2047                 {
2048 			b0 = ptrbb[0];
2049 
2050 			a0 = ptrba[0];
2051 			res0_0 += a0*b0;
2052 
2053 			ptrba = ptrba+1;
2054 			ptrbb = ptrbb+1;
2055                 }
2056 
2057 		res0_0 *= alpha;
2058 
2059 		C0[0] = res0_0;
2060 
2061 
2062 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2063 		temp = bk - off;
2064 #ifdef LEFT
2065 		temp -= 1; // number of values in A
2066 #else
2067 		temp -= 1; // number of values in B
2068 #endif
2069 		ptrba += temp*1;
2070 		ptrbb += temp*1;
2071 #endif
2072 
2073 #ifdef LEFT
2074 		off += 1; // number of values in A
2075 #endif
2076 
2077 		C0 = C0+1;
2078 
2079 	}
2080 
2081 
2082 
2083 #if defined(TRMMKERNEL) && !defined(LEFT)
2084 		off += 1;
2085 #endif
2086 
2087         k = (bk<<0);
2088         bb = bb+k;
2089         C = C+ldc;
2090    }
2091    return 0;
2092 }
2093