1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Integer register roles used throughout the kernel.
 *
 * M, N and K are the three size arguments, taken from the first three
 * incoming registers.  The code tiles M (4, then 2, then 1 at a time),
 * groups N into panels of 4 followed by an "N & 2" panel, and walks K in
 * the innermost loops.
 */
42#define M	%i0
43#define N	%i1
44#define K	%i2
45
/*
 * In the 32-bit DOUBLE build the prologue stores %i3 and %i4 to the stack
 * and reloads them as ALPHA, so A comes from %i5 and B is then loaded from
 * the stack into %i4.  In every other build A and B are %i4 and %i5.
 */
46#if defined(DOUBLE) && !defined(__64BIT__)
47#define A	%i5
48#define B	%i4
49#else
50#define A	%i4
51#define B	%i5
52#endif
53
/* C and LDC are loaded from the stack by the prologue in every build. */
54#define C	%o4
55#define LDC	%o5
56
/*
 * AO / BO: moving read pointers into A and B.
 * I, J, L: loop counters (I: tiles along M, J: panels along N, L: K steps).
 */
57#define AO	%l0
58#define BO	%l1
59#define I	%l2
60#define J	%l3
61#define L	%l4
62
/* C1..C4: write pointers to four consecutive LDC-strided lines of C. */
63#define C1	%o0
64#define C2	%o1
65#define C3	%o2
66#define C4	%o3
67
/*
 * OFFSET and KK are only read or written under TRMMKERNEL.
 * TEMP1 / TEMP2 are scratch.  TEMP2 reuses %i3; in the 32-bit builds the
 * prologue has already stored the incoming %i3 to the stack by then.
 */
68#define OFFSET	%l5
69#define	KK	%l6
70#define TEMP1	%l7
71#define TEMP2	%i3
/*
 * Floating-point register roles.
 *
 *   c01..c16  accumulators (16 of them, enough for a 4x4 tile)
 *   t1..t4    products written by FMUL and folded into an accumulator by a
 *             later FADD
 *   a1..a5    operands loaded through AO
 *   b1..b5    operands loaded through BO
 *   FZERO     source operand of every "FMOV FZERO, x" clear
 *   ALPHA     factor the accumulators are multiplied by before being stored
 *
 * The DOUBLE build uses the even-numbered registers %f0..%f62; the
 * single-precision build uses %f0..%f31.
 */
73#ifdef DOUBLE
74#define c01	%f0
75#define c02	%f2
76#define c03	%f4
77#define c04	%f6
78#define c05	%f8
79#define c06	%f10
80#define c07	%f12
81#define c08	%f14
82#define c09	%f16
83#define c10	%f18
84#define c11	%f20
85#define c12	%f22
86#define c13	%f24
87#define c14	%f26
88#define c15	%f28
89#define c16	%f30
90
91#define t1	%f32
92#define	t2 	%f34
93#define t3	%f36
94#define	t4 	%f38
95
96#define a1	%f40
97#define a2	%f42
98#define a3	%f44
99#define a4	%f46
100#define a5	%f58
101
102#define b1	%f48
103#define b2	%f50
104#define b3	%f52
105#define b4	%f54
106#define b5	%f56
107
/* NOTE(review): the prologue calls FCLR(29); presumably that macro (from
   common.h) zeroes FZERO in both builds -- confirm against its definition. */
108#define FZERO	%f60
109#define ALPHA	%f62
110#else
111#define c01	%f0
112#define c02	%f1
113#define c03	%f2
114#define c04	%f3
115#define c05	%f4
116#define c06	%f5
117#define c07	%f6
118#define c08	%f7
119#define c09	%f8
120#define c10	%f9
121#define c11	%f10
122#define c12	%f11
123#define c13	%f12
124#define c14	%f13
125#define c15	%f14
126#define c16	%f15
127
128#define t1	%f16
129#define	t2 	%f17
130#define t3	%f18
131#define	t4 	%f19
132
133#define a1	%f20
134#define a2	%f21
135#define a3	%f22
136#define a4	%f23
137#define a5	%f31
138
139#define b1	%f24
140#define b2	%f25
141#define b3	%f26
142#define b4	%f27
143#define b5	%f28
144
145#define FZERO	%f29
146#define ALPHA	%f30
147#endif
148
/*
 * Kernel entry: collect the arguments that are not already in their alias
 * registers, set up ALPHA and FZERO, and start the loop over groups of four
 * along N.
 *
 * PROLOGUE, SAVESP, STACK_START, LDF, FMOV, FCLR and BASE_SHIFT come from
 * common.h.  NOTE(review): PROLOGUE/SAVESP presumably emit the function
 * label and the register-window save -- confirm in common.h.
 */
149	PROLOGUE
150	SAVESP
151	nop
152
153#ifndef __64BIT__
154
155#ifdef DOUBLE
/* 32-bit, double: store %i3/%i4 so the pair can be reloaded as ALPHA. */
156	st	%i3, [%sp + STACK_START + 16]   /* ALPHA */
157	st	%i4, [%sp + STACK_START + 20]
158
/* B (= %i4, just stored above), C, LDC and OFFSET come from the stack. */
159	ld	[%sp + STACK_START + 28], B
160	ld	[%sp + STACK_START + 32], C
161	ld	[%sp + STACK_START + 36], LDC
162#ifdef TRMMKERNEL
163	ld	[%sp + STACK_START + 40], OFFSET
164#endif
165#else
/* 32-bit, single: only %i3 is stored; A and B stay in %i4/%i5. */
166	st	%i3, [%sp + STACK_START + 16]   /* ALPHA */
167
168	ld	[%sp + STACK_START + 28], C
169	ld	[%sp + STACK_START + 32], LDC
170#ifdef TRMMKERNEL
171	ld	[%sp + STACK_START + 36], OFFSET
172#endif
173#endif
/* Reload the value stored at +16 into the floating-point ALPHA register. */
174	LDF	[%sp + STACK_START + 16], ALPHA
175#else
/* 64-bit: C, LDC and OFFSET are fetched with 64-bit loads; ALPHA is copied
   from %f6 (double) or %f7 (single). */
176	ldx	[%sp+  STACK_START + 56], C
177	ldx	[%sp+  STACK_START + 64], LDC
178#ifdef TRMMKERNEL
179	ldx	[%sp+  STACK_START + 72], OFFSET
180#endif
181#ifdef DOUBLE
182	FMOV	%f6, ALPHA
183#else
184	FMOV	%f7, ALPHA
185#endif
186#endif
187
/* NOTE(review): presumably zeroes FZERO -- confirm FCLR in common.h. */
188	FCLR(29)
189
190#if defined(TRMMKERNEL) && !defined(LEFT)
/* TRMM, not LEFT: KK starts at -OFFSET. */
191	neg	OFFSET, KK
192#endif
193
/* J = N >> 2.  When J <= 0 skip to .LL100.  The sll sits in the delay slot
   of a non-annulled branch, so LDC is shifted left by BASE_SHIFT on both
   paths (presumably converting it to a byte stride -- confirm BASE_SHIFT). */
194	sra	N, 2, J
195	cmp	J, 0
196	ble,pn	%icc, .LL100
197	sll	LDC, BASE_SHIFT, LDC
198
/*
 * .LL11 -- top of the loop over J (groups of four along N).
 *
 * Sets C1 = C, C2 = C1 + LDC, C3 = C2 + LDC, C4 = C3 + LDC and advances C
 * by 4 * LDC for the next pass.  Also resets AO to A, sets L = K >> 2 and
 * I = M >> 2, and clears t1..t4.  For TRMM with LEFT, KK restarts at OFFSET.
 * When I <= 0 control goes to .LL50.
 */
199.LL11:
200	add	C, LDC, C2
201	FMOV	FZERO, t1
202	nop
203	mov	C, C1
204
205	add	C2, LDC, C3
206	FMOV	FZERO, t2
207	sra	K, 2, L
208	mov	A, AO
209
210	sra	M, 2, I
211	add	C3, LDC, C4
212	FMOV	FZERO, t3
213
214#if defined(TRMMKERNEL) &&  defined(LEFT)
215	mov	OFFSET, KK
216#endif
217
218	cmp	I, 0
219	add	C4, LDC, C
220	FMOV	FZERO, t4
221
/* Delay slot: c01 is cleared whether or not the branch is taken. */
222	ble,pn	%icc, .LL50
223	FMOV	FZERO, c01
224
224
/*
 * .LL21 -- set-up for one 4x4 tile.
 *
 * Clears c02..c16 (c01 is cleared in the delay slot of each branch that
 * leads here), points BO at the current B data, preloads a1..a4 / b1..b4
 * plus a5 / b5 from offset 4, and prefetches C1..C4.  The condition codes
 * from "cmp L, 0" are consumed by the ble at the end of this block: when
 * L <= 0 the unrolled loop .LL22 is skipped in favour of .LL25.
 */
225.LL21:
226#if !defined(TRMMKERNEL)
227	FMOV	FZERO, c02
228	mov	B, BO
229
230	FMOV	FZERO, c03
231	cmp	L,  0
232#else
233	FMOV	FZERO, c02
234	FMOV	FZERO, c03
235
236#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
237	mov	B, BO
238#else
/* Skip KK << (2 + BASE_SHIFT) bytes in both A and B (presumably KK steps of
   four elements each -- confirm BASE_SHIFT). */
239	sll	KK, 2 + BASE_SHIFT, TEMP1
240
241	add	AO, TEMP1, AO
242	add	B,  TEMP1, BO
243#endif
244
/* TRMM trip count: K - KK or KK + 4, then divided by the unroll factor. */
245#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
246	sub	K, KK, L
247#elif defined(LEFT)
248	add	KK, 4, L
249#else
250	add	KK, 4, L
251#endif
252	sra	L, 2, L
253	cmp	L,  0
254#endif
255
/* Operand preload interleaved with the remaining accumulator clears. */
256	LDF	[AO + 0 * SIZE], a1
257	FMOV	FZERO, c04
258	LDF	[BO + 0 * SIZE], b1
259	FMOV	FZERO, c05
260	LDF	[AO + 1 * SIZE], a2
261	FMOV	FZERO, c06
262	LDF	[BO + 1 * SIZE], b2
263	FMOV	FZERO, c07
264
265	LDF	[AO + 2 * SIZE], a3
266	FMOV	FZERO, c08
267	LDF	[BO + 2 * SIZE], b3
268	FMOV	FZERO, c09
269	LDF	[AO + 3 * SIZE], a4
270	FMOV	FZERO, c10
271	LDF	[BO + 3 * SIZE], b4
272	FMOV	FZERO, c11
273	LDF	[BO +  4 * SIZE], b5	/* ***** */
274
275	LDF	[AO +  4 * SIZE], a5	/* ***** */
276
277	prefetch [C1 + 3 * SIZE], 3
278	FMOV	FZERO, c12
279	prefetch [C2 + 3 * SIZE], 3
280	FMOV	FZERO, c13
281	prefetch [C3 + 3 * SIZE], 3
282	FMOV	FZERO, c14
283	prefetch [C4 + 3 * SIZE], 3
284	FMOV	FZERO, c15
285
/* Delay slot: c16 is cleared on both paths. */
286	ble,pn	%icc, .LL25
287	FMOV	FZERO, c16
288
289
/*
 * Prefetch tuning for the inner loops: the distance, in elements, ahead of
 * AO / BO that is prefetched, and the prefetch function code that is used.
 */
290#define APREFETCHSIZE 40
291#define BPREFETCHSIZE 40
292
293#define APREFETCH_CATEGORY 0
294#define BPREFETCH_CATEGORY 0
295
/*
 * .LL22 -- main K loop of the 4x4 tile.
 *
 * Each pass advances AO and BO by 16 * SIZE and decrements L once, i.e. it
 * covers four K steps of four A and four B elements each.  The loop is
 * software-pipelined: every FADD folds the product left in t1..t4 by an
 * earlier FMUL into its accumulator, so the last four products are still
 * pending in t1..t4 when the loop exits.  Loads refill a1..a5 / b1..b5 with
 * negative offsets once the pointers have been advanced.  The statement
 * order is part of the schedule; do not reorder it casually.
 */
296.LL22:
297	FADD	c04, t1, c04
298	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
299	FMUL	a1, b1, t1
300	nop
301
302	FADD	c08, t2, c08
303	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
304	FMUL	a1, b2, t2
305	add	AO, 16 * SIZE, AO
306
307	FADD	c12, t3, c12
308	LDF	[AO - 13 * SIZE], a4
309	FMUL	a1, b3, t3
310	add	BO, 16 * SIZE, BO
311
312	FADD	c16, t4, c16
313	nop
314	FMUL	a1, b4, t4
315	LDF	[AO -  8 * SIZE], a1
316
317	FADD	c01, t1, c01
318	nop
319	FMUL	a2, b1, t1
320	nop
321
322	FADD	c05, t2, c05
323	nop
324	FMUL	a2, b2, t2
325	nop
326
327	FADD	c09, t3, c09
328	nop
329	FMUL	a2, b3, t3
330	nop
331
332	FADD	c13, t4, c13
333	add	L, -1, L
334	FMUL	a2, b4, t4
335	LDF	[AO - 11 * SIZE], a2
336
337	FADD	c02, t1, c02
338	nop
339	FMUL	a3, b1, t1
340	nop
341
342	FADD	c06, t2, c06
343	nop
344	FMUL	a3, b2, t2
345	nop
346
347	FADD	c10, t3, c10
348	nop
349	FMUL	a3, b3, t3
350	nop
351
352	FADD	c14, t4, c14
353	nop
354	FMUL	a3, b4, t4
355	LDF	[AO - 10 * SIZE], a3
356
357	FADD	c03, t1, c03
358	nop
359	FMUL	a4, b1, t1
360	LDF	[BO -  8 * SIZE], b1
361
362	FADD	c07, t2, c07
363	nop
364	FMUL	a4, b2, t2
365	LDF	[BO - 11 * SIZE], b2
366
367	FADD	c11, t3, c11
368	nop
369	FMUL	a4, b3, t3
370	LDF	[BO - 10 * SIZE], b3
371
372	FADD	c15, t4, c15
373	nop
374	FMUL	a4, b4, t4
375	LDF	[BO -  9 * SIZE], b4
376
377	FADD	c04, t1, c04
378	nop
379	FMUL	a5, b5, t1
380	LDF	[AO -  9 * SIZE], a4
381
382	FADD	c08, t2, c08
383	nop
384	FMUL	a5, b2, t2
385	nop
386
387	FADD	c12, t3, c12
388	nop
389	FMUL	a5, b3, t3
390	nop
391
392	FADD	c16, t4, c16
393	nop
394	FMUL	a5, b4, t4
395	LDF	[AO - 4 * SIZE], a5
396
397	FADD	c01, t1, c01
398	nop
399	FMUL	a2, b5, t1
400	nop
401
402	FADD	c05, t2, c05
403	nop
404	FMUL	a2, b2, t2
405	nop
406
407	FADD	c09, t3, c09
408	nop
409	FMUL	a2, b3, t3
410	nop
411
412	FADD	c13, t4, c13
413	nop
414	FMUL	a2, b4, t4
415	LDF	[AO -  7 * SIZE], a2
416
417	FADD	c02, t1, c02
418	nop
419	FMUL	a3, b5, t1
420	nop
421
422	FADD	c06, t2, c06
423	nop
424	FMUL	a3, b2, t2
425	nop
426
427	FADD	c10, t3, c10
428	nop
429	FMUL	a3, b3, t3
430	nop
431
432	FADD	c14, t4, c14
433	nop
434	FMUL	a3, b4, t4
435	LDF	[AO -  6 * SIZE], a3
436
437	FADD	c03, t1, c03
438	nop
439	FMUL	a4, b5, t1
440	LDF	[BO - 4 * SIZE], b5
441
442	FADD	c07, t2, c07
443	nop
444	FMUL	a4, b2, t2
445	LDF	[BO -  7 * SIZE], b2
446
447	FADD	c11, t3, c11
448	nop
449	FMUL	a4, b3, t3
450	LDF	[BO -  6 * SIZE], b3
451
452	FADD	c15, t4, c15
453	nop
454	FMUL	a4, b4, t4
455	LDF	[BO -  5 * SIZE], b4
456
457	FADD	c04, t1, c04
458	nop
459	FMUL	a1, b1, t1
460	LDF	[AO -  5 * SIZE], a4
461
462	FADD	c08, t2, c08
463	nop
464	FMUL	a1, b2, t2
465	nop
466
467	FADD	c12, t3, c12
468	nop
469	FMUL	a1, b3, t3
470	nop
471
472	FADD	c16, t4, c16
473	nop
474	FMUL	a1, b4, t4
475	LDF	[AO -  0 * SIZE], a1
476
477	FADD	c01, t1, c01
478	nop
479	FMUL	a2, b1, t1
480	nop
481
482#ifdef DOUBLE
483	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
484#else
485	nop
486#endif
487	FADD	c05, t2, c05
488	nop
489	FMUL	a2, b2, t2
490
491	FADD	c09, t3, c09
492	nop
493	FMUL	a2, b3, t3
494	nop
495
496	FADD	c13, t4, c13
497	nop
498	FMUL	a2, b4, t4
499	nop
500
501	FADD	c02, t1, c02
502	nop
503	FMUL	a3, b1, t1
504	LDF	[AO - 3 * SIZE], a2
505
506	FADD	c06, t2, c06
507#ifdef DOUBLE
508	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
509#else
510	nop
511#endif
512	FMUL	a3, b2, t2
513	nop
514
515	FADD	c10, t3, c10
516	nop
517	FMUL	a3, b3, t3
518	nop
519
520	FADD	c14, t4, c14
521	nop
522	FMUL	a3, b4, t4
523	LDF	[AO - 2 * SIZE], a3
524
525	FADD	c03, t1, c03
526	nop
527	FMUL	a4, b1, t1
528	LDF	[BO -  0 * SIZE], b1
529
530	FADD	c07, t2, c07
531	nop
532	FMUL	a4, b2, t2
533	LDF	[BO - 3 * SIZE], b2
534
535	FADD	c11, t3, c11
536	nop
537	FMUL	a4, b3, t3
538	LDF	[BO - 2 * SIZE], b3
539
540	FADD	c15, t4, c15
541	nop
542	FMUL	a4, b4, t4
543	LDF	[BO - 1 * SIZE], b4
544
545	FADD	c04, t1, c04
546	nop
547	FMUL	a5, b5, t1
548	LDF	[AO - 1 * SIZE], a4
549
550	FADD	c08, t2, c08
551	FMUL	a5, b2, t2
552	FADD	c12, t3, c12
553	FMUL	a5, b3, t3
554
555	FADD	c16, t4, c16
556	nop
557	FMUL	a5, b4, t4
558	LDF	[AO +  4 * SIZE], a5
559
560	FADD	c01, t1, c01
561	nop
562	FMUL	a2, b5, t1
563	nop
564
565	FADD	c05, t2, c05
566	nop
567	FMUL	a2, b2, t2
568	nop
569
570	FADD	c09, t3, c09
571	nop
572	FMUL	a2, b3, t3
573	nop
574
575	FADD	c13, t4, c13
576	nop
577	FMUL	a2, b4, t4
578	LDF	[AO +  1 * SIZE], a2
579
580	FADD	c02, t1, c02
581	nop
582	FMUL	a3, b5, t1
583	nop
584
585	FADD	c06, t2, c06
586	nop
587	FMUL	a3, b2, t2
588	nop
589
590	FADD	c10, t3, c10
591	nop
592	FMUL	a3, b3, t3
593	nop
594
595	FADD	c14, t4, c14
596	nop
597	FMUL	a3, b4, t4
598	LDF	[AO +  2 * SIZE], a3
599
600	FADD	c03, t1, c03
601	cmp	L, 0
602	FMUL	a4, b5, t1
603	LDF	[BO +  4 * SIZE], b5
604
605	FADD	c07, t2, c07
606	nop
607	FMUL	a4, b2, t2
608	LDF	[BO +  1 * SIZE], b2
609
610	FADD	c11, t3, c11
611	nop
612	FMUL	a4, b3, t3
613	LDF	[BO +  2 * SIZE], b3
614
615	FADD	c15, t4, c15
616	FMUL	a4, b4, t4
617	bg,pt	%icc, .LL22
618	LDF	[BO +  3 * SIZE], b4
619
/*
 * .LL25 -- K remainder for the 4x4 tile.
 *
 * L becomes the low two bits of the K trip count (K itself, or the TRMM
 * count K - KK / KK + 4).  When that is zero the remainder loop is skipped
 * and control goes straight to the tile epilogue .LL29.
 */
620.LL25:
621#ifndef TRMMKERNEL
622	and	K, 3, L
623#else
624#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
625	sub	K, KK, L
626#elif defined(LEFT)
627	add	KK, 4, L
628#else
629	add	KK, 4, L
630#endif
631	and	L, 3,  L
632#endif
633	cmp	L,  0
/* Annulled branch: the delay-slot nop only executes when the branch is taken. */
634	ble,a,pn %icc, .LL29
635	nop
636
/*
 * .LL26 -- one K step per pass (AO and BO advance by 4 * SIZE, L by -1),
 * using the same pending-product scheme as the unrolled loop.
 */
637.LL26:
638	FADD	c04, t1, c04
639	LDF	[AO +  3 * SIZE], a4
640	FMUL	a1, b1, t1
641	add	AO, 4 * SIZE, AO
642
643	FADD	c08, t2, c08
644	add	BO, 4 * SIZE, BO
645	FMUL	a1, b2, t2
646	add	L, -1, L
647
648	FADD	c12, t3, c12
649	nop
650	FMUL	a1, b3, t3
651	cmp	L, 0
652
653	FADD	c16, t4, c16
654	nop
655	FMUL	a1, b4, t4
656	LDF	[AO + 0 * SIZE], a1
657
658	FADD	c01, t1, c01
659	nop
660	FMUL	a2, b1, t1
661	nop
662
663	FADD	c05, t2, c05
664	nop
665	FMUL	a2, b2, t2
666	nop
667
668	FADD	c09, t3, c09
669	nop
670	FMUL	a2, b3, t3
671	nop
672
673	FADD	c13, t4, c13
674	nop
675	FMUL	a2, b4, t4
676	LDF	[AO + 1 * SIZE], a2
677
678	FADD	c02, t1, c02
679	nop
680	FMUL	a3, b1, t1
681	nop
682
683	FADD	c06, t2, c06
684	nop
685	FMUL	a3, b2, t2
686	nop
687
688	FADD	c10, t3, c10
689	nop
690	FMUL	a3, b3, t3
691	nop
692
693	FADD	c14, t4, c14
694	nop
695	FMUL	a3, b4, t4
696	LDF	[AO + 2 * SIZE], a3
697
698	FADD	c03, t1, c03
699	nop
700	FMUL	a4, b1, t1
701	LDF	[BO + 0 * SIZE], b1
702
703	FADD	c07, t2, c07
704	nop
705	FMUL	a4, b2, t2
706	LDF	[BO + 1 * SIZE], b2
707
708	FADD	c11, t3, c11
709	nop
710	FMUL	a4, b3, t3
711	LDF	[BO + 2 * SIZE], b3
712
713	FADD	c15, t4, c15
714	FMUL	a4, b4, t4
715	bg,pt	%icc, .LL26
716	LDF	[BO + 3 * SIZE], b4
717
/*
 * .LL29 -- epilogue of the 4x4 tile.
 *
 * Folds the four products still pending in t1..t4 into c04/c08/c12/c16 and
 * multiplies all sixteen accumulators by ALPHA.
 *   - GEMM build: the current contents of C1..C4 are loaded and added
 *     before the results are stored.
 *   - TRMM build: the results are stored directly; AO/BO and KK are then
 *     adjusted according to LEFT/TRANSA.
 * In both builds C1..C4 advance by 4 * SIZE, t1..t4 are cleared again, I is
 * decremented, and control returns to .LL21 while I > 0 (with L reloaded as
 * K >> 2 and c01 cleared in the delay slot).
 */
718.LL29:
719#ifndef TRMMKERNEL
720	FADD	c04, t1, c04
721	add	I, -1, I
722	FMUL	c01, ALPHA, c01
723	LDF	[C1 + 0 * SIZE], a1
724
725	FADD	c08, t2, c08
726	cmp	I, 0
727	FMUL	c02, ALPHA, c02
728	LDF	[C1 + 1 * SIZE], a2
729
730	FADD	c12, t3, c12
731	nop
732	FMUL	c03, ALPHA, c03
733	LDF	[C1 + 2 * SIZE], a3
734
735	FADD	c16, t4, c16
736	nop
737	FMUL	c04, ALPHA, c04
738	LDF	[C1 + 3 * SIZE], a4
739
740	FMUL	c05, ALPHA, c05
741	LDF	[C2 + 0 * SIZE], b1
742	FMUL	c06, ALPHA, c06
743	LDF	[C2 + 1 * SIZE], b2
744
745	FMUL	c07, ALPHA, c07
746	LDF	[C2 + 2 * SIZE], b3
747	FMUL	c08, ALPHA, c08
748	LDF	[C2 + 3 * SIZE], b4
749
750	FMUL	c09, ALPHA, c09
751	LDF	[C3 + 0 * SIZE], t1
752	FMUL	c10, ALPHA, c10
753	LDF	[C3 + 1 * SIZE], t2
754
755	FMUL	c11, ALPHA, c11
756	LDF	[C3 + 2 * SIZE], t3
757	FMUL	c12, ALPHA, c12
758	LDF	[C3 + 3 * SIZE], t4
759
760	FMUL	c13, ALPHA, c13
761	add	C1, 4 * SIZE, C1
762	FADD	c01, a1, c01
763	LDF	[C4 + 0 * SIZE], a1
764
765	FMUL	c14, ALPHA, c14
766	add	C2, 4 * SIZE, C2
767	FADD	c02, a2, c02
768	LDF	[C4 + 1 * SIZE], a2
769
770	FMUL	c15, ALPHA, c15
771	add	C3, 4 * SIZE, C3
772	FADD	c03, a3, c03
773	LDF	[C4 + 2 * SIZE], a3
774
775	FMUL	c16, ALPHA, c16
776	nop
777	FADD	c04, a4, c04
778	LDF	[C4 + 3 * SIZE], a4
779
780	STF	c01, [C1 - 4 * SIZE]
781	FADD	c05, b1, c05
782	STF	c02, [C1 - 3 * SIZE]
783	FADD	c06, b2, c06
784
785	STF	c03, [C1 - 2 * SIZE]
786	FADD	c07, b3, c07
787	STF	c04, [C1 - 1 * SIZE]
788	FADD	c08, b4, c08
789
790	STF	c05, [C2 - 4 * SIZE]
791	FADD	c09, t1, c09
792	STF	c06, [C2 - 3 * SIZE]
793	FADD	c10, t2, c10
794
795	STF	c07, [C2 - 2 * SIZE]
796	FADD	c11, t3, c11
797	STF	c08, [C2 - 1 * SIZE]
798	FADD	c12, t4, c12
799
800	STF	c09, [C3 - 4 * SIZE]
801	FADD	c13, a1, c13
802	STF	c10, [C3 - 3 * SIZE]
803	FADD	c14, a2, c14
804
805	STF	c11, [C3 - 2 * SIZE]
806	FADD	c15, a3, c15
807	STF	c12, [C3 - 1 * SIZE]
808	FADD	c16, a4, c16
809
810	STF	c13, [C4 + 0 * SIZE]
811	FMOV	FZERO, t1
812	STF	c14, [C4 + 1 * SIZE]
813	FMOV	FZERO, t2
814
815	STF	c15, [C4 + 2 * SIZE]
816	FMOV	FZERO, t3
817	STF	c16, [C4 + 3 * SIZE]
818	FMOV	FZERO, t4
819
820	add	C4, 4 * SIZE, C4
821#else
822
823	FADD	c04, t1, c04
824	FMUL	c01, ALPHA, c01
825	FADD	c08, t2, c08
826	FMUL	c02, ALPHA, c02
827	FADD	c12, t3, c12
828	FMUL	c03, ALPHA, c03
829	FADD	c16, t4, c16
830	FMUL	c04, ALPHA, c04
831
832	STF	c01, [C1 + 0 * SIZE]
833	FMUL	c05, ALPHA, c05
834	STF	c02, [C1 + 1 * SIZE]
835	FMUL	c06, ALPHA, c06
836	STF	c03, [C1 + 2 * SIZE]
837	FMUL	c07, ALPHA, c07
838	STF	c04, [C1 + 3 * SIZE]
839	FMUL	c08, ALPHA, c08
840
841	STF	c05, [C2 + 0 * SIZE]
842	FMUL	c09, ALPHA, c09
843	STF	c06, [C2 + 1 * SIZE]
844	FMUL	c10, ALPHA, c10
845	STF	c07, [C2 + 2 * SIZE]
846	FMUL	c11, ALPHA, c11
847	STF	c08, [C2 + 3 * SIZE]
848	FMUL	c12, ALPHA, c12
849
850	STF	c09, [C3 + 0 * SIZE]
851	FMUL	c13, ALPHA, c13
852	STF	c10, [C3 + 1 * SIZE]
853	FMUL	c14, ALPHA, c14
854	STF	c11, [C3 + 2 * SIZE]
855	FMUL	c15, ALPHA, c15
856	STF	c12, [C3 + 3 * SIZE]
857	FMUL	c16, ALPHA, c16
858
859	STF	c13, [C4 + 0 * SIZE]
860	STF	c14, [C4 + 1 * SIZE]
861	STF	c15, [C4 + 2 * SIZE]
862	STF	c16, [C4 + 3 * SIZE]
863
864	FMOV	FZERO, t1
865	FMOV	FZERO, t2
866	FMOV	FZERO, t3
867	FMOV	FZERO, t4
868
869	add	C1, 4 * SIZE, C1
870	add	C2, 4 * SIZE, C2
871	add	C3, 4 * SIZE, C3
872	add	C4, 4 * SIZE, C4
873
874#if ( defined(LEFT) &&  defined(TRANSA)) || \
875    (!defined(LEFT) && !defined(TRANSA))
876	sub	K, KK, TEMP1
877#ifdef LEFT
878	add	TEMP1, -4, TEMP1
879#else
880	add	TEMP1, -4, TEMP1
881#endif
882	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
883
884	add	AO, TEMP1, AO
885	add	BO, TEMP1, BO
886#endif
887
888#ifdef LEFT
889	add	KK, 4, KK
890#endif
891
892	add	I, -1, I
893	cmp	I, 0
894
895#endif
896
897	sra	K, 2, L
898	bg,pt	%icc, .LL21
899	FMOV	FZERO, c01
900
/*
 * .LL50 -- "M & 2" tile: two elements along M against the four C lines.
 *
 * Skips to .LL70 when bit 1 of M is clear.  Otherwise clears c01..c08 and
 * t1..t4, sets BO, preloads a1..a4 / b1..b4 and computes L (K steps / 4).
 * a1/a2 belong to one K step and a3/a4 to the next, since the loops below
 * consume two A elements per K step.
 */
901.LL50:
902	and	M, 2, I
903	FMOV	FZERO, c02
904	cmp	I, 0
905
906	FMOV	FZERO, t1
/* Delay slot: c04 is cleared on both paths. */
907	ble,pn	%icc, .LL70
908	FMOV	FZERO, c04
909
910#if !defined(TRMMKERNEL)
/* GEMM build: B is read directly while BO is being initialised from it. */
911	LDF	[AO + 0 * SIZE], a1
912	sra	K, 2, L
913	FMOV	FZERO, t2
914	LDF	[B  + 0 * SIZE], b1
915	mov	B, BO
916	FMOV	FZERO, c06
917	LDF	[AO + 1 * SIZE], a2
918	cmp	L,  0
919	FMOV	FZERO, t3
920	LDF	[B  + 1 * SIZE], b2
921	FMOV	FZERO, c08
922	LDF	[AO + 2 * SIZE], a3
923	FMOV	FZERO, t4
924	LDF	[B  + 2 * SIZE], b3
925	FMOV	FZERO, c01
926	LDF	[AO + 3 * SIZE], a4
927	FMOV	FZERO, c03
928	LDF	[B  + 3 * SIZE], b4
929	FMOV	FZERO, c05
930#else
931
932#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
933	mov	B, BO
934#else
/* Skip KK steps: A by KK << (1 + BASE_SHIFT), B by KK << (2 + BASE_SHIFT). */
935	sll	KK, 1 + BASE_SHIFT, TEMP1
936	sll	KK, 2 + BASE_SHIFT, TEMP2
937
938	add	AO, TEMP1, AO
939	add	B,  TEMP2, BO
940#endif
941
/* TRMM trip count: K - KK, or KK + 2 (LEFT) / KK + 4 (otherwise). */
942#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
943	sub	K, KK, L
944#elif defined(LEFT)
945	add	KK, 2, L
946#else
947	add	KK, 4, L
948#endif
949	sra	L, 2, L
950	cmp	L,  0
951
952	LDF	[AO + 0 * SIZE], a1
953	FMOV	FZERO, t2
954	LDF	[BO + 0 * SIZE], b1
955	FMOV	FZERO, c06
956
957	LDF	[AO + 1 * SIZE], a2
958	FMOV	FZERO, t3
959	LDF	[BO + 1 * SIZE], b2
960	FMOV	FZERO, c08
961
962	LDF	[AO + 2 * SIZE], a3
963	FMOV	FZERO, t4
964	LDF	[BO + 2 * SIZE], b3
965	FMOV	FZERO, c01
966
967	LDF	[AO + 3 * SIZE], a4
968	FMOV	FZERO, c03
969	LDF	[BO + 3 * SIZE], b4
970	FMOV	FZERO, c05
971
972#endif
/* Uses the condition codes of the last "cmp L, 0"; c07 is cleared in the
   delay slot on both paths. */
973	ble,pn	%icc, .LL55
974	FMOV	FZERO, c07
975
975
/*
 * .LL52 -- unrolled K loop of the "M & 2" tile.
 *
 * Each pass advances AO by 8 * SIZE and BO by 16 * SIZE (four K steps of
 * two A and four B elements) and decrements L once.  Products are written
 * to t1..t4 and folded into c01..c08 by the FADDs that follow, so four
 * products remain pending when the loop exits.
 */
976.LL52:
977	FADD	c02, t1, c02
978	add	AO,  8 * SIZE, AO
979	prefetch [AO + APREFETCHSIZE * SIZE], 0
980
981	FMUL	a1, b1, t1
982	add	BO, 16 * SIZE, BO
983
984	FADD	c04, t2, c04
985	add	L, -1, L
986	FMUL	a1, b2, t2
987
988	FADD	c06, t3, c06
989	cmp	L, 0
990	FMUL	a1, b3, t3
991
992	FADD	c08, t4, c08
993	FMUL	a1, b4, t4
994	LDF	[AO -  4 * SIZE], a1
995
996	FADD	c01, t1, c01
997	FMUL	a2, b1, t1
998	LDF	[BO - 12 * SIZE], b1
999	FADD	c03, t2, c03
1000	FMUL	a2, b2, t2
1001	LDF	[BO - 11 * SIZE], b2
1002
1003	FADD	c05, t3, c05
1004	FMUL	a2, b3, t3
1005	LDF	[BO - 10 * SIZE], b3
1006	FADD	c07, t4, c07
1007	FMUL	a2, b4, t4
1008	LDF	[BO -  9 * SIZE], b4
1009
1010	FADD	c02, t1, c02
1011	FMUL	a3, b1, t1
1012	LDF	[AO -  3 * SIZE], a2
1013	FADD	c04, t2, c04
1014	FMUL	a3, b2, t2
1015
1016	FADD	c06, t3, c06
1017	FMUL	a3, b3, t3
1018	FADD	c08, t4, c08
1019	FMUL	a3, b4, t4
1020	LDF	[AO -  2 * SIZE], a3
1021
1022	FADD	c01, t1, c01
1023	FMUL	a4, b1, t1
1024	LDF	[BO -  8 * SIZE], b1
1025	FADD	c03, t2, c03
1026	FMUL	a4, b2, t2
1027	LDF	[BO -  7 * SIZE], b2
1028
1029	FADD	c05, t3, c05
1030	FMUL	a4, b3, t3
1031	LDF	[BO -  6 * SIZE], b3
1032	FADD	c07, t4, c07
1033	FMUL	a4, b4, t4
1034	LDF	[BO -  5 * SIZE], b4
1035
1036	FADD	c02, t1, c02
1037	FMUL	a1, b1, t1
1038	LDF	[AO -  1 * SIZE], a4
1039	FADD	c04, t2, c04
1040	FMUL	a1, b2, t2
1041
1042	FADD	c06, t3, c06
1043	FMUL	a1, b3, t3
1044	FADD	c08, t4, c08
1045	FMUL	a1, b4, t4
1046	LDF	[AO +  0 * SIZE], a1
1047
1048	FADD	c01, t1, c01
1049	FMUL	a2, b1, t1
1050	LDF	[BO -  4 * SIZE], b1
1051
1052	FADD	c03, t2, c03
1053	FMUL	a2, b2, t2
1054	LDF	[BO -  3 * SIZE], b2
1055
1056	FADD	c05, t3, c05
1057	FMUL	a2, b3, t3
1058	LDF	[BO -  2 * SIZE], b3
1059	FADD	c07, t4, c07
1060	FMUL	a2, b4, t4
1061	LDF	[BO -  1 * SIZE], b4
1062
1063	FADD	c02, t1, c02
1064	FMUL	a3, b1, t1
1065	LDF	[AO +  1 * SIZE], a2
1066	FADD	c04, t2, c04
1067	FMUL	a3, b2, t2
1068
1069	FADD	c06, t3, c06
1070	FMUL	a3, b3, t3
1071	FADD	c08, t4, c08
1072	FMUL	a3, b4, t4
1073	LDF	[AO +  2 * SIZE], a3
1074
1075	FADD	c01, t1, c01
1076	FMUL	a4, b1, t1
1077	LDF	[BO +  0 * SIZE], b1
1078	FADD	c03, t2, c03
1079	FMUL	a4, b2, t2
1080	LDF	[BO +  1 * SIZE], b2
1081
1082	FADD	c05, t3, c05
1083	FMUL	a4, b3, t3
1084	LDF	[BO +  2 * SIZE], b3
1085	FADD	c07, t4, c07
1086	FMUL	a4, b4, t4
1087	LDF	[BO +  3 * SIZE], b4
1088
1089	bg,pt	%icc, .LL52
1090	LDF	[AO +  3 * SIZE], a4
1091
/*
 * .LL55 -- K remainder for the "M & 2" tile.
 *
 * L becomes the low two bits of the K trip count (K, or the TRMM count
 * K - KK / KK + 2 / KK + 4).  When it is zero, control goes to .LL59.
 */
1092.LL55:
1093#ifndef TRMMKERNEL
1094	and	K, 3, L
1095#else
1096#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1097	sub	K, KK, L
1098#elif defined(LEFT)
1099	add	KK, 2, L
1100#else
1101	add	KK, 4, L
1102#endif
1103	and	L, 3,  L
1104#endif
1105	cmp	L,  0
/* Annulled branch: the nop only executes when the branch is taken. */
1106	ble,a,pn %icc, .LL59
1107	nop
1108
/*
 * .LL56 -- one K step per pass: AO advances by 2 * SIZE, BO by 4 * SIZE.
 * a1 feeds c02/c04/c06/c08 and a2 feeds c01/c03/c05/c07, each through the
 * pending products in t1..t4.
 */
1109.LL56:
1110	FADD	c02, t1, c02
1111	add	AO, 2 * SIZE, AO
1112	FMUL	a1, b1, t1
1113	add	L, -1, L
1114
1115	add	BO, 4 * SIZE, BO
1116	FADD	c04, t2, c04
/* Sets the condition codes for the bg at the bottom of the loop. */
1117	cmp	L, 0
1118	FMUL	a1, b2, t2
1119
1120	FADD	c06, t3, c06
1121	FMUL	a1, b3, t3
1122	FADD	c08, t4, c08
1123	FMUL	a1, b4, t4
1124	LDF	[AO + 0 * SIZE], a1
1125
1126	FADD	c01, t1, c01
1127	FMUL	a2, b1, t1
1128	LDF	[BO + 0 * SIZE], b1
1129	FADD	c03, t2, c03
1130	FMUL	a2, b2, t2
1131	LDF	[BO + 1 * SIZE], b2
1132
1133	FADD	c05, t3, c05
1134	FMUL	a2, b3, t3
1135	LDF	[BO + 2 * SIZE], b3
1136	FADD	c07, t4, c07
1137	FMUL	a2, b4, t4
1138	LDF	[BO + 3 * SIZE], b4
1139
/* Delay slot: a2 is reloaded on both paths. */
1140	bg,pt	%icc, .LL56
1141	LDF	[AO + 1 * SIZE], a2
1142
1142
/*
 * .LL59 -- epilogue of the "M & 2" tile.
 *
 * Folds the pending products t1..t4 into c02/c04/c06/c08, multiplies
 * c01..c08 by ALPHA and writes two elements to each of C1..C4 (the odd
 * accumulators at offset 0, the even ones at offset 1).  The GEMM build
 * adds the values already in C first; the TRMM build stores directly and
 * then adjusts AO/BO and KK according to LEFT/TRANSA.  C1..C4 advance by
 * 2 * SIZE in both builds.
 */
1143.LL59:
1144#ifndef TRMMKERNEL
1145	FADD	c02, t1, c02
1146	FMUL	c01, ALPHA, c01
1147	LDF	[C1 + 0 * SIZE], a1
1148	FADD	c04, t2, c04
1149	FMUL	c03, ALPHA, c03
1150	LDF	[C1 + 1 * SIZE], a2
1151	FADD	c06, t3, c06
1152	FMUL	c05, ALPHA, c05
1153	LDF	[C2 + 0 * SIZE], a3
1154	FADD	c08, t4, c08
1155	FMUL	c07, ALPHA, c07
1156	LDF	[C2 + 1 * SIZE], a4
1157
1158	FMUL	c02, ALPHA, c02
1159	FADD	c01, a1, c01
1160	LDF	[C3 + 0 * SIZE], b1
1161
1162	FMUL	c04, ALPHA, c04
1163	FADD	c02, a2, c02
1164	LDF	[C3 + 1 * SIZE], b2
1165
1166	FMUL	c06, ALPHA, c06
1167	FADD	c03, a3, c03
1168	LDF	[C4 + 0 * SIZE], b3
1169
1170	FMUL	c08, ALPHA, c08
1171	FADD	c04, a4, c04
1172	LDF	[C4 + 1 * SIZE], b4
1173
1174	STF	c01, [C1 + 0 * SIZE]
1175	FADD	c05, b1, c05
1176	STF	c02, [C1 + 1 * SIZE]
1177	FADD	c06, b2, c06
1178	add	C1, 2 * SIZE, C1
1179
1180	STF	c03, [C2 + 0 * SIZE]
1181	FADD	c07, b3, c07
1182	STF	c04, [C2 + 1 * SIZE]
1183	FADD	c08, b4, c08
1184	add	C2, 2 * SIZE, C2
1185
1186	STF	c05, [C3 + 0 * SIZE]
1187	STF	c06, [C3 + 1 * SIZE]
1188	add	C3, 2 * SIZE, C3
1189
1190	STF	c07, [C4 + 0 * SIZE]
1191	STF	c08, [C4 + 1 * SIZE]
1192	add	C4, 2 * SIZE, C4
1193#else
1194
1195	FADD	c02, t1, c02
1196	FADD	c04, t2, c04
1197	FADD	c06, t3, c06
1198	FADD	c08, t4, c08
1199
1200	FMUL	c01, ALPHA, c01
1201	FMUL	c03, ALPHA, c03
1202	FMUL	c05, ALPHA, c05
1203	FMUL	c07, ALPHA, c07
1204
1205	FMUL	c02, ALPHA, c02
1206	FMUL	c04, ALPHA, c04
1207	FMUL	c06, ALPHA, c06
1208	FMUL	c08, ALPHA, c08
1209
1210	STF	c01, [C1 + 0 * SIZE]
1211	STF	c02, [C1 + 1 * SIZE]
1212
1213	STF	c03, [C2 + 0 * SIZE]
1214	STF	c04, [C2 + 1 * SIZE]
1215
1216	STF	c05, [C3 + 0 * SIZE]
1217	STF	c06, [C3 + 1 * SIZE]
1218
1219	STF	c07, [C4 + 0 * SIZE]
1220	STF	c08, [C4 + 1 * SIZE]
1221
1222	add	C1, 2 * SIZE, C1
1223	add	C2, 2 * SIZE, C2
1224	add	C3, 2 * SIZE, C3
1225	add	C4, 2 * SIZE, C4
1226
1227#if ( defined(LEFT) &&  defined(TRANSA)) || \
1228    (!defined(LEFT) && !defined(TRANSA))
1229	sub	K, KK, TEMP1
1230#ifdef LEFT
1231	add	TEMP1, -2, TEMP1
1232#else
1233	add	TEMP1, -4, TEMP1
1234#endif
1235	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
1236	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
1237
1238	add	AO, TEMP2, AO
1239	add	BO, TEMP1, BO
1240#endif
1241
1242#ifdef LEFT
1243	add	KK, 2, KK
1244#endif
1245#endif
1246
/*
 * .LL70 -- "M & 1" tile: a single element along M against the four C lines.
 * When bit 0 of M is clear, control goes to the end of the panel (.LL99).
 */
1247.LL70:
1248	and	M, 1, I
1249	cmp	I, 0
1250	ble,pn	%icc, .LL99
1251	nop
1252
/*
 * .LL71 -- set-up for that tile: clears c01..c04 and t1..t4, sets BO,
 * preloads a1..a4 (four consecutive K steps of A) and b1..b4, and computes
 * L before entering the unrolled loop or skipping to .LL75.
 */
1253.LL71:
1254#if !defined(TRMMKERNEL)
1255	LDF	[AO + 0 * SIZE], a1
1256	sra	K, 2, L
1257	FMOV	FZERO, c01
1258	LDF	[B  + 0 * SIZE], b1
1259	mov	B, BO
1260	FMOV	FZERO, t1
1261 	LDF	[AO + 1 * SIZE], a2
1262	cmp	L,  0
1263	FMOV	FZERO, c02
1264	LDF	[B  + 1 * SIZE], b2
1265	FMOV	FZERO, t2
1266	LDF	[AO + 2 * SIZE], a3
1267	FMOV	FZERO, c03
1268	LDF	[B  + 2 * SIZE], b3
1269	FMOV	FZERO, t3
1270	LDF	[AO + 3 * SIZE], a4
1271	FMOV	FZERO, c04
1272	LDF	[B  + 3 * SIZE], b4
1273	FMOV	FZERO, t4
1274#else
1275
1276#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1277	mov	B, BO
1278#else
1279	sll	KK, 0 + BASE_SHIFT, TEMP1
1280	sll	KK, 2 + BASE_SHIFT, TEMP2
1281
1282	add	AO, TEMP1, AO
1283	add	B,  TEMP2, BO
1284#endif
1285
1286#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1287	sub	K, KK, L
1288#elif defined(LEFT)
1289	add	KK, 1, L
1290#else
1291	add	KK, 4, L
1292#endif
1293	sra	L, 2, L
1294	cmp	L,  0
1295
1296	LDF	[AO + 0 * SIZE], a1
1297	FMOV	FZERO, c01
1298	LDF	[BO  + 0 * SIZE], b1
1299	FMOV	FZERO, t1
1300
1301 	LDF	[AO + 1 * SIZE], a2
1302	FMOV	FZERO, c02
1303	LDF	[BO  + 1 * SIZE], b2
1304	FMOV	FZERO, t2
1305
1306	LDF	[AO + 2 * SIZE], a3
1307	FMOV	FZERO, c03
1308	LDF	[BO  + 2 * SIZE], b3
1309	FMOV	FZERO, t3
1310
1311	LDF	[AO + 3 * SIZE], a4
1312	FMOV	FZERO, c04
1313	LDF	[BO  + 3 * SIZE], b4
1314	FMOV	FZERO, t4
1315#endif
1316
1317	ble,pn	%icc, .LL75
1318	nop
1319
/*
 * .LL72 -- unrolled K loop of the "M & 1" tile.
 *
 * Each pass consumes a1, a2, a3 and a4 in turn against four B elements
 * each, then advances AO by 4 * SIZE and BO by 16 * SIZE.  B is read with
 * positive offsets (4..19) because BO is only advanced at the bottom of
 * the loop.
 */
1320.LL72:
1321	FADD	c01, t1, c01
1322	add	L, -1, L
1323	FMUL	a1, b1, t1
1324	LDF	[BO + 4 * SIZE], b1
1325
1326	FADD	c02, t2, c02
1327	cmp	L, 0
1328	FMUL	a1, b2, t2
1329	LDF	[BO + 5 * SIZE], b2
1330
1331	FADD	c03, t3, c03
1332	FMUL	a1, b3, t3
1333	LDF	[BO + 6 * SIZE], b3
1334
1335	FADD	c04, t4, c04
1336	FMUL	a1, b4, t4
1337	LDF	[BO + 7 * SIZE], b4
1338	LDF	[AO +  4 * SIZE], a1
1339
1340	FADD	c01, t1, c01
1341	add	AO,  4 * SIZE, AO
1342	FMUL	a2, b1, t1
1343	LDF	[BO +  8 * SIZE], b1
1344
1345	FADD	c02, t2, c02
1346	FMUL	a2, b2, t2
1347	LDF	[BO +  9 * SIZE], b2
1348
1349	FADD	c03, t3, c03
1350	FMUL	a2, b3, t3
1351	LDF	[BO + 10 * SIZE], b3
1352
1353	FADD	c04, t4, c04
1354	FMUL	a2, b4, t4
1355	LDF	[BO + 11 * SIZE], b4
1356	LDF	[AO +  1 * SIZE], a2
1357
1358	FADD	c01, t1, c01
1359	FMUL	a3, b1, t1
1360	LDF	[BO + 12 * SIZE], b1
1361
1362	FADD	c02, t2, c02
1363	FMUL	a3, b2, t2
1364	LDF	[BO + 13 * SIZE], b2
1365
1366	FADD	c03, t3, c03
1367	FMUL	a3, b3, t3
1368	LDF	[BO + 14 * SIZE], b3
1369
1370	FADD	c04, t4, c04
1371	FMUL	a3, b4, t4
1372	LDF	[BO + 15 * SIZE], b4
1373	LDF	[AO +  2 * SIZE], a3
1374
1375	FADD	c01, t1, c01
1376	FMUL	a4, b1, t1
1377	LDF	[BO + 16 * SIZE], b1
1378
1379	FADD	c02, t2, c02
1380	FMUL	a4, b2, t2
1381	LDF	[BO + 17 * SIZE], b2
1382
1383	FADD	c03, t3, c03
1384	FMUL	a4, b3, t3
1385	LDF	[BO + 18 * SIZE], b3
1386
1387	FADD	c04, t4, c04
1388	FMUL	a4, b4, t4
1389	LDF	[BO + 19 * SIZE], b4
1390
1391	add	BO, 16 * SIZE, BO
1392	bg,pt	%icc, .LL72
1393	LDF	[AO +  3 * SIZE], a4
1394
/*
 * .LL75 -- K remainder for the "M & 1" tile.
 *
 * L becomes the low two bits of the K trip count (K, or the TRMM count
 * K - KK / KK + 1 / KK + 4).  When it is zero, control goes to .LL79.
 */
1395.LL75:
1396#ifndef TRMMKERNEL
1397	and	K, 3, L
1398#else
1399#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1400	sub	K, KK, L
1401#elif defined(LEFT)
1402	add	KK, 1, L
1403#else
1404	add	KK, 4, L
1405#endif
1406	and	L, 3,  L
1407#endif
1408	cmp	L,  0
/* Annulled branch: the nop only executes when the branch is taken. */
1409	ble,a,pn %icc, .LL79
1410	nop
1411
/*
 * .LL76 -- one K step per pass: a1 is multiplied by b1..b4, AO advances by
 * 1 * SIZE and BO by 4 * SIZE.  b1..b3 are reloaded at offsets 4..6 before
 * BO is advanced; a1 and b4 are reloaded afterwards at offsets 0 and 3.
 */
1412.LL76:
1413	FADD	c01, t1, c01
1414	add	AO, 1 * SIZE, AO
1415	FMUL	a1, b1, t1
1416	LDF	[BO + 4 * SIZE], b1
1417
1418	FADD	c02, t2, c02
1419	add	L, -1, L
1420	FMUL	a1, b2, t2
1421	LDF	[BO + 5 * SIZE], b2
1422
1423	FADD	c03, t3, c03
/* Sets the condition codes for the bg at the bottom of the loop. */
1424	cmp	L, 0
1425	FMUL	a1, b3, t3
1426	LDF	[BO + 6 * SIZE], b3
1427
1428	FADD	c04, t4, c04
1429	add	BO, 4 * SIZE, BO
1430	FMUL	a1, b4, t4
1431	LDF	[AO + 0 * SIZE], a1
1432
/* Delay slot: b4 is reloaded on both paths. */
1433	bg,pt	%icc, .LL76
1434	LDF	[BO + 3 * SIZE], b4
1435
1436
/*
 * .LL79 -- epilogue of the "M & 1" tile.
 *
 * Folds the pending products t1..t4 into c01..c04, multiplies them by ALPHA
 * and writes one element to each of C1..C4.  C1..C4 are not advanced here.
 */
1437.LL79:
1438#ifndef TRMMKERNEL
/* GEMM build: load the current C values, then result = acc * ALPHA + C. */
1439	FADD	c01, t1, c01
1440	LDF	[C1 + 0 * SIZE], a1
1441	FADD	c02, t2, c02
1442	LDF	[C2 + 0 * SIZE], a2
1443	FADD	c03, t3, c03
1444	LDF	[C3 + 0 * SIZE], a3
1445	FADD	c04, t4, c04
1446	LDF	[C4 + 0 * SIZE], a4
1447
1448	FMUL	c01, ALPHA, c01
1449	FMUL	c02, ALPHA, c02
1450	FMUL	c03, ALPHA, c03
1451	FMUL	c04, ALPHA, c04
1452
1453	FADD	c01, a1, c01
1454	FADD	c02, a2, c02
1455	FADD	c03, a3, c03
1456	FADD	c04, a4, c04
1457
1458	STF	c01, [C1 + 0 * SIZE]
1459	STF	c02, [C2 + 0 * SIZE]
1460	STF	c03, [C3 + 0 * SIZE]
1461	STF	c04, [C4 + 0 * SIZE]
1462#else
/* TRMM build: result = acc * ALPHA; C is overwritten, not accumulated. */
1463	FADD	c01, t1, c01
1464	FADD	c02, t2, c02
1465	FADD	c03, t3, c03
1466	FADD	c04, t4, c04
1467
1468	FMUL	c01, ALPHA, c01
1469	FMUL	c02, ALPHA, c02
1470	FMUL	c03, ALPHA, c03
1471	FMUL	c04, ALPHA, c04
1472
1473	STF	c01, [C1 + 0 * SIZE]
1474	STF	c02, [C2 + 0 * SIZE]
1475	STF	c03, [C3 + 0 * SIZE]
1476	STF	c04, [C4 + 0 * SIZE]
1477
/* Advance AO/BO over the K steps this tile did not visit:
   TEMP1 = K - KK - (1 if LEFT, else 4); AO moves by TEMP1 << BASE_SHIFT and
   BO by TEMP1 << (2 + BASE_SHIFT). */
1478#if ( defined(LEFT) &&  defined(TRANSA)) || \
1479    (!defined(LEFT) && !defined(TRANSA))
1480	sub	K, KK, TEMP1
1481#ifdef LEFT
1482	add	TEMP1, -1, TEMP1
1483#else
1484	add	TEMP1, -4, TEMP1
1485#endif
1486	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
1487	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
1488
1489	add	AO, TEMP2, AO
1490	add	BO, TEMP1, BO
1491#endif
1492
1493#ifdef LEFT
/* LEFT: KK grows by the tile extent along M (1). */
1494	add	KK, 1, KK
1495#endif
1496#endif
1497
/*
 * .LL99 -- bottom of the loop over J.
 *
 * Decrements J, makes B continue from where BO stopped, and loops back to
 * .LL11 while J > 0.  The instruction after the branch is its delay slot
 * and executes on both paths: for TRMM without LEFT it advances KK by 4,
 * otherwise it is a nop.
 */
1498.LL99:
1499	add	J, -1, J
1500	mov	BO, B
1501	cmp	J, 0
1502	bg,pt	%icc, .LL11
1503#if defined(TRMMKERNEL) && !defined(LEFT)
1504	add	KK, 4, KK
1505#else
1506	nop
1507#endif
1508
/*
 * .LL100 -- panel for bit 1 of N (two LDC-strided lines of C: C1 and C2).
 *
 * I = M >> 2 and J = N & 2.  When J is zero, control goes to .LL200;
 * otherwise C1 = C, C2 = C + LDC and C advances by 2 * LDC.  AO is reset to
 * A in the delay slot of the first branch (so on both paths).  For TRMM
 * with LEFT, KK restarts at OFFSET.  When I <= 0 control goes to .LL150.
 */
1509.LL100:  /* n & 2 */
1510	sra	M, 2, I
1511	and	N, 2, J
1512
1513	cmp	J, 0
1514	add	C, LDC, C2
1515	ble,pn	%icc, .LL200
1516	mov	A, AO
1517
1518#if defined(TRMMKERNEL) &&  defined(LEFT)
1519	mov	OFFSET, KK
1520#endif
1521
1522	mov	C, C1
1523	add	C2, LDC, C
1524
1525	cmp	I, 0
/* Delay slot: c03 is cleared on both paths. */
1526	ble,pn	%icc, .LL150
1527	FMOV	FZERO, c03
1528
1528
/*
 * .LL121 -- set-up for one tile of the "N & 2" panel: four elements along M
 * (a1..a4) against two B elements per K step, accumulated in c01..c08.
 *
 * Clears the accumulators and t1..t4 (c03 is cleared in the delay slot of
 * the branch that leads here), sets BO, preloads a1..a4 / b1..b4, computes
 * L (K steps / 4) and prefetches C1 and C2.  When L <= 0 the unrolled loop
 * .LL122 is skipped in favour of .LL125.
 */
1529.LL121:
1530#if !defined(TRMMKERNEL)
1531	LDF	[AO + 0 * SIZE], a1
1532	sra	K, 2, L
1533	FMOV	FZERO, t1
1534	LDF	[B  + 0 * SIZE], b1
1535	mov	B, BO
1536	FMOV	FZERO, c07
1537
1538	LDF	[AO + 1 * SIZE], a2
1539	cmp	L,  0
1540	FMOV	FZERO, t2
1541	LDF	[B  + 1 * SIZE], b2
1542	FMOV	FZERO, c04
1543
1544	LDF	[AO + 2 * SIZE], a3
1545	FMOV	FZERO, t3
1546	LDF	[B  + 2 * SIZE], b3
1547	FMOV	FZERO, c08
1548
1549	LDF	[AO + 3 * SIZE], a4
1550	FMOV	FZERO, t4
1551	LDF	[B  + 3 * SIZE], b4
1552	FMOV	FZERO, c01
1553
1554	prefetch [C1 + 3 * SIZE], 2
1555	FMOV	FZERO, c05
1556	prefetch [C2 + 3 * SIZE], 2
1557	FMOV	FZERO, c02
1558#else
1559
1560#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1561	mov	B, BO
1562#else
1563	sll	KK, 2 + BASE_SHIFT, TEMP1
1564	sll	KK, 1 + BASE_SHIFT, TEMP2
1565
1566	add	AO, TEMP1, AO
1567	add	B,  TEMP2, BO
1568#endif
1569
1570#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1571	sub	K, KK, L
1572#elif defined(LEFT)
1573	add	KK, 4, L
1574#else
1575	add	KK, 2, L
1576#endif
1577	sra	L, 2, L
1578	cmp	L,  0
1579
1580	LDF	[AO + 0 * SIZE], a1
1581	FMOV	FZERO, t1
1582	LDF	[BO + 0 * SIZE], b1
1583	FMOV	FZERO, c07
1584
1585	LDF	[AO + 1 * SIZE], a2
1586	FMOV	FZERO, t2
1587	LDF	[BO + 1 * SIZE], b2
1588	FMOV	FZERO, c04
1589
1590	LDF	[AO + 2 * SIZE], a3
1591	FMOV	FZERO, t3
1592	LDF	[BO + 2 * SIZE], b3
1593	FMOV	FZERO, c08
1594
1595	LDF	[AO + 3 * SIZE], a4
1596	FMOV	FZERO, t4
1597	LDF	[BO + 3 * SIZE], b4
1598	FMOV	FZERO, c01
1599
1600	prefetch [C1 + 3 * SIZE], 2
1601	FMOV	FZERO, c05
1602	prefetch [C2 + 3 * SIZE], 2
1603	FMOV	FZERO, c02
1604#endif
1605
1606	ble,pn	%icc, .LL125
1607	FMOV	FZERO, c06
1608
1609.LL122:
1610	FADD	c03, t1, c03
1611	add	L, -1, L
1612	FMUL	a1, b1, t1
1613	prefetch [AO + APREFETCHSIZE * SIZE], 0
1614
1615	FADD	c07, t2, c07
1616	add	BO,  8 * SIZE, BO
1617	FMUL	a1, b2, t2
1618	LDF	[AO + 4 * SIZE], a1
1619
1620	FADD	c04, t3, c04
1621	add	AO, 16 * SIZE, AO
1622	FMUL	a2, b1, t3
1623	cmp	L,  0
1624
1625	FADD	c08, t4, c08
1626	nop
1627	FMUL	a2, b2, t4
1628	LDF	[AO - 11 * SIZE], a2
1629
1630	FADD	c01, t1, c01
1631	nop
1632	FMUL	a3, b1, t1
1633	nop
1634
1635	FADD	c05, t2, c05
1636	nop
1637	FMUL	a3, b2, t2
1638	LDF	[AO - 10 * SIZE], a3
1639
1640	FADD	c02, t3, c02
1641	nop
1642	FMUL	a4, b1, t3
1643	LDF	[BO -  4 * SIZE], b1
1644
1645	FADD	c06, t4, c06
1646	nop
1647	FMUL	a4, b2, t4
1648	LDF	[BO -  3 * SIZE], b2
1649
1650	FADD	c03, t1, c03
1651	nop
1652	FMUL	a1, b3, t1
1653	LDF	[AO -  9 * SIZE], a4
1654
1655	FADD	c07, t2, c07
1656	nop
1657	FMUL	a1, b4, t2
1658	LDF	[AO -  8 * SIZE], a1
1659
1660	FADD	c04, t3, c04
1661	nop
1662	FMUL	a2, b3, t3
1663	nop
1664
1665	FADD	c08, t4, c08
1666	nop
1667	FMUL	a2, b4, t4
1668	LDF	[AO -  7 * SIZE], a2
1669
1670	FADD	c01, t1, c01
1671	nop
1672	FMUL	a3, b3, t1
1673	nop
1674
1675	FADD	c05, t2, c05
1676	nop
1677	FMUL	a3, b4, t2
1678	LDF	[AO -  6 * SIZE], a3
1679
1680	FADD	c02, t3, c02
1681	nop
1682	FMUL	a4, b3, t3
1683	LDF	[BO -  2 * SIZE], b3
1684
1685	FADD	c06, t4, c06
1686	nop
1687	FMUL	a4, b4, t4
1688	LDF	[BO -  1 * SIZE], b4
1689
1690	FADD	c03, t1, c03
1691	nop
1692	FMUL	a1, b1, t1
1693	LDF	[AO -  5 * SIZE], a4
1694
1695	FADD	c07, t2, c07
1696	nop
1697	FMUL	a1, b2, t2
1698	LDF	[AO -  4 * SIZE], a1
1699
1700	FADD	c04, t3, c04
1701	nop
1702	FMUL	a2, b1, t3
1703	nop
1704
1705	FADD	c08, t4, c08
1706	nop
1707	FMUL	a2, b2, t4
1708	LDF	[AO -  3 * SIZE], a2
1709
1710	FADD	c01, t1, c01
1711	nop
1712	FMUL	a3, b1, t1
1713	nop
1714
1715	FADD	c05, t2, c05
1716	nop
1717	FMUL	a3, b2, t2
1718	LDF	[AO -  2 * SIZE], a3
1719
1720	FADD	c02, t3, c02
1721	nop
1722	FMUL	a4, b1, t3
1723	LDF	[BO +  0 * SIZE], b1
1724
1725	FADD	c06, t4, c06
1726	nop
1727	FMUL	a4, b2, t4
1728	LDF	[BO +  1 * SIZE], b2
1729
1730	FADD	c03, t1, c03
1731	nop
1732	FMUL	a1, b3, t1
1733	LDF	[AO -  1 * SIZE], a4
1734
1735	FADD	c07, t2, c07
1736	nop
1737	FMUL	a1, b4, t2
1738	LDF	[AO +  0 * SIZE], a1
1739
1740	FADD	c04, t3, c04
1741	nop
1742	FMUL	a2, b3, t3
1743	nop
1744
1745	FADD	c08, t4, c08
1746	nop
1747	FMUL	a2, b4, t4
1748	LDF	[AO +  1 * SIZE], a2
1749
1750	FADD	c01, t1, c01
1751	nop
1752	FMUL	a3, b3, t1
1753	nop
1754
1755	FADD	c05, t2, c05
1756	nop
1757	FMUL	a3, b4, t2
1758	LDF	[AO +  2 * SIZE], a3
1759
1760	FADD	c02, t3, c02
1761	nop
1762	FMUL	a4, b3, t3
1763	LDF	[BO +  2 * SIZE], b3
1764
1765	FADD	c06, t4, c06
1766	FMUL	a4, b4, t4
1767	LDF	[AO +  3 * SIZE], a4
1768
1769	bg,pt	%icc, .LL122
1770	LDF	[BO +  3 * SIZE], b4
1771
1772.LL125:
1773#ifndef TRMMKERNEL
1774	and	K, 3, L
1775#else
1776#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1777	sub	K, KK, L
1778#elif defined(LEFT)
1779	add	KK, 4, L
1780#else
1781	add	KK, 2, L
1782#endif
1783	and	L, 3,  L
1784#endif
1785	cmp	L,  0
1786	ble,a,pn %icc, .LL129
1787	nop
1788
1789.LL126:
1790	FADD	c03, t1, c03
1791	add	AO, 4 * SIZE, AO
1792	FMUL	a1, b1, t1
1793	add	BO, 2 * SIZE, BO
1794
1795	FADD	c07, t2, c07
1796	add	L, -1, L
1797	FMUL	a1, b2, t2
1798	LDF	[AO + 0 * SIZE], a1
1799
1800	FADD	c04, t3, c04
1801	cmp	L, 0
1802	FMUL	a2, b1, t3
1803
1804	FADD	c08, t4, c08
1805	FMUL	a2, b2, t4
1806	LDF	[AO + 1 * SIZE], a2
1807
1808	FADD	c01, t1, c01
1809	FMUL	a3, b1, t1
1810	FADD	c05, t2, c05
1811	FMUL	a3, b2, t2
1812	LDF	[AO + 2 * SIZE], a3
1813
1814	FADD	c02, t3, c02
1815	FMUL	a4, b1, t3
1816	LDF	[BO + 0 * SIZE], b1
1817	FADD	c06, t4, c06
1818	FMUL	a4, b2, t4
1819	LDF	[BO + 1 * SIZE], b2
1820	bg,pt	%icc, .LL126
1821	LDF	[AO + 3 * SIZE], a4
1822
1823.LL129:
1824#ifndef TRMMKERNEL
1825	FADD	c03, t1, c03
1826	add	I, -1, I
1827	LDF	[C1 + 0 * SIZE], a1
1828	FADD	c07, t2, c07
1829	cmp	I, 0
1830	LDF	[C1 + 1 * SIZE], a2
1831	FADD	c04, t3, c04
1832	LDF	[C1 + 2 * SIZE], a3
1833	FADD	c08, t4, c08
1834	LDF	[C1 + 3 * SIZE], a4
1835
1836	LDF	[C2 + 0 * SIZE], b1
1837	FMUL	c01, ALPHA, c01
1838	LDF	[C2 + 1 * SIZE], b2
1839	FMUL	c02, ALPHA, c02
1840	LDF	[C2 + 2 * SIZE], b3
1841	FMUL	c03, ALPHA, c03
1842	LDF	[C2 + 3 * SIZE], b4
1843	FMUL	c04, ALPHA, c04
1844
1845	FMUL	c05, ALPHA, c05
1846	FADD	c01, a1, c01
1847	FMUL	c06, ALPHA, c06
1848	FADD	c02, a2, c02
1849	FMUL	c07, ALPHA, c07
1850	FADD	c03, a3, c03
1851	FMUL	c08, ALPHA, c08
1852	FADD	c04, a4, c04
1853
1854	STF	c01, [C1 + 0 * SIZE]
1855	FADD	c05, b1, c05
1856	STF	c02, [C1 + 1 * SIZE]
1857	FADD	c06, b2, c06
1858	STF	c03, [C1 + 2 * SIZE]
1859	FADD	c07, b3, c07
1860	STF	c04, [C1 + 3 * SIZE]
1861	add	C1, 4 * SIZE, C1
1862	FADD	c08, b4, c08
1863
1864	STF	c05, [C2 + 0 * SIZE]
1865	STF	c06, [C2 + 1 * SIZE]
1866	STF	c07, [C2 + 2 * SIZE]
1867	STF	c08, [C2 + 3 * SIZE]
1868	add	C2, 4 * SIZE, C2
1869#else
1870	FADD	c03, t1, c03
1871	FADD	c07, t2, c07
1872	FADD	c04, t3, c04
1873	FADD	c08, t4, c08
1874
1875	FMUL	c01, ALPHA, c01
1876	FMUL	c02, ALPHA, c02
1877	FMUL	c03, ALPHA, c03
1878	FMUL	c04, ALPHA, c04
1879
1880	FMUL	c05, ALPHA, c05
1881	FMUL	c06, ALPHA, c06
1882	FMUL	c07, ALPHA, c07
1883	FMUL	c08, ALPHA, c08
1884
1885	STF	c01, [C1 + 0 * SIZE]
1886	STF	c02, [C1 + 1 * SIZE]
1887	STF	c03, [C1 + 2 * SIZE]
1888	STF	c04, [C1 + 3 * SIZE]
1889
1890	STF	c05, [C2 + 0 * SIZE]
1891	STF	c06, [C2 + 1 * SIZE]
1892	STF	c07, [C2 + 2 * SIZE]
1893	STF	c08, [C2 + 3 * SIZE]
1894	add	C1, 4 * SIZE, C1
1895	add	C2, 4 * SIZE, C2
1896
1897#if ( defined(LEFT) &&  defined(TRANSA)) || \
1898    (!defined(LEFT) && !defined(TRANSA))
1899	sub	K, KK, TEMP1
1900#ifdef LEFT
1901	add	TEMP1, -4, TEMP1
1902#else
1903	add	TEMP1, -2, TEMP1
1904#endif
1905	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
1906	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
1907
1908	add	AO, TEMP2, AO
1909	add	BO, TEMP1, BO
1910#endif
1911
1912#ifdef LEFT
1913	add	KK, 4, KK
1914#endif
1915
1916	add	I, -1, I
1917	cmp	I, 0
1918#endif
1919
1920	bg,pt	%icc, .LL121
1921	FMOV	FZERO, c03
1922
/*
 * 2 x 2 tile of the N & 2 stage; runs once when bit 1 of M is set.
 *   c01 = sum a[0]*b[0] -> C1[0]     c02 = sum a[0]*b[1] -> C2[0]
 *   c03 = sum a[1]*b[0] -> C1[1]     c04 = sum a[1]*b[1] -> C2[1]
 * Each step consumes 2 elements from AO and 2 from BO.  As in the other
 * tiles, each FADD adds the product computed one round earlier; the
 * last products are added at .LL159.
 */
1923.LL150:
1924	and	M, 2, I
1925	cmp	I, 0
1926	ble,pn	%icc, .LL170
1927	nop
1928
1929.LL151:
1930#if !defined(TRMMKERNEL)
1931	LDF	[AO + 0 * SIZE], a1
1932	sra	K, 2, L		/* L = K >> 2 passes of the unrolled loop */
1933	FMOV	FZERO, c01
1934
1935	LDF	[B  + 0 * SIZE], b1
1936	mov	B, BO
1937	FMOV	FZERO, t1
1938
1939	LDF	[AO + 1 * SIZE], a2
1940	cmp	L,  0
1941	FMOV	FZERO, c02
1942	LDF	[B  + 1 * SIZE], b2
1943	FMOV	FZERO, t2
1944
1945	LDF	[AO + 2 * SIZE], a3
1946	FMOV	FZERO, c03
1947	LDF	[B  + 2 * SIZE], b3
1948	FMOV	FZERO, t3
1949
1950	LDF	[AO + 3 * SIZE], a4
1951	FMOV	FZERO, c04
1952	LDF	[B  + 3 * SIZE], b4
1953	FMOV	FZERO, t4
1954#else
1955
/* TRMM: start at the head of B, or skip KK steps (2 elements per step in
   both AO and BO). */
1956#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1957	mov	B, BO
1958#else
1959	sll	KK, 1 + BASE_SHIFT, TEMP1
1960	sll	KK, 1 + BASE_SHIFT, TEMP2
1961
1962	add	AO, TEMP1, AO
1963	add	B,  TEMP2, BO
1964#endif
1965
1966#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1967	sub	K, KK, L
1968#elif defined(LEFT)
1969	add	KK, 2, L
1970#else
1971	add	KK, 2, L
1972#endif
1973	sra	L, 2, L
1974	cmp	L,  0
1975
1976	LDF	[AO + 0 * SIZE], a1
1977	FMOV	FZERO, c01
1978	LDF	[BO + 0 * SIZE], b1
1979	FMOV	FZERO, t1
1980
1981	LDF	[AO + 1 * SIZE], a2
1982	FMOV	FZERO, c02
1983	LDF	[BO + 1 * SIZE], b2
1984	FMOV	FZERO, t2
1985
1986	LDF	[AO + 2 * SIZE], a3
1987	FMOV	FZERO, c03
1988	LDF	[BO + 2 * SIZE], b3
1989	FMOV	FZERO, t3
1990
1991	LDF	[AO + 3 * SIZE], a4
1992	FMOV	FZERO, c04
1993	LDF	[BO + 3 * SIZE], b4
1994	FMOV	FZERO, t4
1995#endif
1996
1997	ble,pn	%icc, .LL155
1998	nop
1999
/* Main loop: four steps per pass.  BO advances by 8 elements near the top,
   AO by 8 elements just before the branch. */
2000.LL152:
2001	FADD	c01, t1, c01
2002	add	L, -1, L
2003	FMUL	a1, b1, t1
2004	prefetch [AO + APREFETCHSIZE * SIZE], 0
2005
2006	FADD	c02, t2, c02
2007	add	BO,  8 * SIZE, BO
2008	FMUL	a1, b2, t2
2009	LDF	[AO + 4 * SIZE], a1
2010
2011	FADD	c03, t3, c03
2012	cmp	L, 0
2013	FMUL	a2, b1, t3
2014	LDF	[BO - 4 * SIZE], b1
2015
2016	FADD	c04, t4, c04
2017	nop
2018	FMUL	a2, b2, t4
2019	LDF	[AO + 5 * SIZE], a2
2020
2021	FADD	c01, t1, c01
2022	nop
2023	FMUL	a3, b3, t1
2024	LDF	[BO - 3 * SIZE], b2
2025
2026	FADD	c02, t2, c02
2027	nop
2028	FMUL	a3, b4, t2
2029	LDF	[AO + 6 * SIZE], a3
2030
2031	FADD	c03, t3, c03
2032	nop
2033	FMUL	a4, b3, t3
2034	LDF	[BO - 2 * SIZE], b3
2035
2036	FADD	c04, t4, c04
2037	nop
2038	FMUL	a4, b4, t4
2039	LDF	[AO + 7 * SIZE], a4
2040
2041	FADD	c01, t1, c01
2042	nop
2043	FMUL	a1, b1, t1
2044	LDF	[BO - 1 * SIZE], b4
2045
2046	FADD	c02, t2, c02
2047	FMUL	a1, b2, t2
2048	LDF	[AO +  8 * SIZE], a1
2049
2050	FADD	c03, t3, c03
2051	FMUL	a2, b1, t3
2052	LDF	[BO +  0 * SIZE], b1
2053
2054	FADD	c04, t4, c04
2055	FMUL	a2, b2, t4
2056	LDF	[AO +  9 * SIZE], a2
2057
2058	FADD	c01, t1, c01
2059	FMUL	a3, b3, t1
2060	LDF	[BO +  1 * SIZE], b2
2061
2062	FADD	c02, t2, c02
2063	FMUL	a3, b4, t2
2064	LDF	[AO + 10 * SIZE], a3
2065
2066	FADD	c03, t3, c03
2067	FMUL	a4, b3, t3
2068	LDF	[BO +  2 * SIZE], b3
2069
2070	FADD	c04, t4, c04
2071	FMUL	a4, b4, t4
2072	LDF	[AO + 11 * SIZE], a4
2073
2074	add	AO,  8 * SIZE, AO
2075	bg,pt	%icc, .LL152
2076	LDF	[BO +  3 * SIZE], b4	/* delay slot */
2077
/* Remainder: (step count & 3) single steps. */
2078.LL155:
2079#ifndef TRMMKERNEL
2080	and	K, 3, L
2081#else
2082#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2083	sub	K, KK, L
2084#elif defined(LEFT)
2085	add	KK, 2, L
2086#else
2087	add	KK, 2, L
2088#endif
2089	and	L, 3,  L
2090#endif
2091	cmp	L,  0
2092	ble,a,pn %icc, .LL159
2093	nop
2094
/* Unlike the other remainder loops, this one reloads a1/a2/b1/b2 at the
   top of every pass instead of relying on values loaded earlier. */
2095.LL156:
2096	LDF	[AO + 0 * SIZE], a1
2097	LDF	[AO + 1 * SIZE], a2
2098
2099	LDF	[BO + 0 * SIZE], b1
2100	LDF	[BO + 1 * SIZE], b2
2101
2102	FADD	c01, t1, c01
2103	FADD	c02, t2, c02
2104	FADD	c03, t3, c03
2105	FADD	c04, t4, c04
2106
2107	FMUL	a1, b1, t1
2108	FMUL	a1, b2, t2
2109	FMUL	a2, b1, t3
2110	FMUL	a2, b2, t4
2111
2112	add	AO, 2 * SIZE, AO
2113	add	BO, 2 * SIZE, BO
2114
2115	add	L, -1, L
2116	cmp	L, 0
2117	bg,pt	%icc, .LL156
2118	nop
2119
/* Write-back: add the pending products, scale by ALPHA, then either add the
   old C values (GEMM build) or store directly (TRMM build). */
2120.LL159:
2121#ifndef TRMMKERNEL
2122	LDF	[C1 + 0 * SIZE], a1
2123	LDF	[C2 + 0 * SIZE], a2
2124	LDF	[C1 + 1 * SIZE], a3
2125	LDF	[C2 + 1 * SIZE], a4
2126
2127	FADD	c01, t1, c01
2128	FADD	c02, t2, c02
2129	FADD	c03, t3, c03
2130	FADD	c04, t4, c04
2131
2132	FMUL	c01, ALPHA, c01
2133	FMUL	c02, ALPHA, c02
2134	FMUL	c03, ALPHA, c03
2135	FMUL	c04, ALPHA, c04
2136
2137	FADD	c01, a1, c01
2138	FADD	c02, a2, c02
2139	FADD	c03, a3, c03
2140	FADD	c04, a4, c04
2141
2142	STF	c01, [C1 + 0 * SIZE]
2143	STF	c02, [C2 + 0 * SIZE]
2144	STF	c03, [C1 + 1 * SIZE]
2145	add	C1, 2 * SIZE, C1
2146	STF	c04, [C2 + 1 * SIZE]
2147	add	C2, 2 * SIZE, C2
2148#else
2149	FADD	c01, t1, c01
2150	FADD	c02, t2, c02
2151	FADD	c03, t3, c03
2152	FADD	c04, t4, c04
2153
2154	FMUL	c01, ALPHA, c01
2155	FMUL	c02, ALPHA, c02
2156	FMUL	c03, ALPHA, c03
2157	FMUL	c04, ALPHA, c04
2158
2159	STF	c01, [C1 + 0 * SIZE]
2160	STF	c02, [C2 + 0 * SIZE]
2161	STF	c03, [C1 + 1 * SIZE]
2162	STF	c04, [C2 + 1 * SIZE]
2163	add	C1, 2 * SIZE, C1
2164	add	C2, 2 * SIZE, C2
2165
/* TRMM: move AO/BO forward over the K - KK - 2 steps not walked above. */
2166#if ( defined(LEFT) &&  defined(TRANSA)) || \
2167    (!defined(LEFT) && !defined(TRANSA))
2168	sub	K, KK, TEMP1
2169#ifdef LEFT
2170	add	TEMP1, -2, TEMP1
2171#else
2172	add	TEMP1, -2, TEMP1
2173#endif
2174	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
2175	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
2176
2177	add	AO, TEMP2, AO
2178	add	BO, TEMP1, BO
2179#endif
2180
2181#ifdef LEFT
2182	add	KK, 2, KK
2183#endif
2184#endif
2185
/*
 * 1 x 2 tile of the N & 2 stage; runs once when bit 0 of M is set.
 * Each step consumes 1 element from AO and 2 from BO.  Two accumulators
 * are kept per output to shorten the dependency chain:
 *   c01 + c03 -> C1[0]   (products with the first  B value of a step)
 *   c02 + c04 -> C2[0]   (products with the second B value of a step)
 * They are summed at .LL179 before scaling by ALPHA.  C1/C2 are not
 * advanced afterwards; no later tile of this stage uses them.
 */
2186.LL170:
2187	and	M, 1, I
2188	cmp	I, 0
2189	ble,pn	%icc, .LL199
2190	nop
2191
2192.LL171:
2193#if !defined(TRMMKERNEL)
2194	LDF	[AO + 0 * SIZE], a1
2195	sra	K, 2, L		/* L = K >> 2 passes of the unrolled loop */
2196	FMOV	FZERO, c01
2197	LDF	[B  + 0 * SIZE], b1
2198	mov	B, BO
2199	FMOV	FZERO, t1
2200
2201 	LDF	[AO + 1 * SIZE], a2
2202	cmp	L,  0
2203	FMOV	FZERO, c02
2204	LDF	[B  + 1 * SIZE], b2
2205	FMOV	FZERO, t2
2206
2207	LDF	[AO + 2 * SIZE], a3
2208	FMOV	FZERO, c03
2209
2210	LDF	[B  + 2 * SIZE], b3
2211	FMOV	FZERO, t3
2212
2213	LDF	[AO + 3 * SIZE], a4
2214	FMOV	FZERO, c04
2215	LDF	[B  + 3 * SIZE], b4
2216	FMOV	FZERO, t4
2217#else
/* TRMM: start at the head of B, or skip KK steps (1 element per step in AO,
   2 per step in BO). */
2218#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2219	mov	B, BO
2220#else
2221	sll	KK, 0 + BASE_SHIFT, TEMP1
2222	sll	KK, 1 + BASE_SHIFT, TEMP2
2223
2224	add	AO, TEMP1, AO
2225	add	B,  TEMP2, BO
2226#endif
2227
2228#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2229	sub	K, KK, L
2230#elif defined(LEFT)
2231	add	KK, 1, L
2232#else
2233	add	KK, 2, L
2234#endif
2235	sra	L, 2, L
2236	cmp	L,  0
2237
2238	LDF	[AO + 0 * SIZE], a1
2239	FMOV	FZERO, c01
2240	LDF	[BO + 0 * SIZE], b1
2241	FMOV	FZERO, t1
2242
2243 	LDF	[AO + 1 * SIZE], a2
2244	FMOV	FZERO, c02
2245	LDF	[BO + 1 * SIZE], b2
2246	FMOV	FZERO, t2
2247
2248	LDF	[AO + 2 * SIZE], a3
2249	FMOV	FZERO, c03
2250	LDF	[BO  + 2 * SIZE], b3
2251	FMOV	FZERO, t3
2252
2253	LDF	[AO + 3 * SIZE], a4
2254	FMOV	FZERO, c04
2255	LDF	[BO  + 3 * SIZE], b4
2256	FMOV	FZERO, t4
2257#endif
2258
2259	ble,pn	%icc, .LL175
2260	nop
2261
/* Main loop: four steps per pass.  AO advances by 4 elements at the top,
   BO by 8 elements near the bottom. */
2262.LL172:
2263	FADD	c01, t1, c01
2264	add	AO,  4 * SIZE, AO
2265	FMUL	a1, b1, t1
2266	LDF	[BO + 4 * SIZE], b1
2267
2268	FADD	c02, t2, c02
2269	FMUL	a1, b2, t2
2270	LDF	[BO + 5 * SIZE], b2
2271
2272	add	L, -1, L
2273	LDF	[AO + 0 * SIZE], a1
2274
2275	FADD	c03, t3, c03
2276	cmp	L, 0
2277	FMUL	a2, b3, t3
2278	LDF	[BO + 6 * SIZE], b3
2279
2280	FADD	c04, t4, c04
2281	FMUL	a2, b4, t4
2282	LDF	[BO + 7 * SIZE], b4
2283	LDF	[AO + 1 * SIZE], a2
2284
2285	FADD	c01, t1, c01
2286	FMUL	a3, b1, t1
2287	LDF	[BO +  8 * SIZE], b1
2288
2289	FADD	c02, t2, c02
2290	FMUL	a3, b2, t2
2291	LDF	[BO +  9 * SIZE], b2
2292	LDF	[AO + 2 * SIZE], a3
2293
2294	FADD	c03, t3, c03
2295	FMUL	a4, b3, t3
2296	LDF	[BO + 10 * SIZE], b3
2297	FADD	c04, t4, c04
2298	FMUL	a4, b4, t4
2299	LDF	[BO + 11 * SIZE], b4
2300	add	BO,  8 * SIZE, BO
2301
2302	bg,pt	%icc, .LL172
2303	LDF	[AO + 3 * SIZE], a4	/* delay slot */
2304
/* Remainder: (step count & 3) single steps; only c01/c02 and t1/t2 are
   used here. */
2305.LL175:
2306#ifndef TRMMKERNEL
2307	and	K, 3, L
2308#else
2309#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2310	sub	K, KK, L
2311#elif defined(LEFT)
2312	add	KK, 1, L
2313#else
2314	add	KK, 2, L
2315#endif
2316	and	L, 3,  L
2317#endif
2318	cmp	L,  0
2319	ble,a,pn %icc, .LL179
2320	nop
2321
2322.LL176:
2323	FADD	c01, t1, c01
2324	add	L, -1, L
2325	FMUL	a1, b1, t1
2326	add	AO, 1 * SIZE, AO
2327	LDF	[BO + 2 * SIZE], b1
2328	FADD	c02, t2, c02
2329	cmp	L, 0
2330	FMUL	a1, b2, t2
2331	LDF	[BO + 3 * SIZE], b2
2332
2333	add	BO, 2 * SIZE, BO
2334	bg,pt	%icc, .LL176
2335	LDF	[AO + 0 * SIZE], a1
2336
/* Write-back: add the pending products, fold c03/c04 into c01/c02, scale by
   ALPHA, then add the old C values (GEMM build) or store directly (TRMM). */
2337.LL179:
2338#ifndef TRMMKERNEL
2339	FADD	c01, t1, c01
2340	LDF	[C1 + 0 * SIZE], a1
2341	FADD	c02, t2, c02
2342	LDF	[C2 + 0 * SIZE], a2
2343	FADD	c03, t3, c03
2344	FADD	c04, t4, c04
2345
2346	FADD	c01, c03, c01
2347	FADD	c02, c04, c02
2348
2349	FMUL	c01, ALPHA, c01
2350	FMUL	c02, ALPHA, c02
2351
2352	FADD	c01, a1, c01
2353	FADD	c02, a2, c02
2354
2355	STF	c01, [C1 + 0 * SIZE]
2356	STF	c02, [C2 + 0 * SIZE]
2357#else
2358
2359	FADD	c01, t1, c01
2360	FADD	c02, t2, c02
2361	FADD	c03, t3, c03
2362	FADD	c04, t4, c04
2363
2364	FADD	c01, c03, c01
2365	FADD	c02, c04, c02
2366
2367	FMUL	c01, ALPHA, c01
2368	FMUL	c02, ALPHA, c02
2369
2370	STF	c01, [C1 + 0 * SIZE]
2371	STF	c02, [C2 + 0 * SIZE]
2372
/* TRMM: move AO/BO forward over the steps not walked above. */
2373#if ( defined(LEFT) &&  defined(TRANSA)) || \
2374    (!defined(LEFT) && !defined(TRANSA))
2375	sub	K, KK, TEMP1
2376#ifdef LEFT
2377	add	TEMP1, -1, TEMP1
2378#else
2379	add	TEMP1, -2, TEMP1
2380#endif
2381	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
2382	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
2383
2384	add	AO, TEMP2, AO
2385	add	BO, TEMP1, BO
2386#endif
2387
2388#ifdef LEFT
2389	add	KK, 1, KK
2390#endif
2391#endif
2392
/*
 * End of the N & 2 stage: B takes over the position BO reached, so the
 * N & 1 stage reads from there.  In the TRMM build without LEFT, KK is
 * advanced by 2; every other build executes a nop in its place.
 */
2393.LL199:
2394	mov	BO, B
2395#if defined(TRMMKERNEL) && !defined(LEFT)
2396	add	KK, 2, KK
2397#else
2398	nop
2399#endif
2400
/*
 * N & 1 stage setup.  Entered only when bit 0 of N is set; otherwise the
 * routine returns through .LL999.  Only one output pointer, C1 = C, is
 * used in this stage.  The instruction after each "ble" is a delay slot
 * and executes whether or not the branch is taken.
 */
2401.LL200:
2402	and	N, 1, J
2403	sra	M, 2, I		/* I = M >> 2, trip count of the .LL221 loop */
2404
2405	cmp	J, 0
2406	ble,pn	%icc, .LL999	/* (N & 1) == 0: nothing left to do */
2407	mov	A, AO		/* delay slot: AO restarts at A */
2408
2409#if defined(TRMMKERNEL) &&  defined(LEFT)
2410	mov	OFFSET, KK
2411#endif
2412
2413	cmp	I, 0
2414	ble,pn	%icc, .LL250	/* no 4-element tiles: go to the M & 2 tile */
2415	mov	C, C1		/* delay slot */
2416
/*
 * 4 x 1 tile loop of the N & 1 stage; executed I = M >> 2 times.
 * Each step consumes 4 elements from AO and 1 from BO:
 *   c01..c04 accumulate a1..a4 * b  -> C1[0..3]
 * b1..b4 hold the B values of four consecutive steps.  Each FADD adds
 * the product computed one round earlier; the last four products are
 * added at .LL229.
 */
2417.LL221:
2418#if !defined(TRMMKERNEL)
2419	LDF	[AO + 0 * SIZE], a1
2420	sra	K, 2, L		/* L = K >> 2 passes of the unrolled loop */
2421	FMOV	FZERO, c01
2422	LDF	[B  + 0 * SIZE], b1
2423	mov	B, BO
2424	FMOV	FZERO, t1
2425
2426	LDF	[AO + 1 * SIZE], a2
2427	cmp	L,  0
2428	FMOV	FZERO, c02
2429	LDF	[B  + 1 * SIZE], b2
2430	FMOV	FZERO, t2
2431
2432	LDF	[AO + 2 * SIZE], a3
2433	FMOV	FZERO, c03
2434	LDF	[B  + 2 * SIZE], b3
2435	FMOV	FZERO, t3
2436
2437	LDF	[AO + 3 * SIZE], a4
2438	FMOV	FZERO, c04
2439	LDF	[B  + 3 * SIZE], b4
2440	FMOV	FZERO, t4
2441#else
/* TRMM: start at the head of B, or skip KK steps (4 elements per step in
   AO, 1 per step in BO). */
2442#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2443	mov	B, BO
2444#else
2445	sll	KK, 2 + BASE_SHIFT, TEMP1
2446	sll	KK, 0 + BASE_SHIFT, TEMP2
2447
2448	add	AO, TEMP1, AO
2449	add	B,  TEMP2, BO
2450#endif
2451
2452#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2453	sub	K, KK, L
2454#elif defined(LEFT)
2455	add	KK, 4, L
2456#else
2457	add	KK, 1, L
2458#endif
2459	sra	L, 2, L
2460	cmp	L,  0
2461
2462	LDF	[AO + 0 * SIZE], a1
2463	FMOV	FZERO, c01
2464	LDF	[BO  + 0 * SIZE], b1
2465	FMOV	FZERO, t1
2466
2467	LDF	[AO + 1 * SIZE], a2
2468	FMOV	FZERO, c02
2469	LDF	[BO + 1 * SIZE], b2
2470	FMOV	FZERO, t2
2471
2472	LDF	[AO + 2 * SIZE], a3
2473	FMOV	FZERO, c03
2474	LDF	[BO + 2 * SIZE], b3
2475	FMOV	FZERO, t3
2476
2477	LDF	[AO + 3 * SIZE], a4
2478	FMOV	FZERO, c04
2479	LDF	[BO + 3 * SIZE], b4
2480	FMOV	FZERO, t4
2481#endif
2482
2483	ble,pn	%icc, .LL225
2484	prefetch [C1 + 4 * SIZE], 2	/* delay slot: issued on both paths */
2485
/* Main loop: four steps per pass.  BO advances by 4 elements at the top,
   AO by 16 elements just before the branch. */
2486.LL222:
2487	FADD	c01, t1, c01
2488	add	BO,  4 * SIZE, BO
2489	FMUL	a1, b1, t1
2490	LDF	[AO +  4 * SIZE], a1
2491
2492	FADD	c02, t2, c02
2493	FMUL	a2, b1, t2
2494	LDF	[AO +  5 * SIZE], a2
2495
2496	FADD	c03, t3, c03
2497	add	L, -1, L
2498	FMUL	a3, b1, t3
2499	LDF	[AO +  6 * SIZE], a3
2500
2501	FADD	c04, t4, c04
2502	FMUL	a4, b1, t4
2503	LDF	[AO +  7 * SIZE], a4
2504	LDF	[BO +  0 * SIZE], b1
2505
2506	FADD	c01, t1, c01
2507	cmp	L,  0
2508	FMUL	a1, b2, t1
2509	LDF	[AO +  8 * SIZE], a1
2510
2511	FADD	c02, t2, c02
2512	FMUL	a2, b2, t2
2513	LDF	[AO +  9 * SIZE], a2
2514
2515	FADD	c03, t3, c03
2516	FMUL	a3, b2, t3
2517	LDF	[AO + 10 * SIZE], a3
2518
2519	FADD	c04, t4, c04
2520	FMUL	a4, b2, t4
2521	LDF	[AO + 11 * SIZE], a4
2522	LDF	[BO +  1 * SIZE], b2
2523
2524	FADD	c01, t1, c01
2525	FMUL	a1, b3, t1
2526	LDF	[AO + 12 * SIZE], a1
2527
2528	FADD	c02, t2, c02
2529	FMUL	a2, b3, t2
2530	LDF	[AO + 13 * SIZE], a2
2531
2532	FADD	c03, t3, c03
2533	FMUL	a3, b3, t3
2534	LDF	[AO + 14 * SIZE], a3
2535
2536	FADD	c04, t4, c04
2537	FMUL	a4, b3, t4
2538	LDF	[AO + 15 * SIZE], a4
2539	LDF	[BO +  2 * SIZE], b3
2540
2541	FADD	c01, t1, c01
2542	FMUL	a1, b4, t1
2543	LDF	[AO + 16 * SIZE], a1
2544
2545	FADD	c02, t2, c02
2546	FMUL	a2, b4, t2
2547	LDF	[AO + 17 * SIZE], a2
2548
2549	FADD	c03, t3, c03
2550	FMUL	a3, b4, t3
2551	LDF	[AO + 18 * SIZE], a3
2552
2553	FADD	c04, t4, c04
2554	FMUL	a4, b4, t4
2555	LDF	[AO + 19 * SIZE], a4
2556	add	AO, 16 * SIZE, AO
2557
2558	bg,pt	%icc, .LL222
2559	LDF	[BO +  3 * SIZE], b4	/* delay slot */
2560
/* Remainder: (step count & 3) single steps. */
2561.LL225:
2562#ifndef TRMMKERNEL
2563	and	K, 3, L
2564#else
2565#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2566	sub	K, KK, L
2567#elif defined(LEFT)
2568	add	KK, 4, L
2569#else
2570	add	KK, 1, L
2571#endif
2572	and	L, 3,  L
2573#endif
2574	cmp	L,  0
2575	ble,a,pn %icc, .LL229
2576	nop
2577
2578.LL226:
2579	FADD	c01, t1, c01
2580	add	BO, 1 * SIZE, BO
2581	FMUL	a1, b1, t1
2582	LDF	[AO + 4 * SIZE], a1
2583
2584	FADD	c02, t2, c02
2585	add	L, -1, L
2586	FMUL	a2, b1, t2
2587	LDF	[AO + 5 * SIZE], a2
2588
2589	FADD	c03, t3, c03
2590	cmp	L, 0
2591	FMUL	a3, b1, t3
2592	LDF	[AO + 6 * SIZE], a3
2593
2594	FADD	c04, t4, c04
2595	FMUL	a4, b1, t4
2596	LDF	[AO + 7 * SIZE], a4
2597	add	AO, 4 * SIZE, AO
2598
2599	bg,pt	%icc, .LL226
2600	LDF	[BO + 0 * SIZE], b1
2601
/*
 * Write-back: add the pending products and scale by ALPHA.  GEMM build
 * adds the old C1[0..3] values; TRMM build stores directly.  Both builds
 * leave "cmp I, 0" pending for the bg at the end.
 */
2602.LL229:
2603#ifndef TRMMKERNEL
2604	FADD	c01, t1, c01
2605	add	I, -1, I
2606	FADD	c02, t2, c02
2607	cmp	I, 0
2608	FADD	c03, t3, c03
2609	FADD	c04, t4, c04
2610
2611	FMUL	c01, ALPHA, c01
2612	FMUL	c02, ALPHA, c02
2613	FMUL	c03, ALPHA, c03
2614	FMUL	c04, ALPHA, c04
2615
2616	LDF	[C1 + 0 * SIZE], a1
2617	LDF	[C1 + 1 * SIZE], a2
2618	LDF	[C1 + 2 * SIZE], a3
2619	LDF	[C1 + 3 * SIZE], a4
2620
2621	FADD	c01, a1, c01
2622	FADD	c02, a2, c02
2623	FADD	c03, a3, c03
2624	FADD	c04, a4, c04
2625
2626	STF	c01, [C1 + 0 * SIZE]
2627	STF	c02, [C1 + 1 * SIZE]
2628	STF	c03, [C1 + 2 * SIZE]
2629	STF	c04, [C1 + 3 * SIZE]
2630	add	C1, 4 * SIZE, C1
2631#else
2632	FADD	c01, t1, c01
2633	FADD	c02, t2, c02
2634	FADD	c03, t3, c03
2635	FADD	c04, t4, c04
2636
2637	FMUL	c01, ALPHA, c01
2638	FMUL	c02, ALPHA, c02
2639	FMUL	c03, ALPHA, c03
2640	FMUL	c04, ALPHA, c04
2641
2642	STF	c01, [C1 + 0 * SIZE]
2643	STF	c02, [C1 + 1 * SIZE]
2644	STF	c03, [C1 + 2 * SIZE]
2645	STF	c04, [C1 + 3 * SIZE]
2646	add	C1, 4 * SIZE, C1
2647
/* TRMM: move AO/BO forward over the steps not walked above. */
2648#if ( defined(LEFT) &&  defined(TRANSA)) || \
2649    (!defined(LEFT) && !defined(TRANSA))
2650	sub	K, KK, TEMP1
2651#ifdef LEFT
2652	add	TEMP1, -4, TEMP1
2653#else
2654	add	TEMP1, -1, TEMP1
2655#endif
2656	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
2657	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
2658
2659	add	AO, TEMP2, AO
2660	add	BO, TEMP1, BO
2661#endif
2662
2663#ifdef LEFT
2664	add	KK, 4, KK
2665#endif
2666
2667	add	I, -1, I
2668	cmp	I, 0
2669#endif
2670
2671	bg,pt	%icc, .LL221	/* more 4-element tiles left */
2672	nop
2673
/*
 * 2 x 1 tile of the N & 1 stage; runs once when bit 1 of M is set.
 * Each step consumes 2 elements from AO and 1 from BO.  Two accumulators
 * are kept per output and alternate between consecutive steps:
 *   c01 + c03 -> C1[0]      c02 + c04 -> C1[1]
 * They are summed at .LL259 before scaling by ALPHA.
 */
2674.LL250:
2675	and	M, 2, I
2676	cmp	I, 0
2677	ble,pn	%icc, .LL270
2678	nop
2679
2680.LL251:
2681#if !defined(TRMMKERNEL)
2682	LDF	[AO + 0 * SIZE], a1
2683	sra	K, 2, L		/* L = K >> 2 passes of the unrolled loop */
2684	FMOV	FZERO, c01
2685	LDF	[B  + 0 * SIZE], b1
2686	mov	B, BO
2687	FMOV	FZERO, t1
2688
2689	LDF	[AO + 1 * SIZE], a2
2690	cmp	L,  0
2691	FMOV	FZERO, c02
2692	LDF	[B  + 1 * SIZE], b2
2693	FMOV	FZERO, t2
2694
2695	LDF	[AO + 2 * SIZE], a3
2696	FMOV	FZERO, c03
2697	LDF	[B  + 2 * SIZE], b3
2698	FMOV	FZERO, t3
2699
2700	LDF	[AO + 3 * SIZE], a4
2701	FMOV	FZERO, c04
2702	LDF	[B  + 3 * SIZE], b4
2703	FMOV	FZERO, t4
2704#else
/* TRMM: start at the head of B, or skip KK steps (2 elements per step in
   AO, 1 per step in BO). */
2705#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2706	mov	B, BO
2707#else
2708	sll	KK, 1 + BASE_SHIFT, TEMP1
2709	sll	KK, 0 + BASE_SHIFT, TEMP2
2710
2711	add	AO, TEMP1, AO
2712	add	B,  TEMP2, BO
2713#endif
2714
2715#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2716	sub	K, KK, L
2717#elif defined(LEFT)
2718	add	KK, 2, L
2719#else
2720	add	KK, 1, L
2721#endif
2722	sra	L, 2, L
2723	cmp	L,  0
2724
2725	LDF	[AO + 0 * SIZE], a1
2726	FMOV	FZERO, c01
2727	LDF	[BO + 0 * SIZE], b1
2728	FMOV	FZERO, t1
2729
2730	LDF	[AO + 1 * SIZE], a2
2731	FMOV	FZERO, c02
2732	LDF	[BO + 1 * SIZE], b2
2733	FMOV	FZERO, t2
2734
2735	LDF	[AO + 2 * SIZE], a3
2736	FMOV	FZERO, c03
2737	LDF	[BO + 2 * SIZE], b3
2738	FMOV	FZERO, t3
2739
2740	LDF	[AO + 3 * SIZE], a4
2741	FMOV	FZERO, c04
2742	LDF	[BO + 3 * SIZE], b4
2743	FMOV	FZERO, t4
2744#endif
2745
2746	ble,pn	%icc, .LL255
2747	nop
2748
/* Main loop: four steps per pass.  AO advances by 8 elements near the
   bottom, BO by 4 elements in the branch delay slot. */
2749.LL252:
2750	FADD	c01, t1, c01
2751	add	L, -1, L
2752	FMUL	a1, b1, t1
2753	LDF	[AO + 4 * SIZE], a1
2754
2755	FADD	c02, t2, c02
2756	FMUL	a2, b1, t2
2757	LDF	[AO +  5 * SIZE], a2
2758	LDF	[BO +  4 * SIZE], b1
2759
2760	FADD	c03, t3, c03
2761	cmp	L, 0
2762	FMUL	a3, b2, t3
2763	LDF	[AO +  6 * SIZE], a3
2764
2765	FADD	c04, t4, c04
2766	FMUL	a4, b2, t4
2767	LDF	[AO +  7 * SIZE], a4
2768	LDF	[BO +  5 * SIZE], b2
2769
2770	FADD	c01, t1, c01
2771	FMUL	a1, b3, t1
2772	LDF	[AO +  8 * SIZE], a1
2773
2774	FADD	c02, t2, c02
2775	FMUL	a2, b3, t2
2776	LDF	[AO +  9 * SIZE], a2
2777	LDF	[BO +  6 * SIZE], b3
2778
2779	FADD	c03, t3, c03
2780	FMUL	a3, b4, t3
2781	LDF	[AO + 10 * SIZE], a3
2782
2783	FADD	c04, t4, c04
2784	FMUL	a4, b4, t4
2785	LDF	[AO + 11 * SIZE], a4
2786	add	AO,  8 * SIZE, AO
2787
2788	LDF	[BO +  7 * SIZE], b4
2789	bg,pt	%icc, .LL252
2790	add	BO,  4 * SIZE, BO	/* delay slot: runs on exit as well */
2791
/* Remainder: (step count & 3) single steps; only c01/c02 and t1/t2 are
   used here. */
2792.LL255:
2793#ifndef TRMMKERNEL
2794	and	K, 3, L
2795#else
2796#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2797	sub	K, KK, L
2798#elif defined(LEFT)
2799	add	KK, 2, L
2800#else
2801	add	KK, 1, L
2802#endif
2803	and	L, 3,  L
2804#endif
2805	cmp	L,  0
2806	ble,a,pn %icc, .LL259
2807	nop
2808
2809.LL256:
2810
2811	FADD	c01, t1, c01
2812	add	L, -1, L
2813	FMUL	a1, b1, t1
2814	LDF	[AO + 2 * SIZE], a1
2815
2816	FADD	c02, t2, c02
2817	cmp	L, 0
2818	FMUL	a2, b1, t2
2819	LDF	[AO + 3 * SIZE], a2
2820
2821	LDF	[BO + 1 * SIZE], b1
2822	add	AO, 2 * SIZE, AO
2823
2824	bg,pt	%icc, .LL256
2825	add	BO, 1 * SIZE, BO
2826
/* Write-back: add the pending products, fold c03/c04 into c01/c02, scale by
   ALPHA, then add the old C values (GEMM build) or store directly (TRMM). */
2827.LL259:
2828#ifndef TRMMKERNEL
2829	FADD	c01, t1, c01
2830	LDF	[C1 + 0 * SIZE], a1
2831	FADD	c02, t2, c02
2832	LDF	[C1 + 1 * SIZE], a2
2833	FADD	c03, t3, c03
2834	FADD	c04, t4, c04
2835
2836	FADD	c01, c03, c01
2837	FADD	c02, c04, c02
2838	FMUL	c01, ALPHA, c01
2839	FMUL	c02, ALPHA, c02
2840	FADD	c01, a1, c01
2841	FADD	c02, a2, c02
2842
2843	STF	c01, [C1 + 0 * SIZE]
2844	STF	c02, [C1 + 1 * SIZE]
2845	add	C1, 2 * SIZE, C1
2846#else
2847	FADD	c01, t1, c01
2848	FADD	c02, t2, c02
2849	FADD	c03, t3, c03
2850	FADD	c04, t4, c04
2851
2852	FADD	c01, c03, c01
2853	FADD	c02, c04, c02
2854	FMUL	c01, ALPHA, c01
2855	FMUL	c02, ALPHA, c02
2856
2857	STF	c01, [C1 + 0 * SIZE]
2858	STF	c02, [C1 + 1 * SIZE]
2859	add	C1, 2 * SIZE, C1
2860
/* TRMM: move AO/BO forward over the steps not walked above. */
2861#if ( defined(LEFT) &&  defined(TRANSA)) || \
2862    (!defined(LEFT) && !defined(TRANSA))
2863	sub	K, KK, TEMP1
2864#ifdef LEFT
2865	add	TEMP1, -2, TEMP1
2866#else
2867	add	TEMP1, -1, TEMP1
2868#endif
2869	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
2870	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
2871
2872	add	AO, TEMP2, AO
2873	add	BO, TEMP1, BO
2874#endif
2875
2876#ifdef LEFT
2877	add	KK, 2, KK
2878#endif
2879#endif
2880
/*
 * 1 x 1 tile of the N & 1 stage; runs once when bit 0 of M is set.
 * This is a plain dot product of the AO and BO streams.  Two accumulators
 * are used: c01 collects t1 and t3, c02 collects t2 and t4; they are
 * summed at .LL279.  Only c01/c02 are initialised here, c03/c04 are not
 * used by this tile.
 */
2881.LL270:
2882	and	M, 1, I
2883	cmp	I, 0
2884	ble,pn	%icc, .LL999
2885	nop
2886
2887.LL271:
2888#if !defined(TRMMKERNEL)
2889	LDF	[AO + 0 * SIZE], a1
2890	sra	K, 2, L		/* L = K >> 2 passes of the unrolled loop */
2891	FMOV	FZERO, t1
2892
2893 	LDF	[AO + 1 * SIZE], a2
2894	mov	B, BO
2895	FMOV	FZERO, c01
2896
2897	LDF	[AO + 2 * SIZE], a3
2898	cmp	L,  0
2899	FMOV	FZERO, t2
2900
2901	LDF	[AO + 3 * SIZE], a4
2902	FMOV	FZERO, c02
2903
2904	LDF	[BO + 0 * SIZE], b1
2905	FMOV	FZERO, t3
2906	LDF	[BO + 1 * SIZE], b2
2907	FMOV	FZERO, t4
2908	LDF	[BO + 2 * SIZE], b3
2909#else
/* TRMM: start at the head of B, or skip KK steps (1 element per step in
   both AO and BO). */
2910#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2911	mov	B, BO
2912#else
2913	sll	KK, 0 + BASE_SHIFT, TEMP1
2914	sll	KK, 0 + BASE_SHIFT, TEMP2
2915
2916	add	AO, TEMP1, AO
2917	add	B,  TEMP2, BO
2918#endif
2919
2920#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2921	sub	K, KK, L
2922#elif defined(LEFT)
2923	add	KK, 1, L
2924#else
2925	add	KK, 1, L
2926#endif
2927	sra	L, 2, L
2928	cmp	L,  0
2929
2930	LDF	[AO + 0 * SIZE], a1
2931	FMOV	FZERO, t1
2932 	LDF	[AO + 1 * SIZE], a2
2933	FMOV	FZERO, c01
2934
2935	LDF	[AO + 2 * SIZE], a3
2936	FMOV	FZERO, t2
2937	LDF	[AO + 3 * SIZE], a4
2938	FMOV	FZERO, c02
2939
2940	LDF	[BO + 0 * SIZE], b1
2941	FMOV	FZERO, t3
2942	LDF	[BO + 1 * SIZE], b2
2943	FMOV	FZERO, t4
2944	LDF	[BO + 2 * SIZE], b3
2945#endif
2946
2947	ble,pn	%icc, .LL275
2948	LDF	[BO + 3 * SIZE], b4	/* delay slot: fourth B value, both paths */
2949
/* Main loop: four steps per pass; AO and BO each advance by 4 elements. */
2950.LL272:
2951	FADD	c01, t1, c01
2952	add	L, -1, L
2953	add	AO,  4 * SIZE, AO
2954
2955	FMUL	a1, b1, t1
2956	add	BO,  4 * SIZE, BO
2957	LDF	[AO + 0 * SIZE], a1
2958
2959	FADD	c02, t2, c02
2960	cmp	L, 0
2961	LDF	[BO + 0 * SIZE], b1
2962	FMUL	a2, b2, t2
2963
2964	LDF	[AO + 1 * SIZE], a2
2965	FADD	c01, t3, c01
2966	LDF	[BO + 1 * SIZE], b2
2967	FMUL	a3, b3, t3
2968
2969	LDF	[AO + 2 * SIZE], a3
2970	FADD	c02, t4, c02
2971	LDF	[BO + 2 * SIZE], b3
2972	FMUL	a4, b4, t4
2973	LDF	[AO + 3 * SIZE], a4
2974
2975	bg,pt	%icc, .LL272
2976	LDF	[BO + 3 * SIZE], b4	/* delay slot */
2977
/* Remainder: (step count & 3) single steps, accumulated through t1/c01. */
2978.LL275:
2979#ifndef TRMMKERNEL
2980	and	K, 3, L
2981#else
2982#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2983	sub	K, KK, L
2984#elif defined(LEFT)
2985	add	KK, 1, L
2986#else
2987	add	KK, 1, L
2988#endif
2989	and	L, 3,  L
2990#endif
2991	cmp	L,  0
2992	ble,a,pn %icc, .LL279
2993	nop
2994
2995.LL276:
2996	FADD	c01, t1, c01
2997	add	L, -1, L
2998	FMUL	a1, b1, t1
2999	LDF	[AO + 1 * SIZE], a1
3000
3001	LDF	[BO + 1 * SIZE], b1
3002	add	BO, 1 * SIZE, BO
3003	cmp	L, 0
3004	bg,pt	%icc, .LL276
3005	add	AO, 1 * SIZE, AO
3006
/* Write-back: add the pending products t1..t4, fold c02 into c01, scale by
   ALPHA, then add the old C1[0] (GEMM build) or store directly (TRMM). */
3007.LL279:
3008#ifndef TRMMKERNEL
3009	FADD	c01, t1, c01
3010
3011	LDF	[C1 + 0 * SIZE], a1
3012	FADD	c02, t2, c02
3013	FADD	c01, t3, c01
3014	FADD	c02, t4, c02
3015	FADD	c01, c02, c01
3016
3017	FMUL	c01, ALPHA, c01
3018	FADD	c01, a1, c01
3019	STF	c01, [C1 + 0 * SIZE]
3020#else
3021	FADD	c01, t1, c01
3022	FADD	c02, t2, c02
3023	FADD	c01, t3, c01
3024	FADD	c02, t4, c02
3025	FADD	c01, c02, c01
3026
3027	FMUL	c01, ALPHA, c01
3028	STF	c01, [C1 + 0 * SIZE]
3029
/* TRMM: move AO/BO forward over the steps not walked above. */
3030#if ( defined(LEFT) &&  defined(TRANSA)) || \
3031    (!defined(LEFT) && !defined(TRANSA))
3032	sub	K, KK, TEMP1
3033#ifdef LEFT
3034	add	TEMP1, -1, TEMP1
3035#else
3036	add	TEMP1, -1, TEMP1
3037#endif
3038	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
3039	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
3040
3041	add	AO, TEMP2, AO
3042	add	BO, TEMP1, BO
3043#endif
3044
3045#ifdef LEFT
3046	add	KK, 1, KK
3047#endif
3048#endif
3049
/*
 * Common exit.  "return" transfers control to %i7 + 8 and restores the
 * caller's register window; "clr %o0" sits in its delay slot, so it
 * zeroes %o0 as seen by the caller (per the SPARC V9 RETURN definition).
 * NOTE(review): %o0 is presumably the routine's integer return value.
 */
3050.LL999:
3051	return	%i7 + 8
3052	clr	%o0
3053
3054	EPILOGUE
3055