1/*********************************************************************/
2/* Copyright 2005-2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Software-prefetch tuning for the A operand stream.
 *
 * APREFETCHSIZE is a look-ahead distance counted in elements: the inner
 * loops issue  prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY.
 * APREFETCH_CATEGORY is handed to the prefetch instruction as its function
 * code operand.
 * NOTE(review): function code 0 should be "prefetch for several reads" per
 * the SPARC V9 manual -- confirm before retuning.
 */
42#define APREFETCHSIZE 24
43#define APREFETCH_CATEGORY 0
44
/*
 * Integer register roles used by this kernel.
 *
 * M, N and K are read directly from %i0-%i2 and drive the loop counts:
 * J = N >> 3 (column blocks of 8), I = M >> 1 (row pairs) followed by
 * M & 1 (single leftover row), and L counts steps along K.
 */
45#define M	%i0
46#define N	%i1
47#define K	%i2
48
/*
 * A and B swap registers in the DOUBLE, non-__64BIT__ build: there the
 * prologue stores %i3 and %i4 to the stack and reloads that slot as ALPHA,
 * so A is taken from %i5 and B is loaded from the stack into %i4.
 * In every other build A and B are used as they arrive in %i4 / %i5.
 */
49#if defined(DOUBLE) && !defined(__64BIT__)
50#define A	%i5
51#define B	%i4
52#else
53#define A	%i4
54#define B	%i5
55#endif
56
/*
 * C and LDC are loaded from the stack by the prologue.  LDC is then
 * shifted left by BASE_SHIFT before it is used to step between the
 * C1..C8 pointers.
 */
57#define C	%o4
58#define LDC	%o5
59
/*
 * AO / BO: running read pointers into A and B for the current tile.
 * I, J, L: loop counters (see the note on M, N, K above).
 */
60#define AO	%l0
61#define BO	%l1
62#define I	%l2
63#define J	%l3
64#define L	%l4
65
/*
 * BB: prefetch cursor into B.  Set to B + (K << (BASE_SHIFT + 3)) at the
 * top of each column block, prefetched once per row pair and advanced by
 * 32 * SIZE each time.
 */
66#define BB	%o7
67
/*
 * C1..C8: store pointers for eight consecutive columns of C, each LDC
 * bytes after the previous one.
 */
68#define C1	%o0
69#define C2	%o1
70#define C3	%o2
71#define C4	%o3
72
/*
 * C8 reuses %i3.  In the non-__64BIT__ builds the prologue stores %i3 to
 * the stack (it is reloaded as part of ALPHA) before C8 is first written.
 */
73#define C5	%l5
74#define	C6	%l6
75#define C7	%l7
76#define C8	%i3
77
/*
 * OFFSET, KK, TEMP1, TEMP2 are referenced only under TRMMKERNEL in the
 * code visible here.  They occupy global registers %g1-%g4, which the
 * TRMMKERNEL prologue stores to the stack.
 * NOTE(review): %g1 is stored after OFFSET has already been loaded into
 * it, so the saved slot holds OFFSET rather than the caller's %g1 --
 * confirm against the epilogue's restore sequence.
 */
78#define OFFSET	%g1
79#define	KK	%g2
80#define TEMP1	%g3
81#define TEMP2	%g4
82
/*
 * Floating-point register allocation.
 *
 * Every value has two spellings that must be kept in sync:
 *   - a register name (c01, a1, b1, ALPHA) used as a plain operand of
 *     LDF / STF / FMUL / FMOV;
 *   - a bare number (cc01, aa1, bb1, alpha) passed to the FMADD(...) and
 *     FCLR(...) macros, which are defined outside this file.
 *
 * c01..c16 : the 2 x 8 accumulator tile (c01/c02 are stored to C1,
 *            c03/c04 to C2, ... c15/c16 to C8).
 * a1..a5   : elements of A.  b1..b9 : elements of B.
 *            After the K loops, a1-a4 and b1-b4 are reused to hold the
 *            old C values for the alpha * acc + C update.
 * ALPHA    : the scalar multiplier.
 */
83#ifdef DOUBLE
/* Double precision: accumulators take the even registers %f0..%f30. */
84#define c01	%f0
85#define c02	%f2
86#define c03	%f4
87#define c04	%f6
88#define c05	%f8
89#define c06	%f10
90#define c07	%f12
91#define c08	%f14
92#define c09	%f16
93#define c10	%f18
94#define c11	%f20
95#define c12	%f22
96#define c13	%f24
97#define c14	%f26
98#define c15	%f28
99#define c16	%f30
100
/* A operands: %f32..%f40. */
101#define a1	%f32
102#define a2	%f34
103#define a3	%f36
104#define a4	%f38
105#define a5	%f40
106
/* B operands: %f42..%f58. */
107#define b1	%f42
108#define b2	%f44
109#define b3	%f46
110#define b4	%f48
111#define b5	%f50
112#define b6	%f52
113#define b7	%f54
114#define b8	%f56
115#define b9	%f58
116
117#define ALPHA	%f62
118
/* Numeric operands for FMADD / FCLR; for %f0..%f30 these equal the register number. */
119#define cc01	0
120#define cc02	2
121#define cc03	4
122#define cc04	6
123#define cc05	8
124#define cc06	10
125#define cc07	12
126#define cc08	14
127#define cc09	16
128#define cc10	18
129#define cc11	20
130#define cc12	22
131#define cc13	24
132#define cc14	26
133#define cc15	28
134#define cc16	30
135
/*
 * For registers above %f31 the number is odd: %f32 -> 1, %f34 -> 3, ...
 * %f62 -> 31.
 * NOTE(review): this matches the SPARC V9 5-bit double-register field
 * encoding (bit 5 of the register number moved into bit 0) -- confirm
 * against the FMADD / FCLR macro definitions.
 */
136#define aa1	 1
137#define aa2	 3
138#define aa3	 5
139#define aa4	 7
140#define aa5	 9
141
142#define bb1	11
143#define bb2	13
144#define bb3	15
145#define bb4	17
146#define bb5	19
147#define bb6	21
148#define bb7	23
149#define bb8	25
150#define bb9	27
151
152#define alpha	31
153#else
/* Single precision: consecutive registers %f0..%f15 hold the accumulators. */
154#define c01	%f0
155#define c02	%f1
156#define c03	%f2
157#define c04	%f3
158#define c05	%f4
159#define c06	%f5
160#define c07	%f6
161#define c08	%f7
162#define c09	%f8
163#define c10	%f9
164#define c11	%f10
165#define c12	%f11
166#define c13	%f12
167#define c14	%f13
168#define c15	%f14
169#define c16	%f15
170
/* A operands: %f16..%f20. */
171#define a1	%f16
172#define a2	%f17
173#define a3	%f18
174#define a4	%f19
175#define a5	%f20
176
/* B operands: %f21..%f29. */
177#define b1	%f21
178#define b2	%f22
179#define b3	%f23
180#define b4	%f24
181#define b5	%f25
182#define b6	%f26
183#define b7	%f27
184#define b8	%f28
185#define b9	%f29
186
187#define ALPHA	%f31
188
/* Numeric operands for FMADD / FCLR; here each one equals the register number. */
189#define cc01	0
190#define cc02	1
191#define cc03	2
192#define cc04	3
193#define cc05	4
194#define cc06	5
195#define cc07	6
196#define cc08	7
197#define cc09	8
198#define cc10	9
199#define cc11	10
200#define cc12	11
201#define cc13	12
202#define cc14	13
203#define cc15	14
204#define cc16	15
205
206#define aa1	16
207#define aa2	17
208#define aa3	18
209#define aa4	19
210#define aa5	20
211
212#define bb1	21
213#define bb2	22
214#define bb3	23
215#define bb4	24
216#define bb5	25
217#define bb6	26
218#define bb7	27
219#define bb8	28
220#define bb9	29
221
222#define alpha	31
223
224#endif
225
226        .register %g2, #scratch
227        .register %g3, #scratch
228
229	PROLOGUE
230	SAVESP
231	nop
232
233#ifndef __64BIT__
234
235#ifdef DOUBLE
236	st	%i3, [%sp + STACK_START + 16]
237	st	%i4, [%sp + STACK_START + 20]
238
239	ld	[%sp + STACK_START + 28], B
240	ld	[%sp + STACK_START + 32], C
241	ld	[%sp + STACK_START + 36], LDC
242#ifdef TRMMKERNEL
243	ld	[%sp + STACK_START + 40], OFFSET
244#endif
245#else
246	st	%i3, [%sp + STACK_START + 16]
247
248	ld	[%sp + STACK_START + 28], C
249	ld	[%sp + STACK_START + 32], LDC
250#ifdef TRMMKERNEL
251	ld	[%sp + STACK_START + 36], OFFSET
252#endif
253#endif
254	LDF	[%sp + STACK_START + 16], ALPHA
255#ifdef TRMMKERNEL
256	st	%g1, [%sp + STACK_START +  8]
257	st	%g2, [%sp + STACK_START + 12]
258	st	%g3, [%sp + STACK_START + 16]
259	st	%g4, [%sp + STACK_START + 20]
260#endif
261#else
262
263	ldx	[%sp+  STACK_START + 56], C
264	ldx	[%sp+  STACK_START + 64], LDC
265#ifdef TRMMKERNEL
266	ldx	[%sp+  STACK_START + 72], OFFSET
267#endif
268
269#ifdef DOUBLE
270	FMOV	%f6, ALPHA
271#else
272	FMOV	%f7, ALPHA
273#endif
274
275#ifdef TRMMKERNEL
276	stx	%g1, [%sp + STACK_START + 32]
277	stx	%g2, [%sp + STACK_START + 40]
278	stx	%g3, [%sp + STACK_START + 48]
279	stx	%g4, [%sp + STACK_START + 56]
280#endif
281
282#endif
283
284#if defined(TRMMKERNEL) && !defined(LEFT)
285	neg	OFFSET, KK
286#endif
287
288	sra	N, 3, J
289	cmp	J, 0
290	ble,pn	%icc, .LL30
291	sll	LDC, BASE_SHIFT, LDC
292
293.LL11:
294	mov	C,  C1
295	add	C,  LDC, C2
296	add	C2, LDC, C3
297	add	C3, LDC, C4
298	add	C4, LDC, C5
299	add	C5, LDC, C6
300	add	C6, LDC, C7
301	add	C7, LDC, C8
302	add	C8, LDC, C
303
304	sll	K, BASE_SHIFT + 3, BB
305
306#if defined(TRMMKERNEL) &&  defined(LEFT)
307	mov	OFFSET, KK
308#endif
309
310	mov	A, AO
311
312	sra	M, 1, I
313	cmp	I, 0
314	ble,pn	%icc, .LL20
315	add	B, BB, BB
316	.align 4
317
318.LL12:
319	prefetch [BB +  0 * SIZE], 1
320
321#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
322	mov	B, BO
323#else
324	sll	KK, BASE_SHIFT + 1, TEMP1
325	sll	KK, BASE_SHIFT + 3, TEMP2
326
327	add	AO, TEMP1, AO
328	add	B,  TEMP2, BO
329#endif
330
331	LDF	[AO +  0 * SIZE], a1
332	LDF	[AO +  1 * SIZE], a2
333	LDF	[AO +  8 * SIZE], a5
334
335	LDF	[BO +  0 * SIZE], b1
336
337	LDF	[BO +  1 * SIZE], b2
338	FCLR	(cc01)
339	LDF	[BO +  2 * SIZE], b3
340	FCLR	(cc05)
341	LDF	[BO +  3 * SIZE], b4
342	FCLR	(cc09)
343	LDF	[BO +  4 * SIZE], b5
344	FCLR	(cc13)
345
346	LDF	[BO +  5 * SIZE], b6
347	FCLR	(cc02)
348	LDF	[BO +  6 * SIZE], b7
349	FCLR	(cc06)
350	LDF	[BO +  7 * SIZE], b8
351	FCLR	(cc10)
352	LDF	[BO +  8 * SIZE], b9
353	FCLR	(cc14)
354
355	prefetch [C1 + 1 * SIZE], 3
356	FCLR	(cc03)
357	prefetch [C2 + 2 * SIZE], 3
358	FCLR	(cc07)
359	prefetch [C3 + 1 * SIZE], 3
360	FCLR	(cc11)
361	prefetch [C4 + 2 * SIZE], 3
362	FCLR	(cc15)
363
364	prefetch [C5 + 1 * SIZE], 3
365	FCLR	(cc04)
366	prefetch [C6 + 2 * SIZE], 3
367	FCLR	(cc08)
368	prefetch [C7 + 1 * SIZE], 3
369	FCLR	(cc12)
370	prefetch [C8 + 2 * SIZE], 3
371	FCLR	(cc16)
372
373#ifndef TRMMKERNEL
374	sra	K,  3, L
375#else
376#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
377	sub	K, KK, L
378#elif defined(LEFT)
379	add	KK, 2, L
380#else
381	add	KK, 8, L
382#endif
383	sra	L,  3, L
384#endif
385	cmp	L,  0
386	ble,pn	%icc, .LL15
387	add	 BB, 32 * SIZE, BB
388	.align 4
389
390.LL13:
391	FMADD	(aa1, bb1, cc01, cc01)
392	FMADD	(aa2, bb1, cc02, cc02)
393	FMADD	(aa1, bb2, cc03, cc03)
394	FMADD	(aa2, bb2, cc04, cc04)
395
396	FMADD	(aa1, bb3, cc05, cc05)
397	LDF	[BO + 16 * SIZE], b1
398	FMADD	(aa2, bb3, cc06, cc06)
399	LDF	[BO +  9 * SIZE], b2
400
401	FMADD	(aa1, bb4, cc07, cc07)
402	LDF	[BO + 10 * SIZE], b3
403	FMADD	(aa2, bb4, cc08, cc08)
404	LDF	[BO + 11 * SIZE], b4
405
406	FMADD	(aa1, bb5, cc09, cc09)
407	LDF	[AO +  2 * SIZE], a3
408	FMADD	(aa2, bb5, cc10, cc10)
409	LDF	[AO +  3 * SIZE], a4
410
411	FMADD	(aa1, bb6, cc11, cc11)
412	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
413	FMADD	(aa2, bb6, cc12, cc12)
414	nop
415
416	FMADD	(aa1, bb7, cc13, cc13)
417	LDF	[BO + 12 * SIZE], b5
418	FMADD	(aa2, bb7, cc14, cc14)
419	LDF	[BO + 13 * SIZE], b6
420
421	FMADD	(aa1, bb8, cc15, cc15)
422	LDF	[BO + 14 * SIZE], b7
423	FMADD	(aa2, bb8, cc16, cc16)
424	LDF	[BO + 15 * SIZE], b8
425
426	FMADD	(aa3, bb9, cc01, cc01)
427	FMADD	(aa4, bb9, cc02, cc02)
428	FMADD	(aa3, bb2, cc03, cc03)
429	FMADD	(aa4, bb2, cc04, cc04)
430
431	FMADD	(aa3, bb3, cc05, cc05)
432	LDF	[BO + 24 * SIZE], b9
433	FMADD	(aa4, bb3, cc06, cc06)
434	LDF	[BO + 17 * SIZE], b2
435
436	FMADD	(aa3, bb4, cc07, cc07)
437	LDF	[BO + 18 * SIZE], b3
438	FMADD	(aa4, bb4, cc08, cc08)
439	LDF	[BO + 19 * SIZE], b4
440
441	FMADD	(aa3, bb5, cc09, cc09)
442	LDF	[AO +  4 * SIZE], a1
443	FMADD	(aa4, bb5, cc10, cc10)
444	LDF	[AO +  5 * SIZE], a2
445
446	FMADD	(aa3, bb6, cc11, cc11)
447	add	L, -1, L
448	FMADD	(aa4, bb6, cc12, cc12)
449	nop
450
451	FMADD	(aa3, bb7, cc13, cc13)
452	LDF	[BO + 20 * SIZE], b5
453	FMADD	(aa4, bb7, cc14, cc14)
454	LDF	[BO + 21 * SIZE], b6
455
456	FMADD	(aa3, bb8, cc15, cc15)
457	LDF	[BO + 22 * SIZE], b7
458	FMADD	(aa4, bb8, cc16, cc16)
459	LDF	[BO + 23 * SIZE], b8
460
461	FMADD	(aa1, bb1, cc01, cc01)
462	FMADD	(aa2, bb1, cc02, cc02)
463	FMADD	(aa1, bb2, cc03, cc03)
464	FMADD	(aa2, bb2, cc04, cc04)
465
466	FMADD	(aa1, bb3, cc05, cc05)
467	LDF	[BO + 32 * SIZE], b1
468	FMADD	(aa2, bb3, cc06, cc06)
469	LDF	[BO + 25 * SIZE], b2
470
471	FMADD	(aa1, bb4, cc07, cc07)
472	LDF	[BO + 26 * SIZE], b3
473	FMADD	(aa2, bb4, cc08, cc08)
474	LDF	[BO + 27 * SIZE], b4
475
476	FMADD	(aa1, bb5, cc09, cc09)
477	LDF	[AO +  6 * SIZE], a3
478	FMADD	(aa2, bb5, cc10, cc10)
479	LDF	[AO +  7 * SIZE], a4
480
481	FMADD	(aa1, bb6, cc11, cc11)
482	nop
483	FMADD	(aa2, bb6, cc12, cc12)
484	nop
485
486	FMADD	(aa1, bb7, cc13, cc13)
487	LDF	[BO + 28 * SIZE], b5
488	FMADD	(aa2, bb7, cc14, cc14)
489	LDF	[BO + 29 * SIZE], b6
490
491	FMADD	(aa1, bb8, cc15, cc15)
492	LDF	[BO + 30 * SIZE], b7
493	FMADD	(aa2, bb8, cc16, cc16)
494	LDF	[BO + 31 * SIZE], b8
495
496	FMADD	(aa3, bb9, cc01, cc01)
497	FMADD	(aa4, bb9, cc02, cc02)
498	FMADD	(aa3, bb2, cc03, cc03)
499	FMADD	(aa4, bb2, cc04, cc04)
500
501	FMADD	(aa3, bb3, cc05, cc05)
502	LDF	[BO + 40 * SIZE], b9
503	FMADD	(aa4, bb3, cc06, cc06)
504	LDF	[BO + 33 * SIZE], b2
505
506	FMADD	(aa3, bb4, cc07, cc07)
507	LDF	[BO + 34 * SIZE], b3
508	FMADD	(aa4, bb4, cc08, cc08)
509	LDF	[BO + 35 * SIZE], b4
510
511	FMADD	(aa3, bb5, cc09, cc09)
512	LDF	[AO + 16 * SIZE], a1  /****/
513	FMADD	(aa4, bb5, cc10, cc10)
514	LDF	[AO +  9 * SIZE], a2
515
516	FMADD	(aa3, bb6, cc11, cc11)
517	nop
518	FMADD	(aa4, bb6, cc12, cc12)
519	nop
520
521	FMADD	(aa3, bb7, cc13, cc13)
522	LDF	[BO + 36 * SIZE], b5
523	FMADD	(aa4, bb7, cc14, cc14)
524	LDF	[BO + 37 * SIZE], b6
525
526	FMADD	(aa3, bb8, cc15, cc15)
527	LDF	[BO + 38 * SIZE], b7
528	FMADD	(aa4, bb8, cc16, cc16)
529	LDF	[BO + 39 * SIZE], b8
530
531	FMADD	(aa5, bb1, cc01, cc01)
532	FMADD	(aa2, bb1, cc02, cc02)
533	FMADD	(aa5, bb2, cc03, cc03)
534	FMADD	(aa2, bb2, cc04, cc04)
535
536	FMADD	(aa5, bb3, cc05, cc05)
537	LDF	[BO + 48 * SIZE], b1
538	FMADD	(aa2, bb3, cc06, cc06)
539	LDF	[BO + 41 * SIZE], b2
540
541	FMADD	(aa5, bb4, cc07, cc07)
542	LDF	[BO + 42 * SIZE], b3
543	FMADD	(aa2, bb4, cc08, cc08)
544	LDF	[BO + 43 * SIZE], b4
545
546	FMADD	(aa5, bb5, cc09, cc09)
547	LDF	[AO + 10 * SIZE], a3
548	FMADD	(aa2, bb5, cc10, cc10)
549	LDF	[AO + 11 * SIZE], a4
550
551	FMADD	(aa5, bb6, cc11, cc11)
552	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
553	FMADD	(aa2, bb6, cc12, cc12)
554	nop
555
556	FMADD	(aa5, bb7, cc13, cc13)
557	LDF	[BO + 44 * SIZE], b5
558	FMADD	(aa2, bb7, cc14, cc14)
559	LDF	[BO + 45 * SIZE], b6
560
561	FMADD	(aa5, bb8, cc15, cc15)
562	LDF	[BO + 46 * SIZE], b7
563	FMADD	(aa2, bb8, cc16, cc16)
564	LDF	[BO + 47 * SIZE], b8
565
566	FMADD	(aa3, bb9, cc01, cc01)
567	FMADD	(aa4, bb9, cc02, cc02)
568	FMADD	(aa3, bb2, cc03, cc03)
569	FMADD	(aa4, bb2, cc04, cc04)
570
571	FMADD	(aa3, bb3, cc05, cc05)
572	LDF	[BO + 56 * SIZE], b9
573	FMADD	(aa4, bb3, cc06, cc06)
574	LDF	[BO + 49 * SIZE], b2
575
576	FMADD	(aa3, bb4, cc07, cc07)
577	LDF	[BO + 50 * SIZE], b3
578	FMADD	(aa4, bb4, cc08, cc08)
579	LDF	[BO + 51 * SIZE], b4
580
581	FMADD	(aa3, bb5, cc09, cc09)
582	LDF	[AO + 12 * SIZE], a5
583	FMADD	(aa4, bb5, cc10, cc10)
584	LDF	[AO + 13 * SIZE], a2
585
586	FMADD	(aa3, bb6, cc11, cc11)
587	cmp	L, 0
588	FMADD	(aa4, bb6, cc12, cc12)
589	nop
590
591	FMADD	(aa3, bb7, cc13, cc13)
592	LDF	[BO + 52 * SIZE], b5
593	FMADD	(aa4, bb7, cc14, cc14)
594	LDF	[BO + 53 * SIZE], b6
595
596	FMADD	(aa3, bb8, cc15, cc15)
597	LDF	[BO + 54 * SIZE], b7
598	FMADD	(aa4, bb8, cc16, cc16)
599	LDF	[BO + 55 * SIZE], b8
600
601	FMADD	(aa5, bb1, cc01, cc01)
602	FMADD	(aa2, bb1, cc02, cc02)
603	FMADD	(aa5, bb2, cc03, cc03)
604	FMADD	(aa2, bb2, cc04, cc04)
605
606	FMADD	(aa5, bb3, cc05, cc05)
607	LDF	[BO + 64 * SIZE], b1
608	FMADD	(aa2, bb3, cc06, cc06)
609	LDF	[BO + 57 * SIZE], b2
610
611	FMADD	(aa5, bb4, cc07, cc07)
612	LDF	[BO + 58 * SIZE], b3
613	FMADD	(aa2, bb4, cc08, cc08)
614	LDF	[BO + 59 * SIZE], b4
615
616	FMADD	(aa5, bb5, cc09, cc09)
617	LDF	[AO + 14 * SIZE], a3
618	FMADD	(aa2, bb5, cc10, cc10)
619	LDF	[AO + 15 * SIZE], a4
620
621	FMADD	(aa5, bb6, cc11, cc11)
622	add	BO, 64 * SIZE, BO
623	FMADD	(aa2, bb6, cc12, cc12)
624	add	AO, 16 * SIZE, AO
625
626	FMADD	(aa5, bb7, cc13, cc13)
627	LDF	[BO -  4 * SIZE], b5
628	FMADD	(aa2, bb7, cc14, cc14)
629	LDF	[BO -  3 * SIZE], b6
630
631	FMADD	(aa5, bb8, cc15, cc15)
632	LDF	[BO -  2 * SIZE], b7
633	FMADD	(aa2, bb8, cc16, cc16)
634	LDF	[BO -  1 * SIZE], b8
635
636	FMADD	(aa3, bb9, cc01, cc01)
637	FMADD	(aa4, bb9, cc02, cc02)
638	FMADD	(aa3, bb2, cc03, cc03)
639	FMADD	(aa4, bb2, cc04, cc04)
640
641	FMADD	(aa3, bb3, cc05, cc05)
642	LDF	[BO +  8 * SIZE], b9
643	FMADD	(aa4, bb3, cc06, cc06)
644	LDF	[BO +  1 * SIZE], b2
645
646	FMADD	(aa3, bb4, cc07, cc07)
647	LDF	[BO +  2 * SIZE], b3
648	FMADD	(aa4, bb4, cc08, cc08)
649	LDF	[BO +  3 * SIZE], b4
650
651	FMADD	(aa3, bb5, cc09, cc09)
652	LDF	[AO +  8 * SIZE], a5  /****/
653	FMADD	(aa4, bb5, cc10, cc10)
654	LDF	[AO +  1 * SIZE], a2
655
656	FMADD	(aa3, bb6, cc11, cc11)
657	FMADD	(aa4, bb6, cc12, cc12)
658
659	FMADD	(aa3, bb7, cc13, cc13)
660	LDF	[BO +  4 * SIZE], b5
661	FMADD	(aa4, bb7, cc14, cc14)
662	LDF	[BO +  5 * SIZE], b6
663
664	FMADD	(aa3, bb8, cc15, cc15)
665	LDF	[BO +  6 * SIZE], b7
666	FMADD	(aa4, bb8, cc16, cc16)
667	ble,pn	%icc, .LL15
668	LDF	[BO +  7 * SIZE], b8
669
670	FMADD	(aa1, bb1, cc01, cc01)
671	FMADD	(aa2, bb1, cc02, cc02)
672	FMADD	(aa1, bb2, cc03, cc03)
673	FMADD	(aa2, bb2, cc04, cc04)
674
675	FMADD	(aa1, bb3, cc05, cc05)
676	LDF	[BO + 16 * SIZE], b1
677	FMADD	(aa2, bb3, cc06, cc06)
678	LDF	[BO +  9 * SIZE], b2
679
680	FMADD	(aa1, bb4, cc07, cc07)
681	LDF	[BO + 10 * SIZE], b3
682	FMADD	(aa2, bb4, cc08, cc08)
683	LDF	[BO + 11 * SIZE], b4
684
685	FMADD	(aa1, bb5, cc09, cc09)
686	LDF	[AO +  2 * SIZE], a3
687	FMADD	(aa2, bb5, cc10, cc10)
688	LDF	[AO +  3 * SIZE], a4
689
690	FMADD	(aa1, bb6, cc11, cc11)
691	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
692	FMADD	(aa2, bb6, cc12, cc12)
693	nop
694
695	FMADD	(aa1, bb7, cc13, cc13)
696	LDF	[BO + 12 * SIZE], b5
697	FMADD	(aa2, bb7, cc14, cc14)
698	LDF	[BO + 13 * SIZE], b6
699
700	FMADD	(aa1, bb8, cc15, cc15)
701	LDF	[BO + 14 * SIZE], b7
702	FMADD	(aa2, bb8, cc16, cc16)
703	LDF	[BO + 15 * SIZE], b8
704
705	FMADD	(aa3, bb9, cc01, cc01)
706	FMADD	(aa4, bb9, cc02, cc02)
707	FMADD	(aa3, bb2, cc03, cc03)
708	FMADD	(aa4, bb2, cc04, cc04)
709
710	FMADD	(aa3, bb3, cc05, cc05)
711	LDF	[BO + 24 * SIZE], b9
712	FMADD	(aa4, bb3, cc06, cc06)
713	LDF	[BO + 17 * SIZE], b2
714
715	FMADD	(aa3, bb4, cc07, cc07)
716	LDF	[BO + 18 * SIZE], b3
717	FMADD	(aa4, bb4, cc08, cc08)
718	LDF	[BO + 19 * SIZE], b4
719
720	FMADD	(aa3, bb5, cc09, cc09)
721	LDF	[AO +  4 * SIZE], a1
722	FMADD	(aa4, bb5, cc10, cc10)
723	LDF	[AO +  5 * SIZE], a2
724
725	FMADD	(aa3, bb6, cc11, cc11)
726	add	L, -1, L
727	FMADD	(aa4, bb6, cc12, cc12)
728	nop
729
730	FMADD	(aa3, bb7, cc13, cc13)
731	LDF	[BO + 20 * SIZE], b5
732	FMADD	(aa4, bb7, cc14, cc14)
733	LDF	[BO + 21 * SIZE], b6
734
735	FMADD	(aa3, bb8, cc15, cc15)
736	LDF	[BO + 22 * SIZE], b7
737	FMADD	(aa4, bb8, cc16, cc16)
738	LDF	[BO + 23 * SIZE], b8
739
740	FMADD	(aa1, bb1, cc01, cc01)
741	FMADD	(aa2, bb1, cc02, cc02)
742	FMADD	(aa1, bb2, cc03, cc03)
743	FMADD	(aa2, bb2, cc04, cc04)
744
745	FMADD	(aa1, bb3, cc05, cc05)
746	LDF	[BO + 32 * SIZE], b1
747	FMADD	(aa2, bb3, cc06, cc06)
748	LDF	[BO + 25 * SIZE], b2
749
750	FMADD	(aa1, bb4, cc07, cc07)
751	LDF	[BO + 26 * SIZE], b3
752	FMADD	(aa2, bb4, cc08, cc08)
753	LDF	[BO + 27 * SIZE], b4
754
755	FMADD	(aa1, bb5, cc09, cc09)
756	LDF	[AO +  6 * SIZE], a3
757	FMADD	(aa2, bb5, cc10, cc10)
758	LDF	[AO +  7 * SIZE], a4
759
760	FMADD	(aa1, bb6, cc11, cc11)
761	nop
762	FMADD	(aa2, bb6, cc12, cc12)
763	nop
764
765	FMADD	(aa1, bb7, cc13, cc13)
766	LDF	[BO + 28 * SIZE], b5
767	FMADD	(aa2, bb7, cc14, cc14)
768	LDF	[BO + 29 * SIZE], b6
769
770	FMADD	(aa1, bb8, cc15, cc15)
771	LDF	[BO + 30 * SIZE], b7
772	FMADD	(aa2, bb8, cc16, cc16)
773	LDF	[BO + 31 * SIZE], b8
774
775	FMADD	(aa3, bb9, cc01, cc01)
776	FMADD	(aa4, bb9, cc02, cc02)
777	FMADD	(aa3, bb2, cc03, cc03)
778	FMADD	(aa4, bb2, cc04, cc04)
779
780	FMADD	(aa3, bb3, cc05, cc05)
781	LDF	[BO + 40 * SIZE], b9
782	FMADD	(aa4, bb3, cc06, cc06)
783	LDF	[BO + 33 * SIZE], b2
784
785	FMADD	(aa3, bb4, cc07, cc07)
786	LDF	[BO + 34 * SIZE], b3
787	FMADD	(aa4, bb4, cc08, cc08)
788	LDF	[BO + 35 * SIZE], b4
789
790	FMADD	(aa3, bb5, cc09, cc09)
791	LDF	[AO + 16 * SIZE], a1  /****/
792	FMADD	(aa4, bb5, cc10, cc10)
793	LDF	[AO +  9 * SIZE], a2
794
795	FMADD	(aa3, bb6, cc11, cc11)
796	nop
797	FMADD	(aa4, bb6, cc12, cc12)
798	nop
799
800	FMADD	(aa3, bb7, cc13, cc13)
801	LDF	[BO + 36 * SIZE], b5
802	FMADD	(aa4, bb7, cc14, cc14)
803	LDF	[BO + 37 * SIZE], b6
804
805	FMADD	(aa3, bb8, cc15, cc15)
806	LDF	[BO + 38 * SIZE], b7
807	FMADD	(aa4, bb8, cc16, cc16)
808	LDF	[BO + 39 * SIZE], b8
809
810	FMADD	(aa5, bb1, cc01, cc01)
811	FMADD	(aa2, bb1, cc02, cc02)
812	FMADD	(aa5, bb2, cc03, cc03)
813	FMADD	(aa2, bb2, cc04, cc04)
814
815	FMADD	(aa5, bb3, cc05, cc05)
816	LDF	[BO + 48 * SIZE], b1
817	FMADD	(aa2, bb3, cc06, cc06)
818	LDF	[BO + 41 * SIZE], b2
819
820	FMADD	(aa5, bb4, cc07, cc07)
821	LDF	[BO + 42 * SIZE], b3
822	FMADD	(aa2, bb4, cc08, cc08)
823	LDF	[BO + 43 * SIZE], b4
824
825	FMADD	(aa5, bb5, cc09, cc09)
826	LDF	[AO + 10 * SIZE], a3
827	FMADD	(aa2, bb5, cc10, cc10)
828	LDF	[AO + 11 * SIZE], a4
829
830	FMADD	(aa5, bb6, cc11, cc11)
831	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
832	FMADD	(aa2, bb6, cc12, cc12)
833	nop
834
835	FMADD	(aa5, bb7, cc13, cc13)
836	LDF	[BO + 44 * SIZE], b5
837	FMADD	(aa2, bb7, cc14, cc14)
838	LDF	[BO + 45 * SIZE], b6
839
840	FMADD	(aa5, bb8, cc15, cc15)
841	LDF	[BO + 46 * SIZE], b7
842	FMADD	(aa2, bb8, cc16, cc16)
843	LDF	[BO + 47 * SIZE], b8
844
845	FMADD	(aa3, bb9, cc01, cc01)
846	FMADD	(aa4, bb9, cc02, cc02)
847	FMADD	(aa3, bb2, cc03, cc03)
848	FMADD	(aa4, bb2, cc04, cc04)
849
850	FMADD	(aa3, bb3, cc05, cc05)
851	LDF	[BO + 56 * SIZE], b9
852	FMADD	(aa4, bb3, cc06, cc06)
853	LDF	[BO + 49 * SIZE], b2
854
855	FMADD	(aa3, bb4, cc07, cc07)
856	LDF	[BO + 50 * SIZE], b3
857	FMADD	(aa4, bb4, cc08, cc08)
858	LDF	[BO + 51 * SIZE], b4
859
860	FMADD	(aa3, bb5, cc09, cc09)
861	LDF	[AO + 12 * SIZE], a5
862	FMADD	(aa4, bb5, cc10, cc10)
863	LDF	[AO + 13 * SIZE], a2
864
865	FMADD	(aa3, bb6, cc11, cc11)
866	cmp	L, 0
867	FMADD	(aa4, bb6, cc12, cc12)
868	nop
869
870	FMADD	(aa3, bb7, cc13, cc13)
871	LDF	[BO + 52 * SIZE], b5
872	FMADD	(aa4, bb7, cc14, cc14)
873	LDF	[BO + 53 * SIZE], b6
874
875	FMADD	(aa3, bb8, cc15, cc15)
876	LDF	[BO + 54 * SIZE], b7
877	FMADD	(aa4, bb8, cc16, cc16)
878	LDF	[BO + 55 * SIZE], b8
879
880	FMADD	(aa5, bb1, cc01, cc01)
881	FMADD	(aa2, bb1, cc02, cc02)
882	FMADD	(aa5, bb2, cc03, cc03)
883	FMADD	(aa2, bb2, cc04, cc04)
884
885	FMADD	(aa5, bb3, cc05, cc05)
886	LDF	[BO + 64 * SIZE], b1
887	FMADD	(aa2, bb3, cc06, cc06)
888	LDF	[BO + 57 * SIZE], b2
889
890	FMADD	(aa5, bb4, cc07, cc07)
891	LDF	[BO + 58 * SIZE], b3
892	FMADD	(aa2, bb4, cc08, cc08)
893	LDF	[BO + 59 * SIZE], b4
894
895	FMADD	(aa5, bb5, cc09, cc09)
896	LDF	[AO + 14 * SIZE], a3
897	FMADD	(aa2, bb5, cc10, cc10)
898	LDF	[AO + 15 * SIZE], a4
899
900	FMADD	(aa5, bb6, cc11, cc11)
901	add	BO, 64 * SIZE, BO
902	FMADD	(aa2, bb6, cc12, cc12)
903	add	AO, 16 * SIZE, AO
904
905	FMADD	(aa5, bb7, cc13, cc13)
906	LDF	[BO -  4 * SIZE], b5
907	FMADD	(aa2, bb7, cc14, cc14)
908	LDF	[BO -  3 * SIZE], b6
909
910	FMADD	(aa5, bb8, cc15, cc15)
911	LDF	[BO -  2 * SIZE], b7
912	FMADD	(aa2, bb8, cc16, cc16)
913	LDF	[BO -  1 * SIZE], b8
914
915	FMADD	(aa3, bb9, cc01, cc01)
916	FMADD	(aa4, bb9, cc02, cc02)
917	FMADD	(aa3, bb2, cc03, cc03)
918	FMADD	(aa4, bb2, cc04, cc04)
919
920	FMADD	(aa3, bb3, cc05, cc05)
921	LDF	[BO +  8 * SIZE], b9
922	FMADD	(aa4, bb3, cc06, cc06)
923	LDF	[BO +  1 * SIZE], b2
924
925	FMADD	(aa3, bb4, cc07, cc07)
926	LDF	[BO +  2 * SIZE], b3
927	FMADD	(aa4, bb4, cc08, cc08)
928	LDF	[BO +  3 * SIZE], b4
929
930	FMADD	(aa3, bb5, cc09, cc09)
931	LDF	[AO +  8 * SIZE], a5  /****/
932	FMADD	(aa4, bb5, cc10, cc10)
933	LDF	[AO +  1 * SIZE], a2
934
935	FMADD	(aa3, bb6, cc11, cc11)
936	FMADD	(aa4, bb6, cc12, cc12)
937
938	FMADD	(aa3, bb7, cc13, cc13)
939	LDF	[BO +  4 * SIZE], b5
940	FMADD	(aa4, bb7, cc14, cc14)
941	LDF	[BO +  5 * SIZE], b6
942
943	FMADD	(aa3, bb8, cc15, cc15)
944	LDF	[BO +  6 * SIZE], b7
945	FMADD	(aa4, bb8, cc16, cc16)
946	bg,pt	%icc, .LL13
947	LDF	[BO +  7 * SIZE], b8
948	.align 4
949
950.LL15:
951#ifndef TRMMKERNEL
952	and	K,  7, L
953#else
954#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
955	sub	K, KK, L
956#elif defined(LEFT)
957	add	KK, 2, L
958#else
959	add	KK, 8, L
960#endif
961	and	L,  7, L
962#endif
963	cmp	L,  0
964	ble,a,pn %icc, .LL18
965	nop
966	.align 4
967
968.LL17:
969	FMADD	(aa1, bb1, cc01, cc01)
970	add	L, -1, L
971	FMADD	(aa2, bb1, cc02, cc02)
972	nop
973
974	FMADD	(aa1, bb2, cc03, cc03)
975	LDF	[BO +  8 * SIZE], b1
976	FMADD	(aa2, bb2, cc04, cc04)
977	LDF	[BO +  9 * SIZE], b2
978
979	FMADD	(aa1, bb3, cc05, cc05)
980	cmp	L, 0
981	FMADD	(aa2, bb3, cc06, cc06)
982	nop
983
984	FMADD	(aa1, bb4, cc07, cc07)
985	LDF	[BO + 10 * SIZE], b3
986	FMADD	(aa2, bb4, cc08, cc08)
987	LDF	[BO + 11 * SIZE], b4
988
989	FMADD	(aa1, bb5, cc09, cc09)
990	nop
991	FMADD	(aa2, bb5, cc10, cc10)
992	nop
993
994	FMADD	(aa1, bb6, cc11, cc11)
995	LDF	[BO + 12 * SIZE], b5
996	FMADD	(aa2, bb6, cc12, cc12)
997	LDF	[BO + 13 * SIZE], b6
998
999	FMADD	(aa1, bb7, cc13, cc13)
1000	add	AO, 2 * SIZE, AO
1001	FMADD	(aa2, bb7, cc14, cc14)
1002	add	BO, 8 * SIZE, BO
1003
1004	FMADD	(aa1, bb8, cc15, cc15)
1005	LDF	[AO +  0 * SIZE], a1
1006	FMADD	(aa2, bb8, cc16, cc16)
1007	LDF	[AO +  1 * SIZE], a2
1008
1009	LDF	[BO +  6 * SIZE], b7
1010	bg,pt	%icc, .LL17
1011	LDF	[BO +  7 * SIZE], b8
1012	nop
1013	.align 4
1014
1015.LL18:
1016#ifndef TRMMKERNEL
1017	LDF	[C1 + 0 * SIZE], a1
1018	LDF	[C1 + 1 * SIZE], a2
1019	LDF	[C2 + 0 * SIZE], a3
1020	LDF	[C2 + 1 * SIZE], a4
1021
1022	LDF	[C3 + 0 * SIZE], b1
1023	LDF	[C3 + 1 * SIZE], b2
1024	LDF	[C4 + 0 * SIZE], b3
1025	LDF	[C4 + 1 * SIZE], b4
1026
1027	FMADD	(alpha, cc01, aa1, cc01)
1028	LDF	[C5 + 0 * SIZE], a1
1029	FMADD	(alpha, cc02, aa2, cc02)
1030	LDF	[C5 + 1 * SIZE], a2
1031	FMADD	(alpha, cc03, aa3, cc03)
1032	LDF	[C6 + 0 * SIZE], a3
1033	FMADD	(alpha, cc04, aa4, cc04)
1034	LDF	[C6 + 1 * SIZE], a4
1035
1036	FMADD	(alpha, cc05, bb1, cc05)
1037	LDF	[C7 + 0 * SIZE], b1
1038	FMADD	(alpha, cc06, bb2, cc06)
1039	LDF	[C7 + 1 * SIZE], b2
1040	FMADD	(alpha, cc07, bb3, cc07)
1041	LDF	[C8 + 0 * SIZE], b3
1042	FMADD	(alpha, cc08, bb4, cc08)
1043	LDF	[C8 + 1 * SIZE], b4
1044
1045	FMADD	(alpha, cc09, aa1, cc09)
1046	STF	c01, [C1 + 0 * SIZE]
1047	FMADD	(alpha, cc10, aa2, cc10)
1048	STF	c02, [C1 + 1 * SIZE]
1049	FMADD	(alpha, cc11, aa3, cc11)
1050	STF	c03, [C2 + 0 * SIZE]
1051	FMADD	(alpha, cc12, aa4, cc12)
1052	STF	c04, [C2 + 1 * SIZE]
1053
1054	FMADD	(alpha, cc13, bb1, cc13)
1055	STF	c05, [C3 + 0 * SIZE]
1056	FMADD	(alpha, cc14, bb2, cc14)
1057	STF	c06, [C3 + 1 * SIZE]
1058	FMADD	(alpha, cc15, bb3, cc15)
1059	STF	c07, [C4 + 0 * SIZE]
1060	FMADD	(alpha, cc16, bb4, cc16)
1061	STF	c08, [C4 + 1 * SIZE]
1062
1063#else
1064	FMUL	ALPHA, c01, c01
1065	FMUL	ALPHA, c02, c02
1066	FMUL	ALPHA, c03, c03
1067	FMUL	ALPHA, c04, c04
1068
1069	FMUL	ALPHA, c05, c05
1070	FMUL	ALPHA, c06, c06
1071	FMUL	ALPHA, c07, c07
1072	FMUL	ALPHA, c08, c08
1073
1074	FMUL	ALPHA, c09, c09
1075	STF	c01, [C1 + 0 * SIZE]
1076	FMUL	ALPHA, c10, c10
1077	STF	c02, [C1 + 1 * SIZE]
1078	FMUL	ALPHA, c11, c11
1079	STF	c03, [C2 + 0 * SIZE]
1080	FMUL	ALPHA, c12, c12
1081	STF	c04, [C2 + 1 * SIZE]
1082
1083	FMUL	ALPHA, c13, c13
1084	STF	c05, [C3 + 0 * SIZE]
1085	FMUL	ALPHA, c14, c14
1086	STF	c06, [C3 + 1 * SIZE]
1087	FMUL	ALPHA, c15, c15
1088	STF	c07, [C4 + 0 * SIZE]
1089	FMUL	ALPHA, c16, c16
1090	STF	c08, [C4 + 1 * SIZE]
1091#endif
1092
1093	STF	c09, [C5 + 0 * SIZE]
1094	add	C1, 2 * SIZE, C1
1095	STF	c10, [C5 + 1 * SIZE]
1096	add	C2, 2 * SIZE, C2
1097	STF	c11, [C6 + 0 * SIZE]
1098	add	C3, 2 * SIZE, C3
1099	STF	c12, [C6 + 1 * SIZE]
1100	add	C4, 2 * SIZE, C4
1101
1102	STF	c13, [C7 + 0 * SIZE]
1103	add	C5, 2 * SIZE, C5
1104	STF	c14, [C7 + 1 * SIZE]
1105	add	C6, 2 * SIZE, C6
1106	STF	c15, [C8 + 0 * SIZE]
1107	add	C7, 2 * SIZE, C7
1108	STF	c16, [C8 + 1 * SIZE]
1109	add	C8, 2 * SIZE, C8
1110
1111#ifdef TRMMKERNEL
1112#if ( defined(LEFT) &&  defined(TRANSA)) || \
1113    (!defined(LEFT) && !defined(TRANSA))
1114	sub	K, KK, TEMP1
1115#ifdef LEFT
1116	add	TEMP1, -2, TEMP1
1117#else
1118	add	TEMP1, -8, TEMP1
1119#endif
1120	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1121	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1122
1123	add	AO, TEMP2, AO
1124	add	BO, TEMP1, BO
1125#endif
1126
1127#ifdef LEFT
1128	add	KK, 2, KK
1129#endif
1130#endif
1131
1132	add	I, -1, I
1133	cmp	I, 0
1134	bg,pt	%icc, .LL12
1135	nop
1136	.align 4
1137
1138.LL20:
1139	and	M, 1, I
1140	cmp	I, 0
1141	ble,pn	%icc, .LL29
1142	nop
1143
1144#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
1145	mov	B, BO
1146#else
1147	sll	KK, BASE_SHIFT + 0, TEMP1
1148	sll	KK, BASE_SHIFT + 3, TEMP2
1149
1150	add	AO, TEMP1, AO
1151	add	B,  TEMP2, BO
1152#endif
1153
1154	LDF	[AO +  0 * SIZE], a1
1155	LDF	[AO +  1 * SIZE], a2
1156	LDF	[AO +  2 * SIZE], a3
1157	LDF	[AO +  3 * SIZE], a4
1158
1159	LDF	[BO +  0 * SIZE], b1
1160	FCLR	(cc01)
1161	LDF	[BO +  1 * SIZE], b2
1162	FCLR	(cc03)
1163	LDF	[BO +  2 * SIZE], b3
1164	FCLR	(cc05)
1165	LDF	[BO +  3 * SIZE], b4
1166	FCLR	(cc07)
1167	LDF	[BO +  4 * SIZE], b5
1168	FCLR	(cc09)
1169	LDF	[BO +  5 * SIZE], b6
1170	FCLR	(cc11)
1171	LDF	[BO +  6 * SIZE], b7
1172	FCLR	(cc13)
1173	LDF	[BO +  7 * SIZE], b8
1174	FCLR	(cc15)
1175
1176#ifndef TRMMKERNEL
1177	sra	K,  2, L
1178#else
1179#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1180	sub	K, KK, L
1181#elif defined(LEFT)
1182	add	KK, 1, L
1183#else
1184	add	KK, 8, L
1185#endif
1186	sra	L,  2, L
1187#endif
1188	cmp	L,  0
1189	ble,pn	%icc, .LL25
1190	LDF	[BO +  8 * SIZE], b9
1191	.align 4
1192
1193.LL23:
1194	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1195	add	L, -1, L
1196
1197	FMADD	(aa1, bb1, cc01, cc01)
1198	LDF	[BO + 16 * SIZE], b1
1199	FMADD	(aa1, bb2, cc03, cc03)
1200	LDF	[BO +  9 * SIZE], b2
1201
1202	FMADD	(aa1, bb3, cc05, cc05)
1203	LDF	[BO + 10 * SIZE], b3
1204	FMADD	(aa1, bb4, cc07, cc07)
1205	LDF	[BO + 11 * SIZE], b4
1206
1207	FMADD	(aa1, bb5, cc09, cc09)
1208	LDF	[BO + 12 * SIZE], b5
1209	FMADD	(aa1, bb6, cc11, cc11)
1210	LDF	[BO + 13 * SIZE], b6
1211
1212	FMADD	(aa1, bb7, cc13, cc13)
1213	LDF	[BO + 14 * SIZE], b7
1214	FMADD	(aa1, bb8, cc15, cc15)
1215	LDF	[BO + 15 * SIZE], b8
1216
1217	FMADD	(aa2, bb9, cc01, cc01)
1218	LDF	[BO + 24 * SIZE], b9
1219	FMADD	(aa2, bb2, cc03, cc03)
1220	LDF	[BO + 17 * SIZE], b2
1221
1222	FMADD	(aa2, bb3, cc05, cc05)
1223	LDF	[BO + 18 * SIZE], b3
1224	FMADD	(aa2, bb4, cc07, cc07)
1225	LDF	[BO + 19 * SIZE], b4
1226
1227	FMADD	(aa2, bb5, cc09, cc09)
1228	LDF	[BO + 20 * SIZE], b5
1229	FMADD	(aa2, bb6, cc11, cc11)
1230	LDF	[BO + 21 * SIZE], b6
1231
1232	FMADD	(aa2, bb7, cc13, cc13)
1233	LDF	[BO + 22 * SIZE], b7
1234	FMADD	(aa2, bb8, cc15, cc15)
1235	LDF	[BO + 23 * SIZE], b8
1236
1237	LDF	[AO +  4 * SIZE], a1
1238	LDF	[AO +  5 * SIZE], a2
1239
1240	FMADD	(aa3, bb1, cc01, cc01)
1241	LDF	[BO + 32 * SIZE], b1
1242	FMADD	(aa3, bb2, cc03, cc03)
1243	LDF	[BO + 25 * SIZE], b2
1244
1245	FMADD	(aa3, bb3, cc05, cc05)
1246	LDF	[BO + 26 * SIZE], b3
1247	FMADD	(aa3, bb4, cc07, cc07)
1248	LDF	[BO + 27 * SIZE], b4
1249
1250	FMADD	(aa3, bb5, cc09, cc09)
1251	LDF	[BO + 28 * SIZE], b5
1252	FMADD	(aa3, bb6, cc11, cc11)
1253	LDF	[BO + 29 * SIZE], b6
1254
1255	FMADD	(aa3, bb7, cc13, cc13)
1256	LDF	[BO + 30 * SIZE], b7
1257	FMADD	(aa3, bb8, cc15, cc15)
1258	LDF	[BO + 31 * SIZE], b8
1259
1260	FMADD	(aa4, bb9, cc01, cc01)
1261	LDF	[BO + 40 * SIZE], b9
1262	FMADD	(aa4, bb2, cc03, cc03)
1263	LDF	[BO + 33 * SIZE], b2
1264
1265	FMADD	(aa4, bb3, cc05, cc05)
1266	LDF	[BO + 34 * SIZE], b3
1267	FMADD	(aa4, bb4, cc07, cc07)
1268	LDF	[BO + 35 * SIZE], b4
1269
1270	FMADD	(aa4, bb5, cc09, cc09)
1271	LDF	[BO + 36 * SIZE], b5
1272	FMADD	(aa4, bb6, cc11, cc11)
1273	LDF	[BO + 37 * SIZE], b6
1274
1275	FMADD	(aa4, bb7, cc13, cc13)
1276	LDF	[BO + 38 * SIZE], b7
1277	FMADD	(aa4, bb8, cc15, cc15)
1278	LDF	[BO + 39 * SIZE], b8
1279
1280	LDF	[AO +  6 * SIZE], a3
1281	LDF	[AO +  7 * SIZE], a4
1282
1283	add	AO,  4 * SIZE, AO
1284	cmp	L, 0
1285	bg,pt	%icc, .LL23
1286	add	BO, 32 * SIZE, BO
1287	.align 4
1288
1289.LL25:
1290#ifndef TRMMKERNEL
1291	and	K,  3, L
1292#else
1293#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1294	sub	K, KK, L
1295#elif defined(LEFT)
1296	add	KK, 1, L
1297#else
1298	add	KK, 8, L
1299#endif
1300	and	L,  3, L
1301#endif
1302	cmp	L,  0
1303	ble,a,pn %icc, .LL28
1304	nop
1305	.align 4
1306
.LL27:
/* Remainder loop of the 1-row x 8-column tile: one k step per pass.       */
/* aa1 is combined with bb1..bb8 into cc01, cc03, ..., cc15 while the next */
/* eight B values are loaded from BO + 8..15 and the next A value from     */
/* AO + 1.  FMADD is a macro defined outside this chunk; its use here      */
/* looks like (x, y, z, w) -> w = x * y + z -- confirm at the definition.  */
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO +  8 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  9 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 11 * SIZE], b4

	FMADD	(aa1, bb5, cc09, cc09)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa1, bb6, cc11, cc11)
	LDF	[BO + 13 * SIZE], b6

	FMADD	(aa1, bb7, cc13, cc13)
	LDF	[BO + 14 * SIZE], b7
	FMADD	(aa1, bb8, cc15, cc15)
	LDF	[BO + 15 * SIZE], b8

	LDF	[AO +  1 * SIZE], a1
	add	AO, 1 * SIZE, AO

	add	L, -1, L
	cmp	L, 0
/* The BO advance below sits in the branch delay slot, so it executes on */
/* every pass, including the final one that falls through.               */
	bg,pt	%icc, .LL27
	add	BO, 8 * SIZE, BO
	.align 4
1336
.LL28:
/* Store the 1 x 8 tile held in c01, c03, ..., c15 through C1..C8. */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C values and combine them with alpha */
/* and the accumulators via FMADD before storing.                        */
	LDF	[C1 + 0 * SIZE], a1
	LDF	[C2 + 0 * SIZE], a2
	LDF	[C3 + 0 * SIZE], a3
	LDF	[C4 + 0 * SIZE], a4

	FMADD	(alpha, cc01, aa1, cc01)
	LDF	[C5 + 0 * SIZE], b1
	FMADD	(alpha, cc03, aa2, cc03)
	LDF	[C6 + 0 * SIZE], b2

	FMADD	(alpha, cc05, aa3, cc05)
	LDF	[C7 + 0 * SIZE], b3
	FMADD	(alpha, cc07, aa4, cc07)
	LDF	[C8 + 0 * SIZE], b4

	FMADD	(alpha, cc09, bb1, cc09)
	STF	c01, [C1 + 0 * SIZE]
	FMADD	(alpha, cc11, bb2, cc11)
	STF	c03, [C2 + 0 * SIZE]
	FMADD	(alpha, cc13, bb3, cc13)
	STF	c05, [C3 + 0 * SIZE]
	FMADD	(alpha, cc15, bb4, cc15)
	STF	c07, [C4 + 0 * SIZE]
#else
/* TRMM build: C is not read; the accumulators are only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
	FMUL	ALPHA, c03, c03
	FMUL	ALPHA, c05, c05
	FMUL	ALPHA, c07, c07

	FMUL	ALPHA, c09, c09
	STF	c01, [C1 + 0 * SIZE]
	FMUL	ALPHA, c11, c11
	STF	c03, [C2 + 0 * SIZE]

	FMUL	ALPHA, c13, c13
	STF	c05, [C3 + 0 * SIZE]
	FMUL	ALPHA, c15, c15
	STF	c07, [C4 + 0 * SIZE]
#endif

/* Second half of the tile (C5..C8) is stored in both builds. */
	STF	c09, [C5 + 0 * SIZE]
	STF	c11, [C6 + 0 * SIZE]
	STF	c13, [C7 + 0 * SIZE]
	STF	c15, [C8 + 0 * SIZE]

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
/* TEMP1 = K - KK - (1 if LEFT, else 8).  AO advances by                 */
/* TEMP1 << BASE_SHIFT and BO by TEMP1 << (BASE_SHIFT + 3).              */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -8, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 0, TEMP2
	sll	TEMP1, BASE_SHIFT + 3, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif
	.align 4
1405
.LL29:
/* Bottom of the J loop that branches back to .LL11 (label above this */
/* chunk).  TRMM without LEFT advances KK by 8 per pass.              */
#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 8, KK
#endif

	add	J, -1, J
	cmp	J, 0
/* Delay slot: B = BO is executed whether or not the branch is taken. */
	bg,pt	%icc, .LL11
	mov	BO, B
	.align 4
1416
.LL30:
/* Four-column pass: taken only when bit 2 of N is set (N & 4). */
	and	N, 4, J
	cmp	J, 0
/* Delay slot (not annulled): C1 = C is set on both paths. */
	ble,pn	%icc, .LL50
	mov	C,  C1

/* C2..C4 follow C1 at steps of LDC; C itself moves on by 4 * LDC. */
	add	C,  LDC, C2
	add	C2, LDC, C3
	add	C3, LDC, C4
	add	C4, LDC, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

/* I = M >> 1 counts the two-row tiles; with none, go to the odd-row */
/* code at .LL40.  Delay slot: AO = A on both paths.                 */
	sra	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL40
	mov	A, AO
	.align 4
1437
.LL32:
/* Set-up for one 2-row x 4-column tile.  Either BO restarts at B, or    */
/* (remaining TRMM variants) AO and BO are offset by KK scaled by the    */
/* tile height (shift BASE_SHIFT + 1) and width (shift BASE_SHIFT + 2).  */
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT + 2, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

/* Preload the operands consumed at the top of loop .LL33. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5

/* FCLR is a macro defined outside this chunk; presumably it zeroes the */
/* accumulator -- confirm at the definition.                            */
	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc01)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc02)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc03)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc04)

	prefetch [C1 + 2 * SIZE], 3
	FCLR	(cc05)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc06)
	prefetch [C3 + 2 * SIZE], 3
	FCLR	(cc07)
	prefetch [C4 + 2 * SIZE], 3
	FCLR	(cc08)

/* L = (k count) >> 2: number of passes of the 4-step loop .LL33. */
#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 4, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL35
	nop
	.align 4
1492
.LL33:
/* Main loop of the 2 x 4 tile.  Each pass performs four groups of eight */
/* FMADDs (a pair of A values against four B values) and advances AO by  */
/* 8 * SIZE and BO by 16 * SIZE.  Loads for later groups are interleaved */
/* with the arithmetic, so instruction order matters.                    */
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO +  2 * SIZE], a3
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[AO +  3 * SIZE], a4

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 16 * SIZE], b1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO +  9 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD	(aa2, bb3, cc06, cc06)
	add	L, -1, L

	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD	(aa3, bb5, cc01, cc01)
	LDF	[AO +  4 * SIZE], a1
	FMADD	(aa4, bb5, cc02, cc02)
	LDF	[AO +  5 * SIZE], a2

	FMADD	(aa3, bb6, cc03, cc03)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa4, bb6, cc04, cc04)
	LDF	[BO + 13 * SIZE], b6

/* The condition codes set by this cmp are consumed by the bg at the */
/* bottom of the loop; nothing in between writes them.               */
	FMADD	(aa3, bb7, cc05, cc05)
	cmp	L, 0
	FMADD	(aa4, bb7, cc06, cc06)
	add	AO,  8 * SIZE, AO

	FMADD	(aa3, bb8, cc07, cc07)
	LDF	[BO + 14 * SIZE], b7
	FMADD	(aa4, bb8, cc08, cc08)
	LDF	[BO + 15 * SIZE], b8

/* AO was already advanced, so AO - 2 and AO - 1 are the old AO + 6 and */
/* AO + 7.                                                              */
	FMADD	(aa1, bb9, cc01, cc01)
	LDF	[AO -  2 * SIZE], a3
	FMADD	(aa2, bb9, cc02, cc02)
	LDF	[AO -  1 * SIZE], a4

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 24 * SIZE], b9
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO + 17 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	add	BO, 16 * SIZE, BO
	FMADD	(aa2, bb3, cc06, cc06)
	nop

/* From here on the loads use offsets relative to the advanced AO and BO */
/* and fetch the operands for the top of the next pass.                  */
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO +  2 * SIZE], b3
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[BO +  3 * SIZE], b4

	FMADD	(aa3, bb5, cc01, cc01)
	LDF	[AO +  0 * SIZE], a1
	FMADD	(aa4, bb5, cc02, cc02)
	LDF	[AO +  1 * SIZE], a2
	FMADD	(aa3, bb6, cc03, cc03)
	LDF	[BO +  4 * SIZE], b5
	FMADD	(aa4, bb6, cc04, cc04)
	LDF	[BO +  5 * SIZE], b6

	FMADD	(aa3, bb7, cc05, cc05)
	nop
	FMADD	(aa4, bb7, cc06, cc06)
	LDF	[BO +  6 * SIZE], b7

	FMADD	(aa3, bb8, cc07, cc07)
	FMADD	(aa4, bb8, cc08, cc08)
/* Delay slot: the b8 reload runs on every pass, taken or not. */
	bg,pt	%icc, .LL33
	LDF	[BO +  7 * SIZE], b8
	.align 4
1573
.LL35:
/* Trip count for the remainder loop .LL37 of the 2 x 4 tile:     */
/* L = (K, or the TRMM-adjusted count built from K and KK) & 3.   */
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 4, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
/* No leftover steps: go straight to the store code at .LL38. */
	ble,a,pn %icc, .LL38
	nop
	.align 4
1591
.LL37:
/* Remainder loop of the 2 x 4 tile: one k step per pass.  aa1 and aa2 */
/* are each combined with bb1..bb4; AO advances by 2 * SIZE and BO by  */
/* 4 * SIZE.                                                           */

	FMADD	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[BO + 4 * SIZE], b1

	FMADD	(aa1, bb2, cc03, cc03)
	add	AO, 2 * SIZE, AO
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO + 5 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	cmp	L, 0
	FMADD	(aa2, bb3, cc06, cc06)
	LDF	[BO + 6 * SIZE], b3

/* AO was advanced above, so offsets 0 and 1 address the next A pair. */
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[AO + 0 * SIZE], a1
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[AO + 1 * SIZE], a2

/* BO is advanced in the delay slot, after b4 was loaded from BO + 7. */
	LDF	[BO + 7 * SIZE], b4
	bg,pt	%icc, .LL37
	add	BO, 4 * SIZE, BO
	.align 4
1618
.LL38:
/* Store the 2 x 4 tile: c01/c02 through C1, c03/c04 through C2, */
/* c05/c06 through C3 and c07/c08 through C4.                    */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C values and combine them with alpha */
/* and the accumulators via FMADD.                                       */
	LDF	[C1 + 0 * SIZE], a1
	LDF	[C1 + 1 * SIZE], a2
	LDF	[C2 + 0 * SIZE], a3
	LDF	[C2 + 1 * SIZE], a4

	FMADD	(alpha, cc01, aa1, cc01)
	LDF	[C3 + 0 * SIZE], b1
	FMADD	(alpha, cc02, aa2, cc02)
	LDF	[C3 + 1 * SIZE], b2
	FMADD	(alpha, cc03, aa3, cc03)
	LDF	[C4 + 0 * SIZE], b3
	FMADD	(alpha, cc04, aa4, cc04)
	LDF	[C4 + 1 * SIZE], b4

	FMADD	(alpha, cc05, bb1, cc05)
	STF	c01, [C1 + 0 * SIZE]
	FMADD	(alpha, cc06, bb2, cc06)
	STF	c02, [C1 + 1 * SIZE]
	FMADD	(alpha, cc07, bb3, cc07)
	STF	c03, [C2 + 0 * SIZE]
	FMADD	(alpha, cc08, bb4, cc08)
	STF	c04, [C2 + 1 * SIZE]
#else

/* TRMM build: C is not read; the accumulators are only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
	FMUL	ALPHA, c02, c02
	FMUL	ALPHA, c03, c03
	FMUL	ALPHA, c04, c04

	FMUL	ALPHA, c05, c05
	STF	c01, [C1 + 0 * SIZE]
	FMUL	ALPHA, c06, c06
	STF	c02, [C1 + 1 * SIZE]
	FMUL	ALPHA, c07, c07
	STF	c03, [C2 + 0 * SIZE]
	FMUL	ALPHA, c08, c08
	STF	c04, [C2 + 1 * SIZE]
#endif

/* Remaining stores, interleaved with moving C1..C4 on by two elements. */
	STF	c05, [C3 + 0 * SIZE]
	add	C1, 2 * SIZE, C1
	STF	c06, [C3 + 1 * SIZE]
	add	C2, 2 * SIZE, C2
	STF	c07, [C4 + 0 * SIZE]
	add	C3, 2 * SIZE, C3
	STF	c08, [C4 + 1 * SIZE]
	add	C4, 2 * SIZE, C4

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
/* TEMP1 = K - KK - (2 if LEFT, else 4).  AO advances by                 */
/* TEMP1 << (BASE_SHIFT + 1) and BO by TEMP1 << (BASE_SHIFT + 2).        */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -4, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 2, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 2, KK
#endif
#endif

/* Next two-row tile of this four-column pass, while I > 0. */
	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL32
	nop
1694
.LL40:
/* Odd-row case of the four-column pass: only when M & 1 is set; */
/* otherwise continue at .LL49.                                  */
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL49
	nop

/* Either BO restarts at B, or (remaining TRMM variants) AO and BO are */
/* offset by KK << BASE_SHIFT and KK << (BASE_SHIFT + 2).              */
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, BASE_SHIFT + 0, TEMP1
	sll	KK, BASE_SHIFT + 2, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

/* Preload four A values and nine B values for loop .LL43. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5
	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc01)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc03)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc05)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc07)

/* L = (k count) >> 2: number of passes of the 4-step loop .LL43. */
#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 4, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL45
	nop
1745
.LL43:
/* Main loop of the 1 x 4 tile: four k steps per pass, using aa1..aa4 in */
/* turn against four B values each.  AO advances by 4 * SIZE and BO by   */
/* 16 * SIZE per pass.                                                   */
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO + 16 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  9 * SIZE], b2
	FMADD	(aa1, bb3, cc05, cc05)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 11 * SIZE], b4

/* This cmp feeds the bg at the bottom of the loop. */
	LDF	[AO +  4 * SIZE], a1
	cmp	L, 0

	FMADD	(aa2, bb5, cc01, cc01)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa2, bb6, cc03, cc03)
	LDF	[BO + 13 * SIZE], b6
	FMADD	(aa2, bb7, cc05, cc05)
	LDF	[BO + 14 * SIZE], b7
	FMADD	(aa2, bb8, cc07, cc07)
	LDF	[BO + 15 * SIZE], b8

	LDF	[AO +  5 * SIZE], a2
	add	AO,  4 * SIZE, AO

	FMADD	(aa3, bb9, cc01, cc01)
	LDF	[BO + 24 * SIZE], b9
	FMADD	(aa3, bb2, cc03, cc03)
	LDF	[BO + 17 * SIZE], b2
	FMADD	(aa3, bb3, cc05, cc05)
	LDF	[BO + 18 * SIZE], b3
	FMADD	(aa3, bb4, cc07, cc07)
	LDF	[BO + 19 * SIZE], b4

/* AO and BO offsets below are relative to the already advanced pointers. */
	LDF	[AO +  2 * SIZE], a3
	add	BO, 16 * SIZE, BO

	FMADD	(aa4, bb5, cc01, cc01)
	LDF	[BO +  4 * SIZE], b5
	FMADD	(aa4, bb6, cc03, cc03)
	LDF	[BO +  5 * SIZE], b6
	FMADD	(aa4, bb7, cc05, cc05)
	LDF	[BO +  6 * SIZE], b7
	FMADD	(aa4, bb8, cc07, cc07)
	LDF	[BO +  7 * SIZE], b8

/* Delay slot: the a4 reload runs on every pass, taken or not. */
	bg,pt	%icc, .LL43
	LDF	[AO +  3 * SIZE], a4
	.align 4
1798
.LL45:
/* Trip count for the remainder loop .LL47 of the 1 x 4 tile:     */
/* L = (K, or the TRMM-adjusted count built from K and KK) & 3.   */
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 4, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
/* No leftover steps: go straight to the store code at .LL48. */
	ble,a,pn %icc, .LL48
	nop
	.align 4
1816
.LL47:
/* Remainder loop of the 1 x 4 tile: one k step per pass.  aa1 is     */
/* combined with bb1..bb4; AO advances by 1 * SIZE and BO by 4 * SIZE. */
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO + 4 * SIZE], b1
	add	L, -1, L
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 5 * SIZE], b2
	add	AO, 1 * SIZE, AO

	FMADD	(aa1, bb3, cc05, cc05)
	LDF	[BO + 6 * SIZE], b3
	cmp	L, 0
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 7 * SIZE], b4
	add	BO, 4 * SIZE, BO

/* Delay slot: a1 is reloaded from the advanced AO on every pass. */
	bg,pt	%icc, .LL47
	LDF	[AO + 0 * SIZE], a1
	.align 4
1835
.LL48:
/* Store the 1 x 4 tile held in c01, c03, c05, c07 through C1..C4. */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C values and combine them with alpha */
/* and the accumulators via FMADD.                                       */
	LDF	[C1 + 0 * SIZE], a1
	LDF	[C2 + 0 * SIZE], a2
	LDF	[C3 + 0 * SIZE], a3
	LDF	[C4 + 0 * SIZE], a4

	FMADD	(alpha, cc01, aa1, cc01)
	FMADD	(alpha, cc03, aa2, cc03)
	FMADD	(alpha, cc05, aa3, cc05)
	FMADD	(alpha, cc07, aa4, cc07)
#else
/* TRMM build: C is not read; the accumulators are only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
	FMUL	ALPHA, c03, c03
	FMUL	ALPHA, c05, c05
	FMUL	ALPHA, c07, c07
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c03, [C2 + 0 * SIZE]
	STF	c05, [C3 + 0 * SIZE]
	STF	c07, [C4 + 0 * SIZE]

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
/* TEMP1 = K - KK - (1 if LEFT, else 4).  AO advances by                 */
/* TEMP1 << BASE_SHIFT and BO by TEMP1 << (BASE_SHIFT + 2).              */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -4, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 0, TEMP2
	sll	TEMP1, BASE_SHIFT + 2, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif
	.align 4
1880
.LL49:
/* End of the four-column pass: TRMM without LEFT advances KK by 4, */
/* and B takes over the current BO.                                 */
#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 4, KK
#endif
	mov	BO, B
	.align 4
1887
.LL50:
/* Two-column pass: taken only when bit 1 of N is set (N & 2). */
	and	N, 2, J
	cmp	J, 0
/* Delay slot (not annulled): C1 = C is set on both paths. */
	ble,pn	%icc, .LL70
	mov	C,  C1

/* C2 = C1 + LDC; C itself moves on by 2 * LDC. */
	add	C,  LDC, C2
	add	C2, LDC, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

/* I = M >> 1 counts the two-row tiles; with none, go to the odd-row */
/* code at .LL60.  Delay slot: AO = A on both paths.                 */
	sra	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL60
	mov	A, AO
	.align 4
1906
.LL52:
/* Set-up for one 2-row x 2-column tile.  Either BO restarts at B, or  */
/* (remaining TRMM variants) AO and BO are both offset by              */
/* KK << (BASE_SHIFT + 1).                                             */
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT + 1, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

/* Preload four A values and eight B values for loop .LL53. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc01)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc02)

/* cc05..cc08 are cleared here as well, although the stores at .LL58 */
/* only write c01..c04.                                              */
	LDF	[BO +  4 * SIZE], b5
	FCLR	(cc03)
	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc04)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc05)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc06)

	prefetch [C1 + 2 * SIZE], 3
	FCLR	(cc07)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc08)

/* L = (k count) >> 2: number of passes of the 4-step loop .LL53. */
#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 2, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL55
	nop
	.align 4
1960
.LL53:
/* Main loop of the 2 x 2 tile: four k steps per pass, alternating the */
/* A pairs (aa1, aa2) and (aa3, aa4) against bb1..bb8.  AO and BO each */
/* advance by 8 * SIZE per pass.                                       */
	FMADD	(aa1, bb1, cc01, cc01)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[BO +  8 * SIZE], b1

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[AO +  4 * SIZE], a1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[AO +  5 * SIZE], a2

	FMADD	(aa3, bb3, cc01, cc01)
	LDF	[BO +  9 * SIZE], b2
	FMADD	(aa4, bb3, cc02, cc02)
	LDF	[BO + 10 * SIZE], b3

	FMADD	(aa3, bb4, cc03, cc03)
	LDF	[AO +  6 * SIZE], a3
	FMADD	(aa4, bb4, cc04, cc04)
	LDF	[AO +  7 * SIZE], a4

	FMADD	(aa1, bb5, cc01, cc01)
	LDF	[BO + 11 * SIZE], b4
	FMADD	(aa2, bb5, cc02, cc02)
	LDF	[BO + 12 * SIZE], b5

	FMADD	(aa1, bb6, cc03, cc03)
	LDF	[AO +  8 * SIZE], a1
	FMADD	(aa2, bb6, cc04, cc04)
	LDF	[AO +  9 * SIZE], a2

	FMADD	(aa3, bb7, cc01, cc01)
	LDF	[BO + 13 * SIZE], b6

	FMADD	(aa4, bb7, cc02, cc02)
	LDF	[BO + 14 * SIZE], b7

	FMADD	(aa3, bb8, cc03, cc03)
	LDF	[AO + 10 * SIZE], a3
	FMADD	(aa4, bb8, cc04, cc04)
	LDF	[AO + 11 * SIZE], a4

	add	AO,  8 * SIZE, AO
	add	L, -1, L
	add	BO,  8 * SIZE, BO
	cmp	L, 0

/* Delay slot: b8 is reloaded relative to the already advanced BO, */
/* i.e. from the old BO + 15.                                      */
	bg,pt	%icc, .LL53
	LDF	[BO +  7 * SIZE], b8
	.align 4
2011
.LL55:
/* Trip count for the remainder loop .LL57 of the 2 x 2 tile:     */
/* L = (K, or the TRMM-adjusted count built from K and KK) & 3.   */
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 2, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
/* No leftover steps: go straight to the store code at .LL58. */
	ble,a,pn %icc, .LL58
	nop
	.align 4
2029
.LL57:
/* Remainder loop of the 2 x 2 tile: one k step per pass.  aa1 and aa2 */
/* are each combined with bb1 and bb2; AO and BO advance by 2 * SIZE.  */
	FMADD	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[BO + 2 * SIZE], b1

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[AO + 2 * SIZE], a1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[AO + 3 * SIZE], a2

	add	AO, 2 * SIZE, AO
	cmp	L, 0
	add	BO, 2 * SIZE, BO
/* Delay slot: b2 is reloaded relative to the already advanced BO. */
	bg,pt	%icc, .LL57
	LDF	[BO + 1 * SIZE], b2
	.align 4
2047
.LL58:
/* Store the 2 x 2 tile: c01/c02 through C1 and c03/c04 through C2. */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C values and combine them with alpha */
/* and the accumulators via FMADD.                                       */
	LDF	[C1 + 0 * SIZE], a1
	LDF	[C1 + 1 * SIZE], a2
	LDF	[C2 + 0 * SIZE], a3
	LDF	[C2 + 1 * SIZE], a4

	FMADD	(alpha, cc01, aa1, cc01)
	FMADD	(alpha, cc02, aa2, cc02)
	FMADD	(alpha, cc03, aa3, cc03)
	FMADD	(alpha, cc04, aa4, cc04)
#else
/* TRMM build: C is not read; the accumulators are only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
	FMUL	ALPHA, c02, c02
	FMUL	ALPHA, c03, c03
	FMUL	ALPHA, c04, c04
#endif

/* The stores are interleaved with the loop bookkeeping.  The cmp below */
/* sets the condition codes for the bg at the end of this block; the    */
/* add/sub/sll instructions in between do not modify them.              */
	STF	c01, [C1 + 0 * SIZE]
	add	I, -1, I
	STF	c02, [C1 + 1 * SIZE]
	add	C1, 2 * SIZE, C1

	STF	c03, [C2 + 0 * SIZE]
	cmp	I, 0
	STF	c04, [C2 + 1 * SIZE]
	add	C2, 2 * SIZE, C2

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
/* TEMP1 = K - KK - 2 in both branches.  AO and BO each advance by */
/* TEMP1 << (BASE_SHIFT + 1).                                      */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -2, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 1, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 2, KK
#endif
#endif

/* Next two-row tile of this two-column pass, while I > 0. */
	bg,pt	%icc, .LL52
	nop
	.align 4
2100
.LL60:
/* Odd-row case of the two-column pass: only when M & 1 is set; */
/* otherwise continue at .LL69.                                 */
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL69
	nop

/* Either BO restarts at B, or (remaining TRMM variants) AO and BO are */
/* offset by KK << BASE_SHIFT and KK << (BASE_SHIFT + 1).              */
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, BASE_SHIFT + 0, TEMP1
	sll	KK, BASE_SHIFT + 1, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

/* Preload four A values and eight B values for loop .LL63. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5
	LDF	[BO +  5 * SIZE], b6
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc01)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc03)

/* L = (k count) >> 2: number of passes of the 4-step loop .LL63. */
#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL65
	nop
	.align 4
2149
.LL63:
/* Main loop of the 1 x 2 tile: four k steps per pass, using aa1..aa4 in */
/* turn against two B values each.  AO advances by 4 * SIZE and BO by    */
/* 8 * SIZE per pass.                                                    */
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO +  8 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  9 * SIZE], b2

/* This cmp feeds the bg at the bottom of the loop. */
	LDF	[AO +  4 * SIZE], a1
	cmp	L, 0

	FMADD	(aa2, bb3, cc01, cc01)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa2, bb4, cc03, cc03)
	LDF	[BO + 11 * SIZE], b4

	LDF	[AO +  5 * SIZE], a2
	add	AO,  4 * SIZE, AO

	FMADD	(aa3, bb5, cc01, cc01)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa3, bb6, cc03, cc03)
	LDF	[BO + 13 * SIZE], b6

/* AO and BO offsets below are relative to the already advanced pointers. */
	LDF	[AO +  2 * SIZE], a3
	add	BO,  8 * SIZE, BO

	FMADD	(aa4, bb7, cc01, cc01)
	LDF	[BO +  6 * SIZE], b7
	FMADD	(aa4, bb8, cc03, cc03)
	LDF	[BO + 7 * SIZE], b8

/* Delay slot: the a4 reload runs on every pass, taken or not. */
	bg,pt	%icc, .LL63
	LDF	[AO +  3 * SIZE], a4
	.align 4
2186
.LL65:
/* Trip count for the remainder loop .LL67 of the 1 x 2 tile:     */
/* L = (K, or the TRMM-adjusted count built from K and KK) & 3.   */
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
/* No leftover steps: go straight to the store code at .LL68. */
	ble,a,pn %icc, .LL68
	nop
	.align 4
2204
.LL67:
/* Remainder loop of the 1 x 2 tile: one k step per pass.  aa1 is     */
/* combined with bb1 and bb2; AO advances by 1 * SIZE and BO by        */
/* 2 * SIZE.                                                           */
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO + 2 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 3 * SIZE], b2

	LDF	[AO + 1 * SIZE], a1
	add	L, -1, L
	add	AO, 1 * SIZE, AO
	cmp	L, 0

/* Delay slot: BO is advanced on every pass, taken or not. */
	bg,pt	%icc, .LL67
	add	BO, 2 * SIZE, BO
	.align 4
2219
.LL68:
/* Store the 1 x 2 tile: c01 through C1 and c03 through C2. */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C values and combine them with alpha */
/* and the accumulators via FMADD.                                       */
	LDF	[C1 + 0 * SIZE], a1
	LDF	[C2 + 0 * SIZE], a2

	FMADD	(alpha, cc01, aa1, cc01)
	FMADD	(alpha, cc03, aa2, cc03)
#else
/* TRMM build: C is not read; the accumulators are only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
	FMUL	ALPHA, c03, c03
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c03, [C2 + 0 * SIZE]

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
/* TEMP1 = K - KK - (1 if LEFT, else 2).  AO advances by                 */
/* TEMP1 << BASE_SHIFT and BO by TEMP1 << (BASE_SHIFT + 1).              */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -2, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 0, TEMP2
	sll	TEMP1, BASE_SHIFT + 1, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif
	.align 4
2256
.LL69:
/* End of the two-column pass: TRMM without LEFT advances KK by 2, */
/* and B takes over the current BO.                                */
#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 2, KK
#endif
	mov	BO, B
	.align 4
2263
.LL70:
/* Single-column pass: taken only when bit 0 of N is set (N & 1); */
/* otherwise the routine exits through .LL999.                    */
	and	N, 1, J
	cmp	J, 0
/* Delay slot (not annulled): C1 = C is set on both paths. */
	ble,pn	%icc, .LL999
	mov	C,  C1

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

/* I = M >> 1 counts the two-row tiles; with none, go to the odd-row */
/* code at .LL80.  Delay slot: AO = A on both paths.                 */
	sra	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL80
	mov	A, AO
	.align 4
2279
.LL72:
/* Set-up for one 2-row x 1-column tile.  Either BO restarts at B, or  */
/* (remaining TRMM variants) AO and BO are offset by                   */
/* KK << (BASE_SHIFT + 1) and KK << BASE_SHIFT.                        */
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT + 0, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

/* Preload four A values and four B values for loop .LL73. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc01)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc02)

	prefetch [C1 + 2 * SIZE], 3

/* L = (k count) >> 2: number of passes of the 4-step loop .LL73. */
#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 1, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL75
	nop
2320
.LL73:
/* Main loop of the 2 x 1 tile: four k steps per pass, alternating the */
/* A pairs (aa1, aa2) and (aa3, aa4) against bb1..bb4.  AO advances by */
/* 8 * SIZE and BO by 4 * SIZE per pass.                               */
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO +  4 * SIZE], a1
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[AO +  5 * SIZE], a2

/* This cmp feeds the bg at the bottom of the loop. */
	LDF	[BO +  4 * SIZE], b1
	cmp	L, 0

	FMADD	(aa3, bb2, cc01, cc01)
	LDF	[AO +  6 * SIZE], a3
	FMADD	(aa4, bb2, cc02, cc02)
	LDF	[AO +  7 * SIZE], a4

	LDF	[BO +  5 * SIZE], b2
	add	BO,  4 * SIZE, BO

	FMADD	(aa1, bb3, cc01, cc01)
	LDF	[AO +  8 * SIZE], a1
	FMADD	(aa2, bb3, cc02, cc02)
	LDF	[AO +  9 * SIZE], a2

/* BO and AO offsets below are relative to the already advanced pointers. */
	LDF	[BO +  2 * SIZE], b3
	add	AO,  8 * SIZE, AO

	FMADD	(aa3, bb4, cc01, cc01)
	LDF	[AO +  2 * SIZE], a3
	FMADD	(aa4, bb4, cc02, cc02)
	LDF	[AO +  3 * SIZE], a4

/* Delay slot: the b4 reload runs on every pass, taken or not. */
	bg,pt	%icc, .LL73
	LDF	[BO +  3 * SIZE], b4
	.align 4
2357
.LL75:
/* Trip count for the remainder loop .LL77 of the 2 x 1 tile:     */
/* L = (K, or the TRMM-adjusted count built from K and KK) & 3.   */
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 1, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
/* No leftover steps: go straight to the store code at .LL78. */
	ble,a,pn %icc, .LL78
	nop
	.align 4
2375
.LL77:
/* Remainder loop of the 2 x 1 tile: one k step per pass.  aa1 and aa2 */
/* are each combined with bb1; AO advances by 2 * SIZE and BO by       */
/* 1 * SIZE.                                                           */
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO + 2 * SIZE], a1
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[AO + 3 * SIZE], a2

	LDF	[BO + 1 * SIZE], b1
	add	L, -1, L
	add	AO, 2 * SIZE, AO
	cmp	L, 0
/* Delay slot: BO is advanced on every pass, taken or not. */
	bg,pt	%icc, .LL77
	add	BO, 1 * SIZE, BO
	.align 4
2389
.LL78:
/* Store the 2 x 1 tile: c01 and c02 through C1. */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C values and combine them with alpha */
/* and the accumulators via FMADD.                                       */
	LDF	[C1 + 0 * SIZE], a1
	LDF	[C1 + 1 * SIZE], a2

	FMADD	(alpha, cc01, aa1, cc01)
	FMADD	(alpha, cc02, aa2, cc02)
#else
/* TRMM build: C is not read; the accumulators are only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
	FMUL	ALPHA, c02, c02
#endif

/* The cmp below sets the condition codes for the bg at the end of this */
/* block; the add/sub/sll instructions in between do not modify them.   */
	STF	c01, [C1 + 0 * SIZE]
	add	I, -1, I
	STF	c02, [C1 + 1 * SIZE]
	cmp	I, 0

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
/* TEMP1 = K - KK - (2 if LEFT, else 1).  AO advances by                 */
/* TEMP1 << (BASE_SHIFT + 1) and BO by TEMP1 << BASE_SHIFT.              */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -1, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 0, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 2, KK
#endif
#endif

/* Delay slot: C1 moves on by two elements whether or not the branch */
/* back to .LL72 is taken.                                           */
	bg,pt	%icc, .LL72
	add	C1, 2 * SIZE, C1
	.align 4
2431
.LL80:
/* Odd-row case of the single-column pass: only when M & 1 is set; */
/* otherwise the routine exits through .LL999.                     */
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL999
	nop

/* Either BO restarts at B, or (remaining TRMM variants) AO and BO are */
/* both offset by KK << BASE_SHIFT.                                    */
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, BASE_SHIFT + 0, TEMP1
	sll	KK, BASE_SHIFT + 0, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

/* Preload four A values and four B values for loop .LL83. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[BO +  0 * SIZE], b1
	LDF	[AO +  1 * SIZE], a2
	LDF	[BO +  1 * SIZE], b2
	LDF	[AO +  2 * SIZE], a3
	LDF	[BO +  2 * SIZE], b3
	LDF	[AO +  3 * SIZE], a4
	LDF	[BO +  3 * SIZE], b4

/* L = (k count) >> 2: number of passes of the 4-step loop .LL83. */
#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 1, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
/* Delay slot (not annulled): the accumulator cc01 is cleared on both */
/* the taken and the fall-through path.                               */
	ble,pn	%icc, .LL85
	FCLR	(cc01)
	.align 4
2473
.LL83:
/* Main loop of the 1 x 1 tile: four k steps per pass, all accumulated */
/* into cc01.  AO and BO each advance by 4 * SIZE per pass.            */
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO +  4 * SIZE], a1
	LDF	[BO +  4 * SIZE], b1

	FMADD	(aa2, bb2, cc01, cc01)
	LDF	[AO +  5 * SIZE], a2
	LDF	[BO +  5 * SIZE], b2

	FMADD	(aa3, bb3, cc01, cc01)
	LDF	[AO +  6 * SIZE], a3
	LDF	[BO +  6 * SIZE], b3

	FMADD	(aa4, bb4, cc01, cc01)
	LDF	[AO +  7 * SIZE], a4
	LDF	[BO +  7 * SIZE], b4

	add	AO,  4 * SIZE, AO
	cmp	L, 0

/* Delay slot: BO is advanced on every pass, taken or not. */
	bg,pt	%icc, .LL83
	add	BO,  4 * SIZE, BO
	.align 4
2500
.LL85:
/* Trip count for the remainder loop .LL87 of the 1 x 1 tile:     */
/* L = (K, or the TRMM-adjusted count built from K and KK) & 3.   */
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 1, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
/* No leftover steps: go straight to the store code at .LL88. */
	ble,a,pn %icc, .LL88
	nop
	.align 4
2518
.LL87:
/* Remainder loop of the 1 x 1 tile: one k step per pass, accumulated */
/* into cc01.  AO and BO each advance by 1 * SIZE.                    */
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO + 1 * SIZE], a1
	LDF	[BO + 1 * SIZE], b1

	add	AO, 1 * SIZE, AO
	add	L, -1, L
	cmp	L, 0
/* Delay slot: BO is advanced on every pass, taken or not. */
	bg,pt	%icc, .LL87
	add	BO, 1 * SIZE, BO
	.align 4
2530
.LL88:
/* Store the single element c01 through C1. */
#ifndef TRMMKERNEL
/* Non-TRMM build: load the current C value and combine it with alpha */
/* and the accumulator via FMADD.                                     */
	LDF	[C1 + 0 * SIZE], a1

	FMADD	(alpha, cc01, aa1, cc01)
#else
/* TRMM build: C is not read; the accumulator is only scaled by ALPHA. */
	FMUL	ALPHA, c01, c01
#endif

	STF	c01, [C1 + 0 * SIZE]
	.align 4
2542
.LL999:
/* Common exit.  The TRMM build reloads %g1..%g4 from the stack frame;  */
/* presumably they were saved there by the prologue, which is outside   */
/* this chunk -- confirm.  32-bit builds use ld with 4-byte slots,      */
/* __64BIT__ builds use ldx with 8-byte slots.                          */
#ifdef TRMMKERNEL
#ifndef __64BIT__
	ld	[%sp + STACK_START +  8], %g1
	ld	[%sp + STACK_START + 12], %g2
	ld	[%sp + STACK_START + 16], %g3
	ld	[%sp + STACK_START + 20], %g4
#else
	ldx	[%sp + STACK_START + 32], %g1
	ldx	[%sp + STACK_START + 40], %g2
	ldx	[%sp + STACK_START + 48], %g3
	ldx	[%sp + STACK_START + 56], %g4
#endif
#endif

/* clr %o0 sits in the delay slot of return.  On SPARC V9 the register  */
/* window is restored before that slot executes, so the zero lands in   */
/* the %o0 seen by the caller.                                          */
	return	%i7 + 8
	clr	%o0

	EPILOGUE
2562