1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Integer register roles used throughout the kernel.
 */

/* Dimension arguments, used directly from the incoming registers. */
42#define M	%i0
43#define N	%i1
44#define K	%i2
45
/*
 * Base pointers of the two input panels.  The register assignment is
 * swapped for 32-bit DOUBLE builds; in that configuration B is not an
 * incoming register value but is loaded from the stack at entry.
 */
46#if defined(DOUBLE) && !defined(__64BIT__)
47#define A	%i5
48#define B	%i4
49#else
50#define A	%i4
51#define B	%i5
52#endif
53
/* Output matrix pointer and its leading dimension; both are loaded from
   the stack at entry, and LDC is then scaled by BASE_SHIFT. */
54#define C	%o4
55#define LDC	%o5
56
/* Running pointers into A and B, and loop counters:
   I = row-tile test/count, J = N >> 2, L = inner (k) loop count. */
57#define AO	%l0
58#define BO	%l1
59#define I	%l2
60#define J	%l3
61#define L	%l4
62
/* Pointers to four consecutive columns of C (C, C+LDC, C+2*LDC, C+3*LDC). */
63#define C1	%o0
64#define C2	%o1
65#define C3	%o2
66#define C4	%o3
67
/* OFFSET is loaded from the stack at entry; KK is the running position
   derived from it.  TEMP1/TEMP2 are scratch.  AORIG keeps the A base for
   the LN and RT variants, which recompute AO from it for every tile. */
68#define OFFSET	%l5
69#define	KK	%l6
70#define TEMP1	%l7
71#define TEMP2	%i3
72#define AORIG	%g1
73
/*
 * Floating-point register map.  The DOUBLE branch uses even-numbered
 * registers %f0..%f62; the other branch packs the same roles into
 * %f0..%f31.
 *
 *   c01..c16  accumulators for up to a 4x4 tile of results
 *   t1..t4    product temporaries (added into the accumulators one step late)
 *   a1..a5    operands loaded through AO
 *   b1..b5    operands loaded through BO
 *   FZERO     source of zero for FMOV-initialising accumulators
 *   ALPHA     defined here but not referenced in the part of the file
 *             reviewed for this comment
 */
74#ifdef DOUBLE
75#define c01	%f0
76#define c02	%f2
77#define c03	%f4
78#define c04	%f6
79#define c05	%f8
80#define c06	%f10
81#define c07	%f12
82#define c08	%f14
83#define c09	%f16
84#define c10	%f18
85#define c11	%f20
86#define c12	%f22
87#define c13	%f24
88#define c14	%f26
89#define c15	%f28
90#define c16	%f30
91
92#define t1	%f32
93#define	t2 	%f34
94#define t3	%f36
95#define	t4 	%f38
96
97#define a1	%f40
98#define a2	%f42
99#define a3	%f44
100#define a4	%f46
101#define a5	%f58
102
103#define b1	%f48
104#define b2	%f50
105#define b3	%f52
106#define b4	%f54
107#define b5	%f56
108
109#define FZERO	%f60
110#define ALPHA	%f62
111#else
112#define c01	%f0
113#define c02	%f1
114#define c03	%f2
115#define c04	%f3
116#define c05	%f4
117#define c06	%f5
118#define c07	%f6
119#define c08	%f7
120#define c09	%f8
121#define c10	%f9
122#define c11	%f10
123#define c12	%f11
124#define c13	%f12
125#define c14	%f13
126#define c15	%f14
127#define c16	%f15
128
129#define t1	%f16
130#define	t2 	%f17
131#define t3	%f18
132#define	t4 	%f19
133
134#define a1	%f20
135#define a2	%f21
136#define a3	%f22
137#define a4	%f23
138#define a5	%f31
139
140#define b1	%f24
141#define b2	%f25
142#define b3	%f26
143#define b4	%f27
144#define b5	%f28
145
/* NOTE(review): the entry code issues FCLR(29); 29 matches FZERO only in
   this branch.  Confirm in common.h how FCLR maps 29 for DOUBLE builds. */
146#define FZERO	%f29
147#define ALPHA	%f30
148#endif
149
/* Prefetch distance ahead of AO / BO, in elements (multiplied by SIZE
   where the prefetch instructions are issued). */
150#define APREFETCHSIZE 40
151#define BPREFETCHSIZE 40
152
/* Function code passed as the second operand of the A/B prefetches
   (0 is "prefetch for several reads" in SPARC V9). */
153#define APREFETCH_CATEGORY 0
154#define BPREFETCH_CATEGORY 0
155
/*
 * Kernel entry: fetch the arguments that did not arrive in registers,
 * convert LDC to a byte stride, and position A/B/C/KK for the variant
 * selected by LN / LT / RN / RT.
 * PROLOGUE, SAVESP, FCLR, STACK_START and BASE_SHIFT come from common.h
 * and are not visible here.
 */
156	PROLOGUE
157	SAVESP
158	nop
159
/* Stack-passed arguments.  32-bit DOUBLE builds also fetch B from the
   stack; the 64-bit build uses 8-byte slots and ldx. */
160#ifndef __64BIT__
161#ifdef DOUBLE
162	ld	[%sp + STACK_START + 28], B
163	ld	[%sp + STACK_START + 32], C
164	ld	[%sp + STACK_START + 36], LDC
165	ld	[%sp + STACK_START + 40], OFFSET
166#else
167	ld	[%sp + STACK_START + 28], C
168	ld	[%sp + STACK_START + 32], LDC
169	ld	[%sp + STACK_START + 36], OFFSET
170#endif
171#else
172	ldx	[%sp+  STACK_START + 56], C
173	ldx	[%sp+  STACK_START + 64], LDC
174	ldx	[%sp+  STACK_START + 72], OFFSET
175#endif
176
/* Presumably zeroes the FZERO register - confirm against FCLR in common.h. */
177	FCLR(29)
178
/* LDC: element count -> byte stride (assumes BASE_SHIFT = log2(SIZE) - TODO confirm). */
179	sll	LDC, BASE_SHIFT, LDC
180
/* LN walks rows from the end: A += M*K elements, C += M elements. */
181#ifdef LN
182	smul	M, K, TEMP1
183	sll	TEMP1, BASE_SHIFT, TEMP1
184	add	A, TEMP1, A
185
186	sll	M, BASE_SHIFT, TEMP1
187	add	C, TEMP1, C
188#endif
189
/* RN: KK starts at -OFFSET. */
190#ifdef RN
191	neg	OFFSET, KK
192#endif
193
/* RT walks columns from the end: B += N*K elements, C += N*LDC bytes,
   KK = N - OFFSET. */
194#ifdef RT
195	smul	N, K, TEMP1
196	sll	TEMP1, BASE_SHIFT, TEMP1
197	add	B, TEMP1, B
198
199	smul	N, LDC, TEMP1
200	add	C, TEMP1, C
201
202	sub	N, OFFSET, KK
203#endif
204
/* J = number of 4-column panels; none -> .LL100 (outside this excerpt). */
205	sra	N, 2, J
206	cmp	J, 0
207	ble,pn	%icc, .LL100
208	nop
209
/*
 * .LL11: set up one 4-column panel - column pointers C1..C4, the starting
 * KK for the L* variants, and the A base - then test for a single
 * leftover row (M & 1).
 */
210.LL11:
/* RT moves backwards: step B back by 4*K elements and C back by 4 columns
   before using them. */
211#ifdef RT
212	sll	K, 2 + BASE_SHIFT, TEMP1
213	sub	B, TEMP1, B
214
215	sll	LDC, 2, TEMP1
216	sub	C, TEMP1, C
217#endif
218
/* C1..C4 = four consecutive columns starting at C. */
219	mov	C,  C1
220	add	C,  LDC, C2
221	add	C2, LDC, C3
222	add	C3, LDC, C4
223
224#ifdef LN
225	add	M, OFFSET, KK
226#endif
227
228#ifdef LT
229	mov	OFFSET, KK
230#endif
231
/* LN/RT recompute AO from AORIG for every tile; the others stream AO. */
232#if defined(LN) || defined(RT)
233	mov	A, AORIG
234#else
235	mov	A, AO
236#endif
237
/* Non-RT variants advance C to the next panel now (C4 + LDC). */
238#ifndef RT
239	add	C4, LDC, C
240#endif
241
/* No odd row -> skip the 1-row tile. */
242	and	M, 1, I
243	cmp	I, 0
244	ble,pn	%icc, .LL50
245	nop
246
/*
 * 1-row x 4-column tile: choose the k-range and starting AO/BO, preload
 * the first operands, and clear the accumulators.
 */
/* LT/RN: accumulate over the first KK steps, B read from its base. */
247#if defined(LT) || defined(RN)
248	sra	KK, 2, L
249
250	mov	B, BO
251	cmp	L,  0
252#else
253
/* LN: step AORIG back by one row of K elements. */
254#ifdef LN
255	sll	K,  0 + BASE_SHIFT, TEMP1
256	sub	AORIG, TEMP1, AORIG
257#endif
258
/* LN/RT: start KK steps in (AO advances 1 element per step, BO 4) and
   accumulate over the remaining K - KK steps. */
259	sll	KK, 0 + BASE_SHIFT, TEMP1
260	sll	KK, 2 + BASE_SHIFT, TEMP2
261
262	add	AORIG, TEMP1, AO
263	add	B,     TEMP2, BO
264
265	sub	K, KK, TEMP1
266	sra	TEMP1, 2, L
267	cmp	L,  0
268#endif
269
/* Preload a1..a4 / b1..b4 and zero c01..c04, t1..t4.  Nothing between the
   cmp above and the branch below writes the integer condition codes. */
270	LDF	[AO + 0 * SIZE], a1
271	FMOV	FZERO, c01
272	LDF	[BO + 0 * SIZE], b1
273	FMOV	FZERO, t1
274 	LDF	[AO + 1 * SIZE], a2
275	FMOV	FZERO, c02
276	LDF	[BO + 1 * SIZE], b2
277	FMOV	FZERO, t2
278	LDF	[AO + 2 * SIZE], a3
279	FMOV	FZERO, c03
280	LDF	[BO + 2 * SIZE], b3
281	FMOV	FZERO, t3
282	LDF	[AO + 3 * SIZE], a4
283	FMOV	FZERO, c04
284	LDF	[BO + 3 * SIZE], b4
285	FMOV	FZERO, t4
286
/* L <= 0: no full group of four k-steps, go to the remainder handling. */
287	ble,pn	%icc, .LL75
288	nop
289
/*
 * .LL72: inner loop of the 1x4 tile, unrolled over four k-steps.
 * Each step multiplies one A element (a1..a4 in turn) by four B elements
 * and reloads b1..b4 for the next step.  The products sit in t1..t4 and
 * are added into c01..c04 at the start of the following step, so the last
 * set of products is still pending when the loop exits.
 */
290.LL72:
/* k-step 0: uses a1 */
291	FADD	c01, t1, c01
292	add	L, -1, L
293	FMUL	a1, b1, t1
294	LDF	[BO + 4 * SIZE], b1
295
296	FADD	c02, t2, c02
/* Loop condition is evaluated here; it is consumed by the bg at the end. */
297	cmp	L, 0
298	FMUL	a1, b2, t2
299	LDF	[BO + 5 * SIZE], b2
300
301	FADD	c03, t3, c03
302	FMUL	a1, b3, t3
303	LDF	[BO + 6 * SIZE], b3
304
305	FADD	c04, t4, c04
306	FMUL	a1, b4, t4
307	LDF	[BO + 7 * SIZE], b4
308	LDF	[AO +  4 * SIZE], a1
309
/* k-step 1: uses a2.  AO is advanced here, so the later AO offsets
   (1, 2, 3) are relative to the new AO. */
310	FADD	c01, t1, c01
311	add	AO,  4 * SIZE, AO
312	FMUL	a2, b1, t1
313	LDF	[BO +  8 * SIZE], b1
314
315	FADD	c02, t2, c02
316	FMUL	a2, b2, t2
317	LDF	[BO +  9 * SIZE], b2
318
319	FADD	c03, t3, c03
320	FMUL	a2, b3, t3
321	LDF	[BO + 10 * SIZE], b3
322
323	FADD	c04, t4, c04
324	FMUL	a2, b4, t4
325	LDF	[BO + 11 * SIZE], b4
326	LDF	[AO +  1 * SIZE], a2
327
/* k-step 2: uses a3 */
328	FADD	c01, t1, c01
329	FMUL	a3, b1, t1
330	LDF	[BO + 12 * SIZE], b1
331
332	FADD	c02, t2, c02
333	FMUL	a3, b2, t2
334	LDF	[BO + 13 * SIZE], b2
335
336	FADD	c03, t3, c03
337	FMUL	a3, b3, t3
338	LDF	[BO + 14 * SIZE], b3
339
340	FADD	c04, t4, c04
341	FMUL	a3, b4, t4
342	LDF	[BO + 15 * SIZE], b4
343	LDF	[AO +  2 * SIZE], a3
344
/* k-step 3: uses a4; the B loads fetch the first row of the next group. */
345	FADD	c01, t1, c01
346	FMUL	a4, b1, t1
347	LDF	[BO + 16 * SIZE], b1
348
349	FADD	c02, t2, c02
350	FMUL	a4, b2, t2
351	LDF	[BO + 17 * SIZE], b2
352
353	FADD	c03, t3, c03
354	FMUL	a4, b3, t3
355	LDF	[BO + 18 * SIZE], b3
356
357	FADD	c04, t4, c04
358	FMUL	a4, b4, t4
359	LDF	[BO + 19 * SIZE], b4
360
/* Advance BO by 4 steps x 4 elements.  The LDF after the branch sits in
   its delay slot (no annul bit), so it runs whether or not the loop repeats. */
361	add	BO, 16 * SIZE, BO
362	bg,pt	%icc, .LL72
363	LDF	[AO +  3 * SIZE], a4
364
/*
 * .LL75: number of k-steps left over after the 4x-unrolled loop
 * (step count & 3).  The step count is KK for LT/RN; for LN/RT it is
 * TEMP1, which still holds K - KK from the tile setup.
 */
365.LL75:
366#if defined(LT) || defined(RN)
367	and	KK,  3, L
368#else
369	and	TEMP1, 3, L
370#endif
371	cmp	L,  0
/* Nothing left: go straight to the solve stage. */
372	ble,a,pn %icc, .LL79
373	nop
374
/*
 * .LL76: remainder loop of the 1x4 tile, one k-step per iteration.
 * Same lagged accumulate as the unrolled loop: the pending t1..t4 are
 * added first, then new products are formed with a1 and b1..b4.
 */
375.LL76:
376	FADD	c01, t1, c01
377	add	AO, 1 * SIZE, AO
378	FMUL	a1, b1, t1
379	LDF	[BO + 4 * SIZE], b1
380
381	FADD	c02, t2, c02
382	add	L, -1, L
383	FMUL	a1, b2, t2
384	LDF	[BO + 5 * SIZE], b2
385
386	FADD	c03, t3, c03
387	cmp	L, 0
388	FMUL	a1, b3, t3
389	LDF	[BO + 6 * SIZE], b3
390
391	FADD	c04, t4, c04
/* BO advances before b4 is reloaded, hence the +3 offset in the delay slot. */
392	add	BO, 4 * SIZE, BO
393	FMUL	a1, b4, t4
394	LDF	[AO + 0 * SIZE], a1
395
396	bg,pt	%icc, .LL76
397	LDF	[BO + 3 * SIZE], b4
398
399
/*
 * .LL79: finish the 1x4 tile.
 *   1. add the products still pending in t1..t4
 *   2. (LN/RT) reposition AO/BO onto the block used for the solve
 *   3. c := stored value - accumulated sum
 *   4. apply the triangular factor for the active variant
 *   5. write the result back to the packed buffer and to C
 *   6. advance pointers / KK for the next tile
 * The diagonal entries are applied with FMUL, not a divide.
 * NOTE(review): this implies the packed diagonal holds reciprocals -
 * confirm against the packing routine.
 */
400.LL79:
401	FADD	c01, t1, c01
402	FADD	c02, t2, c02
403	FADD	c03, t3, c03
404	FADD	c04, t4, c04
405
/* LN/RT: AO = AORIG + (KK-1 or KK-4) * 1 element,
          BO = B     + (KK-1 or KK-4) * 4 elements. */
406#if defined(LN) || defined(RT)
407#ifdef LN
408	sub	KK, 1, TEMP1
409#else
410	sub	KK, 4, TEMP1
411#endif
412	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
413	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
414	add	AORIG, TEMP2, AO
415	add	B,     TEMP1, BO
416#endif
417
/* c0x = stored value - accumulated sum.  The stored values are read
   through BO for LN/LT and through AO for RN/RT. */
418#if defined(LN) || defined(LT)
419	LDF	[BO +  0 * SIZE], a1
420	LDF	[BO +  1 * SIZE], a2
421	LDF	[BO +  2 * SIZE], a3
422	LDF	[BO +  3 * SIZE], a4
423
424	FSUB	a1, c01, c01
425	FSUB	a2, c02, c02
426	FSUB	a3, c03, c03
427	FSUB	a4, c04, c04
428#else
429	LDF	[AO +  0 * SIZE], a1
430	LDF	[AO +  1 * SIZE], a2
431	LDF	[AO +  2 * SIZE], a3
432	LDF	[AO +  3 * SIZE], a4
433
434	FSUB	a1, c01, c01
435	FSUB	a2, c02, c02
436	FSUB	a3, c03, c03
437	FSUB	a4, c04, c04
438#endif
439
/* LN / LT with a single row: the factor is the one entry at AO[0]. */
440#ifdef LN
441	LDF	[AO +  0 * SIZE], a1
442
443	FMUL	a1, c01, c01
444	FMUL	a1, c02, c02
445	FMUL	a1, c03, c03
446	FMUL	a1, c04, c04
447#endif
448
449#ifdef LT
450	LDF	[AO +  0 * SIZE], a1
451
452	FMUL	a1, c01, c01
453	FMUL	a1, c02, c02
454	FMUL	a1, c03, c03
455	FMUL	a1, c04, c04
456#endif
457
/* RN: substitution from column 1 to column 4 over the 4x4 block at BO.
   Entries 0, 5, 10, 15 scale a column; the entries to their right
   eliminate it from the later columns. */
458#ifdef RN
459	LDF	[BO +  0 * SIZE], a1
460	LDF	[BO +  1 * SIZE], a2
461	LDF	[BO +  2 * SIZE], a3
462	LDF	[BO +  3 * SIZE], a4
463
464	FMUL	a1, c01, c01
465	FMUL	a2, c01, t1
466	FSUB	c02, t1, c02
467	FMUL	a3, c01, t1
468	FSUB	c03, t1, c03
469	FMUL	a4, c01, t1
470	FSUB	c04, t1, c04
471
472	LDF	[BO +  5 * SIZE], a1
473	LDF	[BO +  6 * SIZE], a2
474	LDF	[BO +  7 * SIZE], a3
475
476	FMUL	a1, c02, c02
477	FMUL	a2, c02, t1
478	FSUB	c03, t1, c03
479	FMUL	a3, c02, t1
480	FSUB	c04, t1, c04
481
482	LDF	[BO + 10 * SIZE], a1
483	LDF	[BO + 11 * SIZE], a2
484
485	FMUL	a1, c03, c03
486	FMUL	a2, c03, t1
487	FSUB	c04, t1, c04
488
489	LDF	[BO + 15 * SIZE], a1
490
491	FMUL	a1, c04, c04
492#endif
493
/* RT: the same block walked in the opposite order, column 4 down to
   column 1, using entries 15..12, 10..8, 5..4 and 0. */
494#ifdef RT
495	LDF	[BO + 15 * SIZE], a1
496	LDF	[BO + 14 * SIZE], a2
497	LDF	[BO + 13 * SIZE], a3
498	LDF	[BO + 12 * SIZE], a4
499
500	FMUL	a1, c04, c04
501	FMUL	a2, c04, t1
502	FSUB	c03, t1, c03
503	FMUL	a3, c04, t1
504	FSUB	c02, t1, c02
505	FMUL	a4, c04, t1
506	FSUB	c01, t1, c01
507
508	LDF	[BO + 10 * SIZE], a1
509	LDF	[BO +  9 * SIZE], a2
510	LDF	[BO +  8 * SIZE], a3
511
512	FMUL	a1, c03, c03
513	FMUL	a2, c03, t1
514	FSUB	c02, t1, c02
515	FMUL	a3, c03, t1
516	FSUB	c01, t1, c01
517
518	LDF	[BO +  5 * SIZE], a1
519	LDF	[BO +  4 * SIZE], a2
520
521	FMUL	a1, c02, c02
522	FMUL	a2, c02, t1
523	FSUB	c01, t1, c01
524
525	LDF	[BO +  0 * SIZE], a1
526
527	FMUL	a1, c01, c01
528#endif
529
/* LN walks C downward: step the column pointers back one element
   before storing. */
530#ifdef LN
531	add	C1, -1 * SIZE, C1
532	add	C2, -1 * SIZE, C2
533	add	C3, -1 * SIZE, C3
534	add	C4, -1 * SIZE, C4
535#endif
536
/* Write the solved values back where the stored values were read from. */
537#if defined(LN) || defined(LT)
538	STF	c01, [BO +  0 * SIZE]
539	STF	c02, [BO +  1 * SIZE]
540	STF	c03, [BO +  2 * SIZE]
541	STF	c04, [BO +  3 * SIZE]
542#else
543	STF	c01, [AO +  0 * SIZE]
544	STF	c02, [AO +  1 * SIZE]
545	STF	c03, [AO +  2 * SIZE]
546	STF	c04, [AO +  3 * SIZE]
547#endif
548
/* ... and into C, one element in each of the four columns. */
549	STF	c01, [C1 + 0 * SIZE]
550	STF	c02, [C2 + 0 * SIZE]
551	STF	c03, [C3 + 0 * SIZE]
552	STF	c04, [C4 + 0 * SIZE]
553
/* Zero t1..t4 again (the RN/RT paths above used t1 as scratch). */
554	FMOV	FZERO, t1
555	FMOV	FZERO, t2
556	FMOV	FZERO, t3
557	FMOV	FZERO, t4
558
/* All variants except LN move the column pointers forward past this row. */
559#ifndef LN
560	add	C1, 1 * SIZE, C1
561	add	C2, 1 * SIZE, C2
562	add	C3, 1 * SIZE, C3
563	add	C4, 1 * SIZE, C4
564#endif
565
/* RT: AORIG moves forward by one row of K elements. */
566#ifdef RT
567	sll	K, 0 + BASE_SHIFT, TEMP1
568	add	AORIG, TEMP1, AORIG
569#endif
570
/* LT/RN: skip the K - KK steps that were not accumulated
   (1 element per step in A, 4 per step in B). */
571#if defined(LT) || defined(RN)
572	sub	K, KK, TEMP1
573	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
574	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
575	add	AO, TEMP2, AO
576	add	BO, TEMP1, BO
577#endif
578
/* KK moves by the tile height: up for LT, down for LN. */
579#ifdef LT
580	add	KK, 1, KK
581#endif
582
583#ifdef LN
584	sub	KK, 1, KK
585#endif
586
587.LL50:
588	and	M, 2, I
589	cmp	I, 0
590	ble,pn	%icc, .LL70
591	nop
592
593#if defined(LT) || defined(RN)
594	sra	KK, 2, L
595
596	mov	B, BO
597	cmp	L,  0
598#else
599
600#ifdef LN
601	sll	K,  1 + BASE_SHIFT, TEMP1
602	sub	AORIG, TEMP1, AORIG
603#endif
604
605	sll	KK, 1 + BASE_SHIFT, TEMP1
606	sll	KK, 2 + BASE_SHIFT, TEMP2
607
608	add	AORIG, TEMP1, AO
609	add	B,     TEMP2, BO
610
611	sub	K, KK, TEMP1
612	sra	TEMP1, 2, L
613	cmp	L,  0
614#endif
615
616	FMOV	FZERO, c02
617	FMOV	FZERO, t1
618	FMOV	FZERO, c04
619
620	LDF	[AO + 0 * SIZE], a1
621	FMOV	FZERO, t2
622	LDF	[BO + 0 * SIZE], b1
623	FMOV	FZERO, c06
624	LDF	[AO + 1 * SIZE], a2
625	FMOV	FZERO, t3
626	LDF	[BO + 1 * SIZE], b2
627	FMOV	FZERO, c08
628	LDF	[AO + 2 * SIZE], a3
629	FMOV	FZERO, t4
630	LDF	[BO + 2 * SIZE], b3
631	FMOV	FZERO, c01
632	LDF	[AO + 3 * SIZE], a4
633	FMOV	FZERO, c03
634	LDF	[BO + 3 * SIZE], b4
635	FMOV	FZERO, c05
636
637	ble,pn	%icc, .LL55
638	FMOV	FZERO, c07
639
640.LL52:
641	FADD	c02, t1, c02
642	add	AO,  8 * SIZE, AO
643	prefetch [AO + APREFETCHSIZE * SIZE], 0
644
645	FMUL	a1, b1, t1
646	add	BO, 16 * SIZE, BO
647
648	FADD	c04, t2, c04
649	add	L, -1, L
650	FMUL	a1, b2, t2
651
652	FADD	c06, t3, c06
653	cmp	L, 0
654	FMUL	a1, b3, t3
655
656	FADD	c08, t4, c08
657	FMUL	a1, b4, t4
658	LDF	[AO -  4 * SIZE], a1
659
660	FADD	c01, t1, c01
661	FMUL	a2, b1, t1
662	LDF	[BO - 12 * SIZE], b1
663	FADD	c03, t2, c03
664	FMUL	a2, b2, t2
665	LDF	[BO - 11 * SIZE], b2
666
667	FADD	c05, t3, c05
668	FMUL	a2, b3, t3
669	LDF	[BO - 10 * SIZE], b3
670	FADD	c07, t4, c07
671	FMUL	a2, b4, t4
672	LDF	[BO -  9 * SIZE], b4
673
674	FADD	c02, t1, c02
675	FMUL	a3, b1, t1
676	LDF	[AO -  3 * SIZE], a2
677	FADD	c04, t2, c04
678	FMUL	a3, b2, t2
679
680	FADD	c06, t3, c06
681	FMUL	a3, b3, t3
682	FADD	c08, t4, c08
683	FMUL	a3, b4, t4
684	LDF	[AO -  2 * SIZE], a3
685
686	FADD	c01, t1, c01
687	FMUL	a4, b1, t1
688	LDF	[BO -  8 * SIZE], b1
689	FADD	c03, t2, c03
690	FMUL	a4, b2, t2
691	LDF	[BO -  7 * SIZE], b2
692
693	FADD	c05, t3, c05
694	FMUL	a4, b3, t3
695	LDF	[BO -  6 * SIZE], b3
696	FADD	c07, t4, c07
697	FMUL	a4, b4, t4
698	LDF	[BO -  5 * SIZE], b4
699
700	FADD	c02, t1, c02
701	FMUL	a1, b1, t1
702	LDF	[AO -  1 * SIZE], a4
703	FADD	c04, t2, c04
704	FMUL	a1, b2, t2
705
706	FADD	c06, t3, c06
707	FMUL	a1, b3, t3
708	FADD	c08, t4, c08
709	FMUL	a1, b4, t4
710	LDF	[AO +  0 * SIZE], a1
711
712	FADD	c01, t1, c01
713	FMUL	a2, b1, t1
714	LDF	[BO -  4 * SIZE], b1
715
716	FADD	c03, t2, c03
717	FMUL	a2, b2, t2
718	LDF	[BO -  3 * SIZE], b2
719
720	FADD	c05, t3, c05
721	FMUL	a2, b3, t3
722	LDF	[BO -  2 * SIZE], b3
723	FADD	c07, t4, c07
724	FMUL	a2, b4, t4
725	LDF	[BO -  1 * SIZE], b4
726
727	FADD	c02, t1, c02
728	FMUL	a3, b1, t1
729	LDF	[AO +  1 * SIZE], a2
730	FADD	c04, t2, c04
731	FMUL	a3, b2, t2
732
733	FADD	c06, t3, c06
734	FMUL	a3, b3, t3
735	FADD	c08, t4, c08
736	FMUL	a3, b4, t4
737	LDF	[AO +  2 * SIZE], a3
738
739	FADD	c01, t1, c01
740	FMUL	a4, b1, t1
741	LDF	[BO +  0 * SIZE], b1
742	FADD	c03, t2, c03
743	FMUL	a4, b2, t2
744	LDF	[BO +  1 * SIZE], b2
745
746	FADD	c05, t3, c05
747	FMUL	a4, b3, t3
748	LDF	[BO +  2 * SIZE], b3
749	FADD	c07, t4, c07
750	FMUL	a4, b4, t4
751	LDF	[BO +  3 * SIZE], b4
752
753	bg,pt	%icc, .LL52
754	LDF	[AO +  3 * SIZE], a4
755
756.LL55:
757#if defined(LT) || defined(RN)
758	and	KK,  3, L
759#else
760	and	TEMP1, 3, L
761#endif
762	cmp	L,  0
763	ble,a,pn %icc, .LL59
764	nop
765
766.LL56:
767	FADD	c02, t1, c02
768	add	AO, 2 * SIZE, AO
769	FMUL	a1, b1, t1
770	add	L, -1, L
771
772	add	BO, 4 * SIZE, BO
773	FADD	c04, t2, c04
774	cmp	L, 0
775	FMUL	a1, b2, t2
776
777	FADD	c06, t3, c06
778	FMUL	a1, b3, t3
779	FADD	c08, t4, c08
780	FMUL	a1, b4, t4
781	LDF	[AO + 0 * SIZE], a1
782
783	FADD	c01, t1, c01
784	FMUL	a2, b1, t1
785	LDF	[BO + 0 * SIZE], b1
786	FADD	c03, t2, c03
787	FMUL	a2, b2, t2
788	LDF	[BO + 1 * SIZE], b2
789
790	FADD	c05, t3, c05
791	FMUL	a2, b3, t3
792	LDF	[BO + 2 * SIZE], b3
793	FADD	c07, t4, c07
794	FMUL	a2, b4, t4
795	LDF	[BO + 3 * SIZE], b4
796
797	bg,pt	%icc, .LL56
798	LDF	[AO + 1 * SIZE], a2
799
800.LL59:
801#if defined(LN) || defined(RT)
802#ifdef LN
803	sub	KK, 2, TEMP1
804#else
805	sub	KK, 4, TEMP1
806#endif
807	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
808	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
809	add	AORIG, TEMP2, AO
810	add	B,     TEMP1, BO
811#endif
812
813	FADD	c02, t1, c02
814	FADD	c04, t2, c04
815	FADD	c06, t3, c06
816	FADD	c08, t4, c08
817
818#if defined(LN) || defined(LT)
819	LDF	[BO +  0 * SIZE], a1
820	LDF	[BO +  1 * SIZE], a2
821	LDF	[BO +  2 * SIZE], a3
822	LDF	[BO +  3 * SIZE], a4
823
824	LDF	[BO +  4 * SIZE], b1
825	LDF	[BO +  5 * SIZE], b2
826	LDF	[BO +  6 * SIZE], b3
827	LDF	[BO +  7 * SIZE], b4
828
829	FSUB	a1, c01, c01
830	FSUB	a2, c03, c03
831	FSUB	a3, c05, c05
832	FSUB	a4, c07, c07
833
834	FSUB	b1, c02, c02
835	FSUB	b2, c04, c04
836	FSUB	b3, c06, c06
837	FSUB	b4, c08, c08
838#else
839	LDF	[AO +  0 * SIZE], a1
840	LDF	[AO +  1 * SIZE], a2
841	LDF	[AO +  2 * SIZE], a3
842	LDF	[AO +  3 * SIZE], a4
843
844	LDF	[AO +  4 * SIZE], b1
845	LDF	[AO +  5 * SIZE], b2
846	LDF	[AO +  6 * SIZE], b3
847	LDF	[AO +  7 * SIZE], b4
848
849	FSUB	a1, c01, c01
850	FSUB	a2, c02, c02
851	FSUB	a3, c03, c03
852	FSUB	a4, c04, c04
853
854	FSUB	b1, c05, c05
855	FSUB	b2, c06, c06
856	FSUB	b3, c07, c07
857	FSUB	b4, c08, c08
858#endif
859
860#ifdef LN
861	LDF	[AO +  3 * SIZE], a1
862	LDF	[AO +  2 * SIZE], a2
863	LDF	[AO +  0 * SIZE], a3
864
865	FMUL	a1, c02, c02
866	FMUL	a1, c04, c04
867	FMUL	a1, c06, c06
868	FMUL	a1, c08, c08
869
870	FMUL	a2, c02, t1
871	FMUL	a2, c04, t2
872	FMUL	a2, c06, t3
873	FMUL	a2, c08, t4
874
875	FSUB	c01, t1, c01
876	FSUB	c03, t2, c03
877	FSUB	c05, t3, c05
878	FSUB	c07, t4, c07
879
880	FMUL	a3, c01, c01
881	FMUL	a3, c03, c03
882	FMUL	a3, c05, c05
883	FMUL	a3, c07, c07
884#endif
885
886#ifdef LT
887	LDF	[AO +  0 * SIZE], a1
888	LDF	[AO +  1 * SIZE], a2
889	LDF	[AO +  3 * SIZE], a3
890
891	FMUL	a1, c01, c01
892	FMUL	a1, c03, c03
893	FMUL	a1, c05, c05
894	FMUL	a1, c07, c07
895
896	FMUL	a2, c01, t1
897	FMUL	a2, c03, t2
898	FMUL	a2, c05, t3
899	FMUL	a2, c07, t4
900
901	FSUB	c02, t1, c02
902	FSUB	c04, t2, c04
903	FSUB	c06, t3, c06
904	FSUB	c08, t4, c08
905
906	FMUL	a3, c02, c02
907	FMUL	a3, c04, c04
908	FMUL	a3, c06, c06
909	FMUL	a3, c08, c08
910#endif
911
912#ifdef RN
913	LDF	[BO +  0 * SIZE], a1
914	LDF	[BO +  1 * SIZE], a2
915	LDF	[BO +  2 * SIZE], a3
916	LDF	[BO +  3 * SIZE], a4
917
918	FMUL	a1, c01, c01
919	FMUL	a1, c02, c02
920
921	FMUL	a2, c01, t1
922	FMUL	a2, c02, t2
923
924	FSUB	c03, t1, c03
925	FSUB	c04, t2, c04
926
927	FMUL	a3, c01, t1
928	FMUL	a3, c02, t2
929
930	FSUB	c05, t1, c05
931	FSUB	c06, t2, c06
932
933	FMUL	a4, c01, t1
934	FMUL	a4, c02, t2
935
936	FSUB	c07, t1, c07
937	FSUB	c08, t2, c08
938
939	LDF	[BO +  5 * SIZE], a1
940	LDF	[BO +  6 * SIZE], a2
941	LDF	[BO +  7 * SIZE], a3
942
943	FMUL	a1, c03, c03
944	FMUL	a1, c04, c04
945
946	FMUL	a2, c03, t1
947	FMUL	a2, c04, t2
948
949	FSUB	c05, t1, c05
950	FSUB	c06, t2, c06
951
952	FMUL	a3, c03, t1
953	FMUL	a3, c04, t2
954
955	FSUB	c07, t1, c07
956	FSUB	c08, t2, c08
957
958	LDF	[BO + 10 * SIZE], a1
959	LDF	[BO + 11 * SIZE], a2
960
961	FMUL	a1, c05, c05
962	FMUL	a1, c06, c06
963
964	FMUL	a2, c05, t1
965	FMUL	a2, c06, t2
966
967	FSUB	c07, t1, c07
968	FSUB	c08, t2, c08
969
970	LDF	[BO + 15 * SIZE], a1
971
972	FMUL	a1, c07, c07
973	FMUL	a1, c08, c08
974#endif
975
976#ifdef RT
977	LDF	[BO + 15 * SIZE], a1
978	LDF	[BO + 14 * SIZE], a2
979	LDF	[BO + 13 * SIZE], a3
980	LDF	[BO + 12 * SIZE], a4
981
982	FMUL	a1, c07, c07
983	FMUL	a1, c08, c08
984
985	FMUL	a2, c07, t1
986	FMUL	a2, c08, t2
987
988	FSUB	c05, t1, c05
989	FSUB	c06, t2, c06
990
991	FMUL	a3, c07, t1
992	FMUL	a3, c08, t2
993
994	FSUB	c03, t1, c03
995	FSUB	c04, t2, c04
996
997	FMUL	a4, c07, t1
998	FMUL	a4, c08, t2
999
1000	FSUB	c01, t1, c01
1001	FSUB	c02, t2, c02
1002
1003	LDF	[BO + 10 * SIZE], a1
1004	LDF	[BO +  9 * SIZE], a2
1005	LDF	[BO +  8 * SIZE], a3
1006
1007	FMUL	a1, c05, c05
1008	FMUL	a1, c06, c06
1009
1010	FMUL	a2, c05, t1
1011	FMUL	a2, c06, t2
1012
1013	FSUB	c03, t1, c03
1014	FSUB	c04, t2, c04
1015
1016	FMUL	a3, c05, t1
1017	FMUL	a3, c06, t2
1018
1019	FSUB	c01, t1, c01
1020	FSUB	c02, t2, c02
1021
1022	LDF	[BO +  5 * SIZE], a1
1023	LDF	[BO +  4 * SIZE], a2
1024
1025	FMUL	a1, c03, c03
1026	FMUL	a1, c04, c04
1027
1028	FMUL	a2, c03, t1
1029	FMUL	a2, c04, t2
1030
1031	FSUB	c01, t1, c01
1032	FSUB	c02, t2, c02
1033
1034	LDF	[BO +  0 * SIZE], a1
1035
1036	FMUL	a1, c01, c01
1037	FMUL	a1, c02, c02
1038#endif
1039
1040#ifdef LN
1041	add	C1, -2 * SIZE, C1
1042	add	C2, -2 * SIZE, C2
1043	add	C3, -2 * SIZE, C3
1044	add	C4, -2 * SIZE, C4
1045#endif
1046
1047#if defined(LN) || defined(LT)
1048	STF	c01, [BO +  0 * SIZE]
1049	STF	c03, [BO +  1 * SIZE]
1050	STF	c05, [BO +  2 * SIZE]
1051	STF	c07, [BO +  3 * SIZE]
1052
1053	STF	c02, [BO +  4 * SIZE]
1054	STF	c04, [BO +  5 * SIZE]
1055	STF	c06, [BO +  6 * SIZE]
1056	STF	c08, [BO +  7 * SIZE]
1057#else
1058	STF	c01, [AO +  0 * SIZE]
1059	STF	c02, [AO +  1 * SIZE]
1060	STF	c03, [AO +  2 * SIZE]
1061	STF	c04, [AO +  3 * SIZE]
1062
1063	STF	c05, [AO +  4 * SIZE]
1064	STF	c06, [AO +  5 * SIZE]
1065	STF	c07, [AO +  6 * SIZE]
1066	STF	c08, [AO +  7 * SIZE]
1067#endif
1068
1069	STF	c01, [C1 + 0 * SIZE]
1070	STF	c02, [C1 + 1 * SIZE]
1071	STF	c03, [C2 + 0 * SIZE]
1072	STF	c04, [C2 + 1 * SIZE]
1073
1074	STF	c05, [C3 + 0 * SIZE]
1075	STF	c06, [C3 + 1 * SIZE]
1076	STF	c07, [C4 + 0 * SIZE]
1077	STF	c08, [C4 + 1 * SIZE]
1078
1079	FMOV	FZERO, t1
1080	FMOV	FZERO, t2
1081	FMOV	FZERO, t3
1082	FMOV	FZERO, t4
1083
1084#ifndef LN
1085	add	C1, 2 * SIZE, C1
1086	add	C2, 2 * SIZE, C2
1087	add	C3, 2 * SIZE, C3
1088	add	C4, 2 * SIZE, C4
1089#endif
1090
1091#ifdef RT
1092	sll	K, 1 + BASE_SHIFT, TEMP1
1093	add	AORIG, TEMP1, AORIG
1094#endif
1095
1096#if defined(LT) || defined(RN)
1097	sub	K, KK, TEMP1
1098	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
1099	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
1100	add	AO, TEMP2, AO
1101	add	BO, TEMP1, BO
1102#endif
1103
1104#ifdef LT
1105	add	KK, 2, KK
1106#endif
1107
1108#ifdef LN
1109	sub	KK, 2, KK
1110#endif
1111
/*
 * .LL70: I = number of full 4-row tiles (M >> 2); if there are none,
 * branch to .LL99 (outside this excerpt).
 */
1112.LL70:
1113	sra	M, 2, I
1114	cmp	I, 0
1115	ble,pn	%icc, .LL99
1116	nop
1117
/*
 * .LL21: set up one 4x4 tile - zero t1..t4 and c01..c16, choose the
 * k-range and the starting AO/BO for the active variant, preload the
 * first operands (including a5/b5, which the unrolled loop uses for its
 * second k-step), and prefetch the C tile for writing.
 *
 * Prefetch direction: under LN the narrower tiles step C1..C4 back
 * before storing and never advance them, so the tile about to be written
 * lies below the current pointers and is prefetched at -3 * SIZE.  Every
 * other variant stores at and above the pointers, so it prefetches at
 * +3 * SIZE.  Prefetch is only a hint; this affects cache behaviour, not
 * results.
 * NOTE(review): the 4x4 store sequence is outside this excerpt; the
 * direction is inferred from the 1- and 2-row tiles - confirm.
 */
1118.LL21:
1119	FMOV	FZERO, t1
1120	FMOV	FZERO, t2
1121	FMOV	FZERO, t3
1122	FMOV	FZERO, t4
1123
1124	FMOV	FZERO, c01
1125	FMOV	FZERO, c02
1126	FMOV	FZERO, c03
1127
/* LT/RN: accumulate over the first KK steps, B read from its base. */
1128#if defined(LT) || defined(RN)
1129	sra	KK, 2, L
1130
1131	mov	B, BO
1132	cmp	L,  0
1133#else
1134
/* LN: step AORIG back by four rows of K elements. */
1135#ifdef LN
1136	sll	K,  2 + BASE_SHIFT, TEMP1
1137	sub	AORIG, TEMP1, AORIG
1138#endif
1139
/* LN/RT: start KK steps in (both panels advance 4 elements per step)
   and accumulate over the remaining K - KK steps. */
1140	sll	KK, 2 + BASE_SHIFT, TEMP1
1141
1142	add	AORIG, TEMP1, AO
1143	add	B,     TEMP1, BO
1144
1145	sub	K, KK, TEMP1
1146
1147	sra	TEMP1, 2, L
1148	cmp	L,  0
1149#endif
1150
/* Nothing below writes the integer condition codes, so the cmp result
   is still valid at the ble that ends this block. */
1151	LDF	[AO + 0 * SIZE], a1
1152	FMOV	FZERO, c04
1153	LDF	[BO + 0 * SIZE], b1
1154	FMOV	FZERO, c05
1155	LDF	[AO + 1 * SIZE], a2
1156	FMOV	FZERO, c06
1157	LDF	[BO + 1 * SIZE], b2
1158	FMOV	FZERO, c07
1159
1160	LDF	[AO + 2 * SIZE], a3
1161	FMOV	FZERO, c08
1162	LDF	[BO + 2 * SIZE], b3
1163	FMOV	FZERO, c09
1164	LDF	[AO + 3 * SIZE], a4
1165	FMOV	FZERO, c10
1166	LDF	[BO + 3 * SIZE], b4
1167	FMOV	FZERO, c11
1168	LDF	[BO +  4 * SIZE], b5	/* ***** */
1169
1170	LDF	[AO +  4 * SIZE], a5	/* ***** */
1171
/* Function code 3 requests a prefetch for a single write. */
1172#ifdef LN
1173	prefetch [C1 - 3 * SIZE], 3
1174	FMOV	FZERO, c12
1175	prefetch [C2 - 3 * SIZE], 3
1176	FMOV	FZERO, c13
1177	prefetch [C3 - 3 * SIZE], 3
1178	FMOV	FZERO, c14
1179	prefetch [C4 - 3 * SIZE], 3
1180	FMOV	FZERO, c15
1181#else
1182	prefetch [C1 + 3 * SIZE], 3
1183	FMOV	FZERO, c12
1184	prefetch [C2 + 3 * SIZE], 3
1185	FMOV	FZERO, c13
1186	prefetch [C3 + 3 * SIZE], 3
1187	FMOV	FZERO, c14
1188	prefetch [C4 + 3 * SIZE], 3
1189	FMOV	FZERO, c15
1190#endif
1191
/* L <= 0: no full group of four k-steps.  c16 is cleared in the delay
   slot, which runs on both paths. */
1192	ble,pn	%icc, .LL25
1193	FMOV	FZERO, c16
1194
1195.LL22:
1196	FADD	c04, t1, c04
1197	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
1198	FMUL	a1, b1, t1
1199	nop
1200
1201	FADD	c08, t2, c08
1202	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
1203	FMUL	a1, b2, t2
1204	add	AO, 16 * SIZE, AO
1205
1206	FADD	c12, t3, c12
1207	LDF	[AO - 13 * SIZE], a4
1208	FMUL	a1, b3, t3
1209	add	BO, 16 * SIZE, BO
1210
1211	FADD	c16, t4, c16
1212	nop
1213	FMUL	a1, b4, t4
1214	LDF	[AO -  8 * SIZE], a1
1215
1216	FADD	c01, t1, c01
1217	nop
1218	FMUL	a2, b1, t1
1219	nop
1220
1221	FADD	c05, t2, c05
1222	nop
1223	FMUL	a2, b2, t2
1224	nop
1225
1226	FADD	c09, t3, c09
1227	nop
1228	FMUL	a2, b3, t3
1229	nop
1230
1231	FADD	c13, t4, c13
1232	add	L, -1, L
1233	FMUL	a2, b4, t4
1234	LDF	[AO - 11 * SIZE], a2
1235
1236	FADD	c02, t1, c02
1237	nop
1238	FMUL	a3, b1, t1
1239	nop
1240
1241	FADD	c06, t2, c06
1242	nop
1243	FMUL	a3, b2, t2
1244	nop
1245
1246	FADD	c10, t3, c10
1247	nop
1248	FMUL	a3, b3, t3
1249	nop
1250
1251	FADD	c14, t4, c14
1252	nop
1253	FMUL	a3, b4, t4
1254	LDF	[AO - 10 * SIZE], a3
1255
1256	FADD	c03, t1, c03
1257	nop
1258	FMUL	a4, b1, t1
1259	LDF	[BO -  8 * SIZE], b1
1260
1261	FADD	c07, t2, c07
1262	nop
1263	FMUL	a4, b2, t2
1264	LDF	[BO - 11 * SIZE], b2
1265
1266	FADD	c11, t3, c11
1267	nop
1268	FMUL	a4, b3, t3
1269	LDF	[BO - 10 * SIZE], b3
1270
1271	FADD	c15, t4, c15
1272	nop
1273	FMUL	a4, b4, t4
1274	LDF	[BO -  9 * SIZE], b4
1275
1276	FADD	c04, t1, c04
1277	nop
1278	FMUL	a5, b5, t1
1279	LDF	[AO -  9 * SIZE], a4
1280
1281	FADD	c08, t2, c08
1282	nop
1283	FMUL	a5, b2, t2
1284	nop
1285
1286	FADD	c12, t3, c12
1287	nop
1288	FMUL	a5, b3, t3
1289	nop
1290
1291	FADD	c16, t4, c16
1292	nop
1293	FMUL	a5, b4, t4
1294	LDF	[AO - 4 * SIZE], a5
1295
1296	FADD	c01, t1, c01
1297	nop
1298	FMUL	a2, b5, t1
1299	nop
1300
1301	FADD	c05, t2, c05
1302	nop
1303	FMUL	a2, b2, t2
1304	nop
1305
1306	FADD	c09, t3, c09
1307	nop
1308	FMUL	a2, b3, t3
1309	nop
1310
1311	FADD	c13, t4, c13
1312	nop
1313	FMUL	a2, b4, t4
1314	LDF	[AO -  7 * SIZE], a2
1315
1316	FADD	c02, t1, c02
1317	nop
1318	FMUL	a3, b5, t1
1319	nop
1320
1321	FADD	c06, t2, c06
1322	nop
1323	FMUL	a3, b2, t2
1324	nop
1325
1326	FADD	c10, t3, c10
1327	nop
1328	FMUL	a3, b3, t3
1329	nop
1330
1331	FADD	c14, t4, c14
1332	nop
1333	FMUL	a3, b4, t4
1334	LDF	[AO -  6 * SIZE], a3
1335
1336	FADD	c03, t1, c03
1337	nop
1338	FMUL	a4, b5, t1
1339	LDF	[BO - 4 * SIZE], b5
1340
1341	FADD	c07, t2, c07
1342	nop
1343	FMUL	a4, b2, t2
1344	LDF	[BO -  7 * SIZE], b2
1345
1346	FADD	c11, t3, c11
1347	nop
1348	FMUL	a4, b3, t3
1349	LDF	[BO -  6 * SIZE], b3
1350
1351	FADD	c15, t4, c15
1352	nop
1353	FMUL	a4, b4, t4
1354	LDF	[BO -  5 * SIZE], b4
1355
1356	FADD	c04, t1, c04
1357	nop
1358	FMUL	a1, b1, t1
1359	LDF	[AO -  5 * SIZE], a4
1360
1361	FADD	c08, t2, c08
1362	nop
1363	FMUL	a1, b2, t2
1364	nop
1365
1366	FADD	c12, t3, c12
1367	nop
1368	FMUL	a1, b3, t3
1369	nop
1370
1371	FADD	c16, t4, c16
1372	nop
1373	FMUL	a1, b4, t4
1374	LDF	[AO -  0 * SIZE], a1
1375
1376	FADD	c01, t1, c01
1377	nop
1378	FMUL	a2, b1, t1
1379	nop
1380
1381#ifdef DOUBLE
1382	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
1383#else
1384	nop
1385#endif
1386	FADD	c05, t2, c05
1387	nop
1388	FMUL	a2, b2, t2
1389
1390	FADD	c09, t3, c09
1391	nop
1392	FMUL	a2, b3, t3
1393	nop
1394
1395	FADD	c13, t4, c13
1396	nop
1397	FMUL	a2, b4, t4
1398	nop
1399
1400	FADD	c02, t1, c02
1401	nop
1402	FMUL	a3, b1, t1
1403	LDF	[AO - 3 * SIZE], a2
1404
1405	FADD	c06, t2, c06
1406#ifdef DOUBLE
1407	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
1408#else
1409	nop
1410#endif
1411	FMUL	a3, b2, t2
1412	nop
1413
1414	FADD	c10, t3, c10
1415	nop
1416	FMUL	a3, b3, t3
1417	nop
1418
1419	FADD	c14, t4, c14
1420	nop
1421	FMUL	a3, b4, t4
1422	LDF	[AO - 2 * SIZE], a3
1423
1424	FADD	c03, t1, c03
1425	nop
1426	FMUL	a4, b1, t1
1427	LDF	[BO -  0 * SIZE], b1
1428
1429	FADD	c07, t2, c07
1430	nop
1431	FMUL	a4, b2, t2
1432	LDF	[BO - 3 * SIZE], b2
1433
1434	FADD	c11, t3, c11
1435	nop
1436	FMUL	a4, b3, t3
1437	LDF	[BO - 2 * SIZE], b3
1438
1439	FADD	c15, t4, c15
1440	nop
1441	FMUL	a4, b4, t4
1442	LDF	[BO - 1 * SIZE], b4
1443
1444	FADD	c04, t1, c04
1445	nop
1446	FMUL	a5, b5, t1
1447	LDF	[AO - 1 * SIZE], a4
1448
1449	FADD	c08, t2, c08
1450	FMUL	a5, b2, t2
1451	FADD	c12, t3, c12
1452	FMUL	a5, b3, t3
1453
1454	FADD	c16, t4, c16
1455	nop
1456	FMUL	a5, b4, t4
1457	LDF	[AO +  4 * SIZE], a5
1458
1459	FADD	c01, t1, c01
1460	nop
1461	FMUL	a2, b5, t1
1462	nop
1463
1464	FADD	c05, t2, c05
1465	nop
1466	FMUL	a2, b2, t2
1467	nop
1468
1469	FADD	c09, t3, c09
1470	nop
1471	FMUL	a2, b3, t3
1472	nop
1473
1474	FADD	c13, t4, c13
1475	nop
1476	FMUL	a2, b4, t4
1477	LDF	[AO +  1 * SIZE], a2
1478
1479	FADD	c02, t1, c02
1480	nop
1481	FMUL	a3, b5, t1
1482	nop
1483
1484	FADD	c06, t2, c06
1485	nop
1486	FMUL	a3, b2, t2
1487	nop
1488
1489	FADD	c10, t3, c10
1490	nop
1491	FMUL	a3, b3, t3
1492	nop
1493
1494	FADD	c14, t4, c14
1495	nop
1496	FMUL	a3, b4, t4
1497	LDF	[AO +  2 * SIZE], a3
1498
1499	FADD	c03, t1, c03
1500	cmp	L, 0
1501	FMUL	a4, b5, t1
1502	LDF	[BO +  4 * SIZE], b5
1503
1504	FADD	c07, t2, c07
1505	nop
1506	FMUL	a4, b2, t2
1507	LDF	[BO +  1 * SIZE], b2
1508
1509	FADD	c11, t3, c11
1510	nop
1511	FMUL	a4, b3, t3
1512	LDF	[BO +  2 * SIZE], b3
1513
1514	FADD	c15, t4, c15
1515	FMUL	a4, b4, t4
1516	bg,pt	%icc, .LL22
1517	LDF	[BO +  3 * SIZE], b4
1518
/*
 * .LL25: number of k-steps left over after the 4x-unrolled 4x4 loop
 * (step count & 3).  The step count is KK for LT/RN; for LN/RT it is
 * TEMP1, which still holds K - KK from the tile setup.
 */
1519.LL25:
1520#if defined(LT) || defined(RN)
1521	and	KK,  3, L
1522#else
1523	and	TEMP1, 3, L
1524#endif
1525	cmp	L,  0
/* Nothing left: go straight to the solve stage. */
1526	ble,a,pn %icc, .LL29
1527	nop
1528
1529.LL26:
1530	FADD	c04, t1, c04
1531	LDF	[AO +  3 * SIZE], a4
1532	FMUL	a1, b1, t1
1533	add	AO, 4 * SIZE, AO
1534
1535	FADD	c08, t2, c08
1536	add	BO, 4 * SIZE, BO
1537	FMUL	a1, b2, t2
1538	add	L, -1, L
1539
1540	FADD	c12, t3, c12
1541	nop
1542	FMUL	a1, b3, t3
1543	cmp	L, 0
1544
1545	FADD	c16, t4, c16
1546	nop
1547	FMUL	a1, b4, t4
1548	LDF	[AO + 0 * SIZE], a1
1549
1550	FADD	c01, t1, c01
1551	nop
1552	FMUL	a2, b1, t1
1553	nop
1554
1555	FADD	c05, t2, c05
1556	nop
1557	FMUL	a2, b2, t2
1558	nop
1559
1560	FADD	c09, t3, c09
1561	nop
1562	FMUL	a2, b3, t3
1563	nop
1564
1565	FADD	c13, t4, c13
1566	nop
1567	FMUL	a2, b4, t4
1568	LDF	[AO + 1 * SIZE], a2
1569
1570	FADD	c02, t1, c02
1571	nop
1572	FMUL	a3, b1, t1
1573	nop
1574
1575	FADD	c06, t2, c06
1576	nop
1577	FMUL	a3, b2, t2
1578	nop
1579
1580	FADD	c10, t3, c10
1581	nop
1582	FMUL	a3, b3, t3
1583	nop
1584
1585	FADD	c14, t4, c14
1586	nop
1587	FMUL	a3, b4, t4
1588	LDF	[AO + 2 * SIZE], a3
1589
1590	FADD	c03, t1, c03
1591	nop
1592	FMUL	a4, b1, t1
1593	LDF	[BO + 0 * SIZE], b1
1594
1595	FADD	c07, t2, c07
1596	nop
1597	FMUL	a4, b2, t2
1598	LDF	[BO + 1 * SIZE], b2
1599
1600	FADD	c11, t3, c11
1601	nop
1602	FMUL	a4, b3, t3
1603	LDF	[BO + 2 * SIZE], b3
1604
1605	FADD	c15, t4, c15
1606	FMUL	a4, b4, t4
1607	bg,pt	%icc, .LL26
1608	LDF	[BO + 3 * SIZE], b4
1609
1610.LL29:
1611#if defined(LN) || defined(RT)
1612	sub	KK, 4, TEMP1
1613	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
1614	add	AORIG, TEMP1, AO
1615	add	B,     TEMP1, BO
1616#endif
1617
1618	FADD	c04, t1, c04
1619	FADD	c08, t2, c08
1620	FADD	c12, t3, c12
1621	FADD	c16, t4, c16
1622
1623#if defined(LN) || defined(LT)
1624	LDF	[BO +  0 * SIZE], a1
1625	LDF	[BO +  1 * SIZE], a2
1626	LDF	[BO +  2 * SIZE], a3
1627	LDF	[BO +  3 * SIZE], a4
1628
1629	LDF	[BO +  4 * SIZE], b1
1630	LDF	[BO +  5 * SIZE], b2
1631	LDF	[BO +  6 * SIZE], b3
1632	LDF	[BO +  7 * SIZE], b4
1633
1634	FSUB	a1, c01, c01
1635	FSUB	a2, c05, c05
1636	FSUB	a3, c09, c09
1637	FSUB	a4, c13, c13
1638
1639	FSUB	b1, c02, c02
1640	FSUB	b2, c06, c06
1641	FSUB	b3, c10, c10
1642	FSUB	b4, c14, c14
1643
1644	LDF	[BO +  8 * SIZE], a1
1645	LDF	[BO +  9 * SIZE], a2
1646	LDF	[BO + 10 * SIZE], a3
1647	LDF	[BO + 11 * SIZE], a4
1648
1649	LDF	[BO + 12 * SIZE], b1
1650	LDF	[BO + 13 * SIZE], b2
1651	LDF	[BO + 14 * SIZE], b3
1652	LDF	[BO + 15 * SIZE], b4
1653
1654	FSUB	a1, c03, c03
1655	FSUB	a2, c07, c07
1656	FSUB	a3, c11, c11
1657	FSUB	a4, c15, c15
1658
1659	FSUB	b1, c04, c04
1660	FSUB	b2, c08, c08
1661	FSUB	b3, c12, c12
1662	FSUB	b4, c16, c16
1663#else
1664	LDF	[AO +  0 * SIZE], a1
1665	LDF	[AO +  1 * SIZE], a2
1666	LDF	[AO +  2 * SIZE], a3
1667	LDF	[AO +  3 * SIZE], a4
1668
1669	LDF	[AO +  4 * SIZE], b1
1670	LDF	[AO +  5 * SIZE], b2
1671	LDF	[AO +  6 * SIZE], b3
1672	LDF	[AO +  7 * SIZE], b4
1673
1674	FSUB	a1, c01, c01
1675	FSUB	a2, c02, c02
1676	FSUB	a3, c03, c03
1677	FSUB	a4, c04, c04
1678
1679	FSUB	b1, c05, c05
1680	FSUB	b2, c06, c06
1681	FSUB	b3, c07, c07
1682	FSUB	b4, c08, c08
1683
1684	LDF	[AO +  8 * SIZE], a1
1685	LDF	[AO +  9 * SIZE], a2
1686	LDF	[AO + 10 * SIZE], a3
1687	LDF	[AO + 11 * SIZE], a4
1688
1689	LDF	[AO + 12 * SIZE], b1
1690	LDF	[AO + 13 * SIZE], b2
1691	LDF	[AO + 14 * SIZE], b3
1692	LDF	[AO + 15 * SIZE], b4
1693
1694	FSUB	a1, c09, c09
1695	FSUB	a2, c10, c10
1696	FSUB	a3, c11, c11
1697	FSUB	a4, c12, c12
1698
1699	FSUB	b1, c13, c13
1700	FSUB	b2, c14, c14
1701	FSUB	b3, c15, c15
1702	FSUB	b4, c16, c16
1703#endif
1704
1705#ifdef LN
1706	LDF	[AO + 15 * SIZE], a1
1707	LDF	[AO + 14 * SIZE], a2
1708	LDF	[AO + 13 * SIZE], a3
1709	LDF	[AO + 12 * SIZE], a4
1710
1711	FMUL	a1, c04, c04
1712	FMUL	a1, c08, c08
1713	FMUL	a1, c12, c12
1714	FMUL	a1, c16, c16
1715
1716	FMUL	a2, c04, t1
1717	FMUL	a2, c08, t2
1718	FMUL	a2, c12, t3
1719	FMUL	a2, c16, t4
1720
1721	FSUB	c03, t1, c03
1722	FSUB	c07, t2, c07
1723	FSUB	c11, t3, c11
1724	FSUB	c15, t4, c15
1725
1726	FMUL	a3, c04, t1
1727	FMUL	a3, c08, t2
1728	FMUL	a3, c12, t3
1729	FMUL	a3, c16, t4
1730
1731	FSUB	c02, t1, c02
1732	FSUB	c06, t2, c06
1733	FSUB	c10, t3, c10
1734	FSUB	c14, t4, c14
1735
1736	FMUL	a4, c04, t1
1737	FMUL	a4, c08, t2
1738	FMUL	a4, c12, t3
1739	FMUL	a4, c16, t4
1740
1741	FSUB	c01, t1, c01
1742	FSUB	c05, t2, c05
1743	FSUB	c09, t3, c09
1744	FSUB	c13, t4, c13
1745
1746	LDF	[AO + 10 * SIZE], a1
1747	LDF	[AO +  9 * SIZE], a2
1748	LDF	[AO +  8 * SIZE], a3
1749
1750	FMUL	a1, c03, c03
1751	FMUL	a1, c07, c07
1752	FMUL	a1, c11, c11
1753	FMUL	a1, c15, c15
1754
1755	FMUL	a2, c03, t1
1756	FMUL	a2, c07, t2
1757	FMUL	a2, c11, t3
1758	FMUL	a2, c15, t4
1759
1760	FSUB	c02, t1, c02
1761	FSUB	c06, t2, c06
1762	FSUB	c10, t3, c10
1763	FSUB	c14, t4, c14
1764
1765	FMUL	a3, c03, t1
1766	FMUL	a3, c07, t2
1767	FMUL	a3, c11, t3
1768	FMUL	a3, c15, t4
1769
1770	FSUB	c01, t1, c01
1771	FSUB	c05, t2, c05
1772	FSUB	c09, t3, c09
1773	FSUB	c13, t4, c13
1774
1775	LDF	[AO +  5 * SIZE], a1
1776	LDF	[AO +  4 * SIZE], a2
1777
1778	FMUL	a1, c02, c02
1779	FMUL	a1, c06, c06
1780	FMUL	a1, c10, c10
1781	FMUL	a1, c14, c14
1782
1783	FMUL	a2, c02, t1
1784	FMUL	a2, c06, t2
1785	FMUL	a2, c10, t3
1786	FMUL	a2, c14, t4
1787
1788	FSUB	c01, t1, c01
1789	FSUB	c05, t2, c05
1790	FSUB	c09, t3, c09
1791	FSUB	c13, t4, c13
1792
1793	LDF	[AO +  0 * SIZE], a1
1794
1795	FMUL	a1, c01, c01
1796	FMUL	a1, c05, c05
1797	FMUL	a1, c09, c09
1798	FMUL	a1, c13, c13
1799#endif
1800
1801#ifdef LT
1802	LDF	[AO +  0 * SIZE], a1
1803	LDF	[AO +  1 * SIZE], a2
1804	LDF	[AO +  2 * SIZE], a3
1805	LDF	[AO +  3 * SIZE], a4
1806
1807	FMUL	a1, c01, c01
1808	FMUL	a1, c05, c05
1809	FMUL	a1, c09, c09
1810	FMUL	a1, c13, c13
1811
1812	FMUL	a2, c01, t1
1813	FMUL	a2, c05, t2
1814	FMUL	a2, c09, t3
1815	FMUL	a2, c13, t4
1816
1817	FSUB	c02, t1, c02
1818	FSUB	c06, t2, c06
1819	FSUB	c10, t3, c10
1820	FSUB	c14, t4, c14
1821
1822	FMUL	a3, c01, t1
1823	FMUL	a3, c05, t2
1824	FMUL	a3, c09, t3
1825	FMUL	a3, c13, t4
1826
1827	FSUB	c03, t1, c03
1828	FSUB	c07, t2, c07
1829	FSUB	c11, t3, c11
1830	FSUB	c15, t4, c15
1831
1832	FMUL	a4, c01, t1
1833	FMUL	a4, c05, t2
1834	FMUL	a4, c09, t3
1835	FMUL	a4, c13, t4
1836
1837	FSUB	c04, t1, c04
1838	FSUB	c08, t2, c08
1839	FSUB	c12, t3, c12
1840	FSUB	c16, t4, c16
1841
1842	LDF	[AO +  5 * SIZE], a1
1843	LDF	[AO +  6 * SIZE], a2
1844	LDF	[AO +  7 * SIZE], a3
1845
1846	FMUL	a1, c02, c02
1847	FMUL	a1, c06, c06
1848	FMUL	a1, c10, c10
1849	FMUL	a1, c14, c14
1850
1851	FMUL	a2, c02, t1
1852	FMUL	a2, c06, t2
1853	FMUL	a2, c10, t3
1854	FMUL	a2, c14, t4
1855
1856	FSUB	c03, t1, c03
1857	FSUB	c07, t2, c07
1858	FSUB	c11, t3, c11
1859	FSUB	c15, t4, c15
1860
1861	FMUL	a3, c02, t1
1862	FMUL	a3, c06, t2
1863	FMUL	a3, c10, t3
1864	FMUL	a3, c14, t4
1865
1866	FSUB	c04, t1, c04
1867	FSUB	c08, t2, c08
1868	FSUB	c12, t3, c12
1869	FSUB	c16, t4, c16
1870
1871	LDF	[AO + 10 * SIZE], a1
1872	LDF	[AO + 11 * SIZE], a2
1873
1874	FMUL	a1, c03, c03
1875	FMUL	a1, c07, c07
1876	FMUL	a1, c11, c11
1877	FMUL	a1, c15, c15
1878
1879	FMUL	a2, c03, t1
1880	FMUL	a2, c07, t2
1881	FMUL	a2, c11, t3
1882	FMUL	a2, c15, t4
1883
1884	FSUB	c04, t1, c04
1885	FSUB	c08, t2, c08
1886	FSUB	c12, t3, c12
1887	FSUB	c16, t4, c16
1888
1889	LDF	[AO + 15 * SIZE], a1
1890
1891	FMUL	a1, c04, c04
1892	FMUL	a1, c08, c08
1893	FMUL	a1, c12, c12
1894	FMUL	a1, c16, c16
1895#endif
1896
1897#ifdef RN
1898	LDF	[BO +  0 * SIZE], a1
1899	LDF	[BO +  1 * SIZE], a2
1900	LDF	[BO +  2 * SIZE], a3
1901	LDF	[BO +  3 * SIZE], a4
1902
1903	FMUL	a1, c01, c01
1904	FMUL	a1, c02, c02
1905	FMUL	a1, c03, c03
1906	FMUL	a1, c04, c04
1907
1908	FMUL	a2, c01, t1
1909	FMUL	a2, c02, t2
1910	FMUL	a2, c03, t3
1911	FMUL	a2, c04, t4
1912
1913	FSUB	c05, t1, c05
1914	FSUB	c06, t2, c06
1915	FSUB	c07, t3, c07
1916	FSUB	c08, t4, c08
1917
1918	FMUL	a3, c01, t1
1919	FMUL	a3, c02, t2
1920	FMUL	a3, c03, t3
1921	FMUL	a3, c04, t4
1922
1923	FSUB	c09, t1, c09
1924	FSUB	c10, t2, c10
1925	FSUB	c11, t3, c11
1926	FSUB	c12, t4, c12
1927
1928	FMUL	a4, c01, t1
1929	FMUL	a4, c02, t2
1930	FMUL	a4, c03, t3
1931	FMUL	a4, c04, t4
1932
1933	FSUB	c13, t1, c13
1934	FSUB	c14, t2, c14
1935	FSUB	c15, t3, c15
1936	FSUB	c16, t4, c16
1937
1938	LDF	[BO +  5 * SIZE], a1
1939	LDF	[BO +  6 * SIZE], a2
1940	LDF	[BO +  7 * SIZE], a3
1941
1942	FMUL	a1, c05, c05
1943	FMUL	a1, c06, c06
1944	FMUL	a1, c07, c07
1945	FMUL	a1, c08, c08
1946
1947	FMUL	a2, c05, t1
1948	FMUL	a2, c06, t2
1949	FMUL	a2, c07, t3
1950	FMUL	a2, c08, t4
1951
1952	FSUB	c09, t1, c09
1953	FSUB	c10, t2, c10
1954	FSUB	c11, t3, c11
1955	FSUB	c12, t4, c12
1956
1957	FMUL	a3, c05, t1
1958	FMUL	a3, c06, t2
1959	FMUL	a3, c07, t3
1960	FMUL	a3, c08, t4
1961
1962	FSUB	c13, t1, c13
1963	FSUB	c14, t2, c14
1964	FSUB	c15, t3, c15
1965	FSUB	c16, t4, c16
1966
1967	LDF	[BO + 10 * SIZE], a1
1968	LDF	[BO + 11 * SIZE], a2
1969
1970	FMUL	a1, c09, c09
1971	FMUL	a1, c10, c10
1972	FMUL	a1, c11, c11
1973	FMUL	a1, c12, c12
1974
1975	FMUL	a2, c09, t1
1976	FMUL	a2, c10, t2
1977	FMUL	a2, c11, t3
1978	FMUL	a2, c12, t4
1979
1980	FSUB	c13, t1, c13
1981	FSUB	c14, t2, c14
1982	FSUB	c15, t3, c15
1983	FSUB	c16, t4, c16
1984
1985	LDF	[BO + 15 * SIZE], a1
1986
1987	FMUL	a1, c13, c13
1988	FMUL	a1, c14, c14
1989	FMUL	a1, c15, c15
1990	FMUL	a1, c16, c16
1991#endif
1992
1993#ifdef RT
1994	LDF	[BO + 15 * SIZE], a1
1995	LDF	[BO + 14 * SIZE], a2
1996	LDF	[BO + 13 * SIZE], a3
1997	LDF	[BO + 12 * SIZE], a4
1998
1999	FMUL	a1, c13, c13
2000	FMUL	a1, c14, c14
2001	FMUL	a1, c15, c15
2002	FMUL	a1, c16, c16
2003
2004	FMUL	a2, c13, t1
2005	FMUL	a2, c14, t2
2006	FMUL	a2, c15, t3
2007	FMUL	a2, c16, t4
2008
2009	FSUB	c09, t1, c09
2010	FSUB	c10, t2, c10
2011	FSUB	c11, t3, c11
2012	FSUB	c12, t4, c12
2013
2014	FMUL	a3, c13, t1
2015	FMUL	a3, c14, t2
2016	FMUL	a3, c15, t3
2017	FMUL	a3, c16, t4
2018
2019	FSUB	c05, t1, c05
2020	FSUB	c06, t2, c06
2021	FSUB	c07, t3, c07
2022	FSUB	c08, t4, c08
2023
2024	FMUL	a4, c13, t1
2025	FMUL	a4, c14, t2
2026	FMUL	a4, c15, t3
2027	FMUL	a4, c16, t4
2028
2029	FSUB	c01, t1, c01
2030	FSUB	c02, t2, c02
2031	FSUB	c03, t3, c03
2032	FSUB	c04, t4, c04
2033
2034	LDF	[BO + 10 * SIZE], a1
2035	LDF	[BO +  9 * SIZE], a2
2036	LDF	[BO +  8 * SIZE], a3
2037
2038	FMUL	a1, c09, c09
2039	FMUL	a1, c10, c10
2040	FMUL	a1, c11, c11
2041	FMUL	a1, c12, c12
2042
2043	FMUL	a2, c09, t1
2044	FMUL	a2, c10, t2
2045	FMUL	a2, c11, t3
2046	FMUL	a2, c12, t4
2047
2048	FSUB	c05, t1, c05
2049	FSUB	c06, t2, c06
2050	FSUB	c07, t3, c07
2051	FSUB	c08, t4, c08
2052
2053	FMUL	a3, c09, t1
2054	FMUL	a3, c10, t2
2055	FMUL	a3, c11, t3
2056	FMUL	a3, c12, t4
2057
2058	FSUB	c01, t1, c01
2059	FSUB	c02, t2, c02
2060	FSUB	c03, t3, c03
2061	FSUB	c04, t4, c04
2062
2063	LDF	[BO +  5 * SIZE], a1
2064	LDF	[BO +  4 * SIZE], a2
2065
2066	FMUL	a1, c05, c05
2067	FMUL	a1, c06, c06
2068	FMUL	a1, c07, c07
2069	FMUL	a1, c08, c08
2070
2071	FMUL	a2, c05, t1
2072	FMUL	a2, c06, t2
2073	FMUL	a2, c07, t3
2074	FMUL	a2, c08, t4
2075
2076	FSUB	c01, t1, c01
2077	FSUB	c02, t2, c02
2078	FSUB	c03, t3, c03
2079	FSUB	c04, t4, c04
2080
2081	LDF	[BO +  0 * SIZE], a1
2082
2083	FMUL	a1, c01, c01
2084	FMUL	a1, c02, c02
2085	FMUL	a1, c03, c03
2086	FMUL	a1, c04, c04
2087#endif
2088
2089#ifdef LN
2090	add	C1, -4 * SIZE, C1
2091	add	C2, -4 * SIZE, C2
2092	add	C3, -4 * SIZE, C3
2093	add	C4, -4 * SIZE, C4
2094#endif
2095
2096#if defined(LN) || defined(LT)
2097	STF	c01, [BO +  0 * SIZE]
2098	STF	c05, [BO +  1 * SIZE]
2099	STF	c09, [BO +  2 * SIZE]
2100	STF	c13, [BO +  3 * SIZE]
2101
2102	STF	c02, [BO +  4 * SIZE]
2103	STF	c06, [BO +  5 * SIZE]
2104	STF	c10, [BO +  6 * SIZE]
2105	STF	c14, [BO +  7 * SIZE]
2106
2107	STF	c03, [BO +  8 * SIZE]
2108	STF	c07, [BO +  9 * SIZE]
2109	STF	c11, [BO + 10 * SIZE]
2110	STF	c15, [BO + 11 * SIZE]
2111
2112	STF	c04, [BO + 12 * SIZE]
2113	STF	c08, [BO + 13 * SIZE]
2114	STF	c12, [BO + 14 * SIZE]
2115	STF	c16, [BO + 15 * SIZE]
2116#else
2117	STF	c01, [AO +  0 * SIZE]
2118	STF	c02, [AO +  1 * SIZE]
2119	STF	c03, [AO +  2 * SIZE]
2120	STF	c04, [AO +  3 * SIZE]
2121
2122	STF	c05, [AO +  4 * SIZE]
2123	STF	c06, [AO +  5 * SIZE]
2124	STF	c07, [AO +  6 * SIZE]
2125	STF	c08, [AO +  7 * SIZE]
2126
2127	STF	c09, [AO +  8 * SIZE]
2128	STF	c10, [AO +  9 * SIZE]
2129	STF	c11, [AO + 10 * SIZE]
2130	STF	c12, [AO + 11 * SIZE]
2131
2132	STF	c13, [AO + 12 * SIZE]
2133	STF	c14, [AO + 13 * SIZE]
2134	STF	c15, [AO + 14 * SIZE]
2135	STF	c16, [AO + 15 * SIZE]
2136#endif
2137
2138	STF	c01, [C1 + 0 * SIZE]
2139	STF	c02, [C1 + 1 * SIZE]
2140	STF	c03, [C1 + 2 * SIZE]
2141	STF	c04, [C1 + 3 * SIZE]
2142
2143	STF	c05, [C2 + 0 * SIZE]
2144	STF	c06, [C2 + 1 * SIZE]
2145	STF	c07, [C2 + 2 * SIZE]
2146	STF	c08, [C2 + 3 * SIZE]
2147
2148	STF	c09, [C3 + 0 * SIZE]
2149	STF	c10, [C3 + 1 * SIZE]
2150	STF	c11, [C3 + 2 * SIZE]
2151	STF	c12, [C3 + 3 * SIZE]
2152
2153	STF	c13, [C4 + 0 * SIZE]
2154	STF	c14, [C4 + 1 * SIZE]
2155	STF	c15, [C4 + 2 * SIZE]
2156	STF	c16, [C4 + 3 * SIZE]
2157
2158	FMOV	FZERO, t1
2159	FMOV	FZERO, t2
2160	FMOV	FZERO, t3
2161	FMOV	FZERO, t4
2162
2163#ifndef LN
2164	add	C1, 4 * SIZE, C1
2165	add	C2, 4 * SIZE, C2
2166	add	C3, 4 * SIZE, C3
2167	add	C4, 4 * SIZE, C4
2168#endif
2169
2170#ifdef RT
2171	sll	K, 2 + BASE_SHIFT, TEMP1
2172	add	AORIG, TEMP1, AORIG
2173#endif
2174
2175#if defined(LT) || defined(RN)
2176	sub	K, KK, TEMP1
2177	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
2178	add	AO, TEMP1, AO
2179	add	BO, TEMP1, BO
2180#endif
2181
2182#ifdef LT
2183	add	KK, 4, KK
2184#endif
2185
2186#ifdef LN
2187	sub	KK, 4, KK
2188#endif
2189
2190	add	I, -1, I
2191	cmp	I, 0
2192
2193	sra	K, 2, L
2194	bg,pt	%icc, .LL21
2195	FMOV	FZERO, c01
2196
2197
2198
2199
2200
2201
2202
2203.LL99:  /* end of one J-loop pass: step B and KK, then loop back to .LL11 while J > 0 */
2204#ifdef LN
2205	sll	K, 2 + BASE_SHIFT, TEMP1
2206	add	B, TEMP1, B	/* B += K << (2 + BASE_SHIFT) */
2207#endif
2208
2209#if defined(LT) || defined(RN)
2210	mov	BO, B	/* B = BO (the running pointer becomes the new base) */
2211#endif
2212
2213#ifdef RN
2214	add	KK, 4, KK
2215#endif
2216
2217#ifdef RT
2218	sub	KK, 4, KK
2219#endif
2220
2221	add	J, -1, J	/* J-- */
2222	cmp	J, 0
2223	bg,pt	%icc, .LL11	/* another pass while J > 0 */
2224	nop	/* branch delay slot */
2225
2226.LL100:  /* n & 2: section executed only when bit 1 of N is set; uses two C pointers (C1, C2) */
2227	and	N, 2, J
2228	cmp	J, 0
2229	ble,pn	%icc, .LL200	/* bit clear: skip the whole section */
2230	nop	/* branch delay slot */
2231
2232#ifdef RT
2233	sll	K, 1 + BASE_SHIFT, TEMP1
2234	sub	B, TEMP1, B	/* B -= K << (1 + BASE_SHIFT) */
2235
2236	sll	LDC, 1, TEMP1
2237	sub	C, TEMP1, C	/* C -= LDC << 1 */
2238#endif
2239
2240	mov	C, C1	/* C1 = C */
2241	add	C, LDC, C2	/* C2 = C + LDC */
2242
2243#ifdef LN
2244	add	M, OFFSET, KK	/* KK = M + OFFSET */
2245#endif
2246
2247#ifdef LT
2248	mov	OFFSET, KK	/* KK = OFFSET */
2249#endif
2250
2251#if defined(LN) || defined(RT)
2252	mov	A, AORIG
2253#else
2254	mov	A, AO
2255#endif
2256
2257#ifndef RT
2258	add	C2, LDC, C	/* C = C2 + LDC */
2259#endif
2260
2261	and	M, 1, I
2262	cmp	I, 0
2263	ble,pn	%icc, .LL150	/* bit 0 of M clear: nothing to do for the M & 1 part */
2264	nop	/* branch delay slot */
2265
2266#if defined(LT) || defined(RN)
2267	sra	KK, 2, L	/* L = KK >> 2, pass count for .LL172 */
2268
2269	mov	B, BO
2270	cmp	L,  0	/* flags are consumed by the ble just before .LL172 */
2271#else
2272
2273#ifdef LN
2274	sll	K,  0 + BASE_SHIFT, TEMP1
2275	sub	AORIG, TEMP1, AORIG	/* AORIG -= K << BASE_SHIFT */
2276#endif
2277
2278	sll	KK, 0 + BASE_SHIFT, TEMP1
2279	sll	KK, 1 + BASE_SHIFT, TEMP2
2280
2281	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << BASE_SHIFT) */
2282	add	B,     TEMP2, BO	/* BO = B + (KK << (1 + BASE_SHIFT)) */
2283
2284	sub	K, KK, TEMP1	/* TEMP1 = K - KK; .LL175 reads TEMP1 & 3 later */
2285	sra	TEMP1, 2, L	/* L = (K - KK) >> 2 */
2286	cmp	L,  0	/* flags are consumed by the ble just before .LL172 */
2287#endif
2288
2289	LDF	[AO + 0 * SIZE], a1	/* preload a1..a4 and b1..b4; clear c01..c04 and t1..t4 */
2290	FMOV	FZERO, c01
2291	LDF	[BO + 0 * SIZE], b1
2292	FMOV	FZERO, t1
2293
2294 	LDF	[AO + 1 * SIZE], a2
2295	FMOV	FZERO, c02
2296	LDF	[BO + 1 * SIZE], b2
2297	FMOV	FZERO, t2
2298
2299	LDF	[AO + 2 * SIZE], a3
2300	FMOV	FZERO, c03
2301
2302	LDF	[BO + 2 * SIZE], b3
2303	FMOV	FZERO, t3
2304
2305	LDF	[AO + 3 * SIZE], a4
2306	FMOV	FZERO, c04
2307	LDF	[BO + 3 * SIZE], b4
2308	FMOV	FZERO, t4
2309
2310	ble,pn	%icc, .LL175	/* L <= 0: skip .LL172 (relies on LDF/FMOV leaving %icc untouched -- confirm macros) */
2311	nop	/* branch delay slot */
2312
2313.LL172:  /* unrolled loop for the M & 1 part: each pass steps AO by 4 * SIZE and BO by 8 * SIZE */
2314	FADD	c01, t1, c01	/* add the pending product in t1 before t1 is overwritten below */
2315	add	AO,  4 * SIZE, AO	/* AO is advanced first; the a-loads below address the new AO */
2316	FMUL	a1, b1, t1
2317	LDF	[BO + 4 * SIZE], b1
2318
2319	FADD	c02, t2, c02
2320	FMUL	a1, b2, t2
2321	LDF	[BO + 5 * SIZE], b2
2322
2323	add	L, -1, L	/* L-- */
2324	LDF	[AO + 0 * SIZE], a1
2325
2326	FADD	c03, t3, c03	/* a2/a4 products go to c03/c04; .LL179 later adds them into c01/c02 */
2327	cmp	L, 0	/* sets %icc for the bg at the bottom of the loop */
2328	FMUL	a2, b3, t3
2329	LDF	[BO + 6 * SIZE], b3
2330
2331	FADD	c04, t4, c04
2332	FMUL	a2, b4, t4
2333	LDF	[BO + 7 * SIZE], b4
2334	LDF	[AO + 1 * SIZE], a2
2335
2336	FADD	c01, t1, c01
2337	FMUL	a3, b1, t1
2338	LDF	[BO +  8 * SIZE], b1
2339
2340	FADD	c02, t2, c02
2341	FMUL	a3, b2, t2
2342	LDF	[BO +  9 * SIZE], b2
2343	LDF	[AO + 2 * SIZE], a3
2344
2345	FADD	c03, t3, c03
2346	FMUL	a4, b3, t3
2347	LDF	[BO + 10 * SIZE], b3
2348	FADD	c04, t4, c04
2349	FMUL	a4, b4, t4
2350	LDF	[BO + 11 * SIZE], b4
2351	add	BO,  8 * SIZE, BO	/* BO += 8 * SIZE */
2352
2353	bg,pt	%icc, .LL172	/* repeat while L > 0 */
2354	LDF	[AO + 3 * SIZE], a4	/* branch delay slot */
2355
2356.LL175:  /* leftover steps after .LL172: L = count & 3 */
2357#if defined(LT) || defined(RN)
2358	and	KK,  3, L
2359#else
2360	and	TEMP1, 3, L	/* TEMP1 must still hold the K - KK computed before .LL172 */
2361#endif
2362	cmp	L,  0
2363	ble,a,pn %icc, .LL179	/* nothing left: go straight to .LL179 */
2364	nop	/* delay slot of an annulling branch */
2365
2366.LL176:  /* single-step loop: each pass steps AO by 1 * SIZE and BO by 2 * SIZE */
2367	FADD	c01, t1, c01
2368	add	L, -1, L	/* L-- */
2369	FMUL	a1, b1, t1
2370	add	AO, 1 * SIZE, AO
2371	LDF	[BO + 2 * SIZE], b1	/* next b1, addressed from the not-yet-advanced BO */
2372	FADD	c02, t2, c02
2373	cmp	L, 0	/* sets %icc for the bg below */
2374	FMUL	a1, b2, t2
2375	LDF	[BO + 3 * SIZE], b2
2376
2377	add	BO, 2 * SIZE, BO
2378	bg,pt	%icc, .LL176	/* repeat while L > 0 */
2379	LDF	[AO + 0 * SIZE], a1	/* branch delay slot: next a1 */
2380
2381.LL179:  /* M & 1 part: drain pending products, solve, store one value to each of C1 and C2 */
2382	FADD	c01, t1, c01	/* add the last products still held in t1..t4 */
2383	FADD	c02, t2, c02
2384	FADD	c03, t3, c03
2385	FADD	c04, t4, c04
2386
2387	FADD	c01, c03, c01	/* merge the partial sums that .LL172 kept in c03/c04 */
2388	FADD	c02, c04, c02
2389
2390
2391#if defined(LN) || defined(RT)
2392#ifdef LN
2393	sub	KK, 1, TEMP1	/* LN: TEMP1 = KK - 1 */
2394#else
2395	sub	KK, 2, TEMP1	/* RT: TEMP1 = KK - 2 */
2396#endif
2397	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
2398	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
2399	add	AORIG, TEMP2, AO	/* AO = AORIG + (index << BASE_SHIFT) */
2400	add	B,     TEMP1, BO	/* BO = B + (index << (1 + BASE_SHIFT)) */
2401#endif
2402
2403#if defined(LN) || defined(LT)
2404	LDF	[BO +  0 * SIZE], a1
2405	LDF	[BO +  1 * SIZE], a2
2406
2407	FSUB	a1, c01, c01	/* presumably c01 = a1 - c01 (SPARC fsub operand order) -- confirm FSUB macro */
2408	FSUB	a2, c02, c02
2409#else
2410	LDF	[AO +  0 * SIZE], a1
2411	LDF	[AO +  1 * SIZE], a2
2412
2413	FSUB	a1, c01, c01	/* same subtraction, with the values taken from AO instead of BO */
2414	FSUB	a2, c02, c02
2415#endif
2416
2417#ifdef LN
2418	LDF	[AO +  0 * SIZE], a1
2419
2420	FMUL	a1, c01, c01	/* scale both results by [AO + 0] */
2421	FMUL	a1, c02, c02
2422#endif
2423
2424#ifdef LT
2425	LDF	[AO +  0 * SIZE], a1
2426
2427	FMUL	a1, c01, c01	/* scale both results by [AO + 0] */
2428	FMUL	a1, c02, c02
2429#endif
2430
2431#ifdef RN
2432	LDF	[BO +  0 * SIZE], a1
2433	LDF	[BO +  1 * SIZE], a2
2434	LDF	[BO +  3 * SIZE], a3
2435
2436	FMUL	a1, c01, c01	/* c01 *= BO[0] */
2437	FMUL	a2, c01, t1	/* t1 = BO[1] * c01 */
2438	FSUB	c02, t1, c02	/* presumably c02 -= t1 -- confirm FSUB macro */
2439	FMUL	a3, c02, c02	/* c02 *= BO[3]; NOTE(review): multiply implies BO[0]/BO[3] are stored pre-inverted -- confirm */
2440#endif
2441
2442#ifdef RT
2443	LDF	[BO +  3 * SIZE], a1
2444	LDF	[BO +  2 * SIZE], a2
2445	LDF	[BO +  0 * SIZE], a3
2446
2447	FMUL	a1, c02, c02	/* c02 *= BO[3] */
2448	FMUL	a2, c02, t1	/* t1 = BO[2] * c02 */
2449	FSUB	c01, t1, c01	/* presumably c01 -= t1 -- confirm FSUB macro */
2450	FMUL	a3, c01, c01	/* c01 *= BO[0] */
2451#endif
2452
2453#ifdef LN
2454	add	C1, -1 * SIZE, C1	/* LN steps the C pointers back before storing */
2455	add	C2, -1 * SIZE, C2
2456#endif
2457
2458#if defined(LN) || defined(LT)
2459	STF	c01, [BO +  0 * SIZE]	/* write the results back over the values loaded from BO */
2460	STF	c02, [BO +  1 * SIZE]
2461#else
2462	STF	c01, [AO +  0 * SIZE]	/* write the results back over the values loaded from AO */
2463	STF	c02, [AO +  1 * SIZE]
2464#endif
2465
2466	STF	c01, [C1 + 0 * SIZE]
2467	STF	c02, [C2 + 0 * SIZE]
2468
2469	FMOV	FZERO, t1	/* clear t1..t4 so the next accumulation starts with no pending product */
2470	FMOV	FZERO, t2
2471	FMOV	FZERO, t3
2472	FMOV	FZERO, t4
2473
2474#ifndef LN
2475	add	C1, 1 * SIZE, C1	/* non-LN variants step the C pointers forward after storing */
2476	add	C2, 1 * SIZE, C2
2477#endif
2478
2479#ifdef RT
2480	sll	K, 0 + BASE_SHIFT, TEMP1
2481	add	AORIG, TEMP1, AORIG	/* AORIG += K << BASE_SHIFT */
2482#endif
2483
2484#if defined(LT) || defined(RN)
2485	sub	K, KK, TEMP1	/* TEMP1 = K - KK */
2486	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
2487	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
2488	add	AO, TEMP2, AO	/* AO += (K - KK) << BASE_SHIFT */
2489	add	BO, TEMP1, BO	/* BO += (K - KK) << (1 + BASE_SHIFT) */
2490#endif
2491
2492#ifdef LT
2493	add	KK, 1, KK
2494#endif
2495
2496#ifdef LN
2497	sub	KK, 1, KK
2498#endif
2499
2500.LL150:  /* M & 2 part of the n & 2 section: set up pointers and preload operands */
2501	and	M, 2, I
2502	cmp	I, 0
2503	ble,pn	%icc, .LL170	/* bit 1 of M clear: skip to .LL170 */
2504	nop	/* branch delay slot */
2505
2506#if defined(LT) || defined(RN)
2507	sra	KK, 2, L	/* L = KK >> 2, pass count for .LL152 */
2508
2509	mov	B, BO
2510	cmp	L,  0
2511#else
2512
2513#ifdef LN
2514	sll	K,  1 + BASE_SHIFT, TEMP1
2515	sub	AORIG, TEMP1, AORIG	/* AORIG -= K << (1 + BASE_SHIFT) */
2516#endif
2517
2518	sll	KK, 1 + BASE_SHIFT, TEMP1
2519	sll	KK, 1 + BASE_SHIFT, TEMP2
2520
2521	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << (1 + BASE_SHIFT)) */
2522	add	B,     TEMP2, BO	/* BO = B + (KK << (1 + BASE_SHIFT)) */
2523
2524	sub	K, KK, TEMP1	/* TEMP1 = K - KK; .LL155 reads TEMP1 & 3 later */
2525	sra	TEMP1, 2, L	/* L = (K - KK) >> 2 */
2526	cmp	L,  0
2527#endif
2528
2529	LDF	[AO + 0 * SIZE], a1	/* preload a1..a4 and b1..b4; clear c01..c04 and t1..t4 */
2530	FMOV	FZERO, c01
2531	LDF	[BO + 0 * SIZE], b1
2532	FMOV	FZERO, t1
2533
2534	LDF	[AO + 1 * SIZE], a2
2535	cmp	L,  0	/* same compare as at the end of both #if arms above */
2536	FMOV	FZERO, c02
2537	LDF	[BO + 1 * SIZE], b2
2538	FMOV	FZERO, t2
2539
2540	LDF	[AO + 2 * SIZE], a3
2541	FMOV	FZERO, c03
2542	LDF	[BO + 2 * SIZE], b3
2543	FMOV	FZERO, t3
2544
2545	LDF	[AO + 3 * SIZE], a4
2546	FMOV	FZERO, c04
2547	LDF	[BO + 3 * SIZE], b4
2548	FMOV	FZERO, t4
2549	ble,pn	%icc, .LL155	/* L <= 0: skip the unrolled loop */
2550	nop	/* branch delay slot */
2551
2552.LL152:  /* unrolled loop for the M & 2 part: each pass steps AO and BO by 8 * SIZE */
2553	FADD	c01, t1, c01	/* add the pending product in t1 before t1 is overwritten below */
2554	add	L, -1, L	/* L-- */
2555	FMUL	a1, b1, t1	/* a1 products accumulate in c01/c02, a2 products in c03/c04 */
2556	prefetch [AO + APREFETCHSIZE * SIZE], 0
2557
2558	FADD	c02, t2, c02
2559	add	BO,  8 * SIZE, BO	/* BO advanced early: the b-loads below use offsets -4..+3 from the new BO */
2560	FMUL	a1, b2, t2
2561	LDF	[AO + 4 * SIZE], a1
2562
2563	FADD	c03, t3, c03
2564	cmp	L, 0	/* sets %icc for the bg at the bottom of the loop */
2565	FMUL	a2, b1, t3
2566	LDF	[BO - 4 * SIZE], b1
2567
2568	FADD	c04, t4, c04
2569	nop
2570	FMUL	a2, b2, t4
2571	LDF	[AO + 5 * SIZE], a2
2572
2573	FADD	c01, t1, c01
2574	nop
2575	FMUL	a3, b3, t1	/* a3 products also go to c01/c02, a4 products to c03/c04 */
2576	LDF	[BO - 3 * SIZE], b2
2577
2578	FADD	c02, t2, c02
2579	nop
2580	FMUL	a3, b4, t2
2581	LDF	[AO + 6 * SIZE], a3
2582
2583	FADD	c03, t3, c03
2584	nop
2585	FMUL	a4, b3, t3
2586	LDF	[BO - 2 * SIZE], b3
2587
2588	FADD	c04, t4, c04
2589	nop
2590	FMUL	a4, b4, t4
2591	LDF	[AO + 7 * SIZE], a4
2592
2593	FADD	c01, t1, c01
2594	nop
2595	FMUL	a1, b1, t1
2596	LDF	[BO - 1 * SIZE], b4
2597
2598	FADD	c02, t2, c02
2599	FMUL	a1, b2, t2
2600	LDF	[AO +  8 * SIZE], a1	/* offsets 8..11 are the operands for the next pass */
2601
2602	FADD	c03, t3, c03
2603	FMUL	a2, b1, t3
2604	LDF	[BO +  0 * SIZE], b1
2605
2606	FADD	c04, t4, c04
2607	FMUL	a2, b2, t4
2608	LDF	[AO +  9 * SIZE], a2
2609
2610	FADD	c01, t1, c01
2611	FMUL	a3, b3, t1
2612	LDF	[BO +  1 * SIZE], b2
2613
2614	FADD	c02, t2, c02
2615	FMUL	a3, b4, t2
2616	LDF	[AO + 10 * SIZE], a3
2617
2618	FADD	c03, t3, c03
2619	FMUL	a4, b3, t3
2620	LDF	[BO +  2 * SIZE], b3
2621
2622	FADD	c04, t4, c04
2623	FMUL	a4, b4, t4
2624	LDF	[AO + 11 * SIZE], a4
2625
2626	add	AO,  8 * SIZE, AO	/* AO += 8 * SIZE */
2627	bg,pt	%icc, .LL152	/* repeat while L > 0 */
2628	LDF	[BO +  3 * SIZE], b4	/* branch delay slot */
2629
2630.LL155:  /* leftover steps after .LL152: L = count & 3 */
2631#if defined(LT) || defined(RN)
2632	and	KK,  3, L
2633#else
2634	and	TEMP1, 3, L	/* TEMP1 must still hold the K - KK computed before .LL152 */
2635#endif
2636	cmp	L,  0
2637	ble,a,pn %icc, .LL159	/* nothing left: go straight to .LL159 */
2638	nop	/* delay slot of an annulling branch */
2639
2640.LL156:  /* single-step loop: each pass steps AO and BO by 2 * SIZE */
2641	LDF	[AO + 0 * SIZE], a1	/* operands are reloaded from the current AO/BO on every pass */
2642	LDF	[AO + 1 * SIZE], a2
2643
2644	LDF	[BO + 0 * SIZE], b1
2645	LDF	[BO + 1 * SIZE], b2
2646
2647	FADD	c01, t1, c01	/* add the pending products, then form four new ones */
2648	FADD	c02, t2, c02
2649	FADD	c03, t3, c03
2650	FADD	c04, t4, c04
2651
2652	FMUL	a1, b1, t1
2653	FMUL	a1, b2, t2
2654	FMUL	a2, b1, t3
2655	FMUL	a2, b2, t4
2656
2657	add	AO, 2 * SIZE, AO
2658	add	BO, 2 * SIZE, BO
2659
2660	add	L, -1, L	/* L-- */
2661	cmp	L, 0
2662	bg,pt	%icc, .LL156	/* repeat while L > 0 */
2663	nop	/* branch delay slot */
2664
2665.LL159:  /* M & 2 part: drain pending products, solve, store c01/c03 to C1 and c02/c04 to C2 */
2666	FADD	c01, t1, c01	/* add the last products still held in t1..t4 */
2667	FADD	c02, t2, c02
2668	FADD	c03, t3, c03
2669	FADD	c04, t4, c04
2670
2671#if defined(LN) || defined(RT)
2672#ifdef LN
2673	sub	KK, 2, TEMP1
2674#else
2675	sub	KK, 2, TEMP1	/* both arms use KK - 2 here */
2676#endif
2677	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
2678	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
2679	add	AORIG, TEMP2, AO	/* AO = AORIG + ((KK - 2) << (1 + BASE_SHIFT)) */
2680	add	B,     TEMP1, BO	/* BO = B + ((KK - 2) << (1 + BASE_SHIFT)) */
2681#endif
2682
2683#if defined(LN) || defined(LT)
2684	LDF	[BO +  0 * SIZE], a1	/* BO order is c01, c02, c03, c04 */
2685	LDF	[BO +  1 * SIZE], a2
2686	LDF	[BO +  2 * SIZE], a3
2687	LDF	[BO +  3 * SIZE], a4
2688
2689	FSUB	a1, c01, c01	/* presumably c01 = a1 - c01 (SPARC fsub operand order) -- confirm FSUB macro */
2690	FSUB	a2, c02, c02
2691	FSUB	a3, c03, c03
2692	FSUB	a4, c04, c04
2693#else
2694	LDF	[AO +  0 * SIZE], a1	/* AO order is c01, c03, c02, c04 */
2695	LDF	[AO +  1 * SIZE], a2
2696	LDF	[AO +  2 * SIZE], a3
2697	LDF	[AO +  3 * SIZE], a4
2698
2699	FSUB	a1, c01, c01
2700	FSUB	a2, c03, c03
2701	FSUB	a3, c02, c02
2702	FSUB	a4, c04, c04
2703#endif
2704
2705#ifdef LN
2706	LDF	[AO +  3 * SIZE], a1
2707	LDF	[AO +  2 * SIZE], a2
2708	LDF	[AO +  0 * SIZE], a3
2709
2710	FMUL	a1, c03, c03	/* c03, c04 *= AO[3] */
2711	FMUL	a1, c04, c04
2712	FMUL	a2, c03, t1	/* t1, t2 = AO[2] * (c03, c04) */
2713	FMUL	a2, c04, t2
2714
2715	FSUB	c01, t1, c01	/* presumably c01 -= t1, c02 -= t2 -- confirm FSUB macro */
2716	FSUB	c02, t2, c02
2717	FMUL	a3, c01, c01	/* c01, c02 *= AO[0] */
2718	FMUL	a3, c02, c02
2719#endif
2720
2721#ifdef LT
2722	LDF	[AO +  0 * SIZE], a1
2723	LDF	[AO +  1 * SIZE], a2
2724	LDF	[AO +  3 * SIZE], a3
2725
2726	FMUL	a1, c01, c01	/* c01, c02 *= AO[0] */
2727	FMUL	a1, c02, c02
2728
2729	FMUL	a2, c01, t1	/* t1, t2 = AO[1] * (c01, c02) */
2730	FMUL	a2, c02, t2
2731
2732	FSUB	c03, t1, c03	/* presumably c03 -= t1, c04 -= t2 -- confirm FSUB macro */
2733	FSUB	c04, t2, c04
2734
2735	FMUL	a3, c03, c03	/* c03, c04 *= AO[3] */
2736	FMUL	a3, c04, c04
2737#endif
2738
2739#ifdef RN
2740	LDF	[BO +  0 * SIZE], a1
2741	LDF	[BO +  1 * SIZE], a2
2742	LDF	[BO +  3 * SIZE], a3
2743
2744	FMUL	a1, c01, c01	/* c01, c03 *= BO[0] */
2745	FMUL	a1, c03, c03
2746	FMUL	a2, c01, t1	/* t1, t2 = BO[1] * (c01, c03) */
2747	FMUL	a2, c03, t2
2748
2749	FSUB	c02, t1, c02	/* presumably c02 -= t1, c04 -= t2 -- confirm FSUB macro */
2750	FSUB	c04, t2, c04
2751	FMUL	a3, c02, c02	/* c02, c04 *= BO[3] */
2752	FMUL	a3, c04, c04
2753#endif
2754
2755#ifdef RT
2756	LDF	[BO +  3 * SIZE], a1
2757	LDF	[BO +  2 * SIZE], a2
2758	LDF	[BO +  0 * SIZE], a3
2759
2760	FMUL	a1, c02, c02	/* c02, c04 *= BO[3] */
2761	FMUL	a1, c04, c04
2762
2763	FMUL	a2, c02, t1	/* t1, t2 = BO[2] * (c02, c04) */
2764	FMUL	a2, c04, t2
2765	FSUB	c01, t1, c01	/* presumably c01 -= t1, c03 -= t2 -- confirm FSUB macro */
2766	FSUB	c03, t2, c03
2767
2768	FMUL	a3, c01, c01	/* c01, c03 *= BO[0] */
2769	FMUL	a3, c03, c03
2770#endif
2771
2772#ifdef LN
2773	add	C1, -2 * SIZE, C1	/* LN steps the C pointers back before storing */
2774	add	C2, -2 * SIZE, C2
2775#endif
2776
2777#if defined(LN) || defined(LT)
2778	STF	c01, [BO +  0 * SIZE]	/* write back in the same order the values were loaded from BO */
2779	STF	c02, [BO +  1 * SIZE]
2780	STF	c03, [BO +  2 * SIZE]
2781	STF	c04, [BO +  3 * SIZE]
2782#else
2783	STF	c01, [AO +  0 * SIZE]	/* write back in the same order the values were loaded from AO */
2784	STF	c03, [AO +  1 * SIZE]
2785	STF	c02, [AO +  2 * SIZE]
2786	STF	c04, [AO +  3 * SIZE]
2787#endif
2788
2789	STF	c01, [C1 + 0 * SIZE]
2790	STF	c03, [C1 + 1 * SIZE]
2791	STF	c02, [C2 + 0 * SIZE]
2792	STF	c04, [C2 + 1 * SIZE]
2793
2794	FMOV	FZERO, t1	/* clear t1..t4 so the next accumulation starts with no pending product */
2795	FMOV	FZERO, t2
2796	FMOV	FZERO, t3
2797	FMOV	FZERO, t4
2798
2799#ifndef LN
2800	add	C1, 2 * SIZE, C1	/* non-LN variants step the C pointers forward after storing */
2801	add	C2, 2 * SIZE, C2
2802#endif
2803
2804#ifdef RT
2805	sll	K, 1 + BASE_SHIFT, TEMP1
2806	add	AORIG, TEMP1, AORIG	/* AORIG += K << (1 + BASE_SHIFT) */
2807#endif
2808
2809#if defined(LT) || defined(RN)
2810	sub	K, KK, TEMP1	/* TEMP1 = K - KK */
2811	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
2812	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
2813	add	AO, TEMP2, AO	/* AO += (K - KK) << (1 + BASE_SHIFT) */
2814	add	BO, TEMP1, BO	/* BO += (K - KK) << (1 + BASE_SHIFT) */
2815#endif
2816
2817#ifdef LT
2818	add	KK, 2, KK
2819#endif
2820
2821#ifdef LN
2822	sub	KK, 2, KK
2823#endif
2824
2825.LL170:  /* M >> 2 part of the n & 2 section: I counts passes through .LL121 */
2826	sra	M, 2, I	/* I = M >> 2 */
2827	cmp	I, 0
2828	ble,pn	%icc, .LL199	/* I <= 0: nothing to do, go to .LL199 */
2829	FMOV	FZERO, c03	/* branch delay slot (no annul bit): clears c03 on both paths */
2830
2831.LL121:  /* top of the I loop: set up AO/BO/L, preload operands, clear accumulators */
2832#if defined(LT) || defined(RN)
2833	sra	KK, 2, L	/* L = KK >> 2, pass count for .LL122 */
2834
2835	mov	B, BO
2836	cmp	L,  0	/* flags are consumed by the ble just before .LL122 */
2837#else
2838
2839#ifdef LN
2840	sll	K,  2 + BASE_SHIFT, TEMP1
2841	sub	AORIG, TEMP1, AORIG	/* AORIG -= K << (2 + BASE_SHIFT) */
2842#endif
2843
2844	sll	KK, 2 + BASE_SHIFT, TEMP1
2845	sll	KK, 1 + BASE_SHIFT, TEMP2
2846
2847	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << (2 + BASE_SHIFT)) */
2848	add	B,     TEMP2, BO	/* BO = B + (KK << (1 + BASE_SHIFT)) */
2849
2850	sub	K, KK, TEMP1	/* TEMP1 = K - KK; .LL125 reads TEMP1 & 3 later */
2851	sra	TEMP1, 2, L	/* L = (K - KK) >> 2 */
2852	cmp	L,  0	/* flags are consumed by the ble just before .LL122 */
2853#endif
2854
2855	LDF	[AO + 0 * SIZE], a1	/* preload a1..a4 and b1..b4; clear t1..t4 and c01, c02, c04..c08 */
2856	FMOV	FZERO, t1
2857	LDF	[BO + 0 * SIZE], b1
2858	FMOV	FZERO, c07
2859
2860	LDF	[AO + 1 * SIZE], a2
2861	FMOV	FZERO, t2
2862	LDF	[BO + 1 * SIZE], b2
2863	FMOV	FZERO, c04
2864
2865	LDF	[AO + 2 * SIZE], a3
2866	FMOV	FZERO, t3
2867	LDF	[BO + 2 * SIZE], b3
2868	FMOV	FZERO, c08
2869
2870	LDF	[AO + 3 * SIZE], a4
2871	FMOV	FZERO, t4
2872	LDF	[BO + 3 * SIZE], b4
2873	FMOV	FZERO, c01
2874
2875#ifdef LN
2876	prefetch [C1 - 3 * SIZE], 2	/* LN prefetches below C1/C2; the other variants prefetch above */
2877	FMOV	FZERO, c05
2878	prefetch [C2 - 3 * SIZE], 2
2879	FMOV	FZERO, c02
2880#else
2881	prefetch [C1 + 3 * SIZE], 2
2882	FMOV	FZERO, c05
2883	prefetch [C2 + 3 * SIZE], 2
2884	FMOV	FZERO, c02
2885#endif
2886
2887	ble,pn	%icc, .LL125	/* L <= 0: skip the unrolled loop .LL122 */
2888	FMOV	FZERO, c06	/* branch delay slot: c03 is not cleared here, it is cleared in the delay slots of the branches that reach .LL121 */
2889
2890.LL122:
2891	FADD	c03, t1, c03
2892	add	L, -1, L
2893	FMUL	a1, b1, t1
2894	prefetch [AO + APREFETCHSIZE * SIZE], 0
2895
2896	FADD	c07, t2, c07
2897	add	BO,  8 * SIZE, BO
2898	FMUL	a1, b2, t2
2899	LDF	[AO + 4 * SIZE], a1
2900
2901	FADD	c04, t3, c04
2902	add	AO, 16 * SIZE, AO
2903	FMUL	a2, b1, t3
2904	cmp	L,  0
2905
2906	FADD	c08, t4, c08
2907	nop
2908	FMUL	a2, b2, t4
2909	LDF	[AO - 11 * SIZE], a2
2910
2911	FADD	c01, t1, c01
2912	nop
2913	FMUL	a3, b1, t1
2914	nop
2915
2916	FADD	c05, t2, c05
2917	nop
2918	FMUL	a3, b2, t2
2919	LDF	[AO - 10 * SIZE], a3
2920
2921	FADD	c02, t3, c02
2922	nop
2923	FMUL	a4, b1, t3
2924	LDF	[BO -  4 * SIZE], b1
2925
2926	FADD	c06, t4, c06
2927	nop
2928	FMUL	a4, b2, t4
2929	LDF	[BO -  3 * SIZE], b2
2930
2931	FADD	c03, t1, c03
2932	nop
2933	FMUL	a1, b3, t1
2934	LDF	[AO -  9 * SIZE], a4
2935
2936	FADD	c07, t2, c07
2937	nop
2938	FMUL	a1, b4, t2
2939	LDF	[AO -  8 * SIZE], a1
2940
2941	FADD	c04, t3, c04
2942	nop
2943	FMUL	a2, b3, t3
2944	nop
2945
2946	FADD	c08, t4, c08
2947	nop
2948	FMUL	a2, b4, t4
2949	LDF	[AO -  7 * SIZE], a2
2950
2951	FADD	c01, t1, c01
2952	nop
2953	FMUL	a3, b3, t1
2954	nop
2955
2956	FADD	c05, t2, c05
2957	nop
2958	FMUL	a3, b4, t2
2959	LDF	[AO -  6 * SIZE], a3
2960
2961	FADD	c02, t3, c02
2962	nop
2963	FMUL	a4, b3, t3
2964	LDF	[BO -  2 * SIZE], b3
2965
2966	FADD	c06, t4, c06
2967	nop
2968	FMUL	a4, b4, t4
2969	LDF	[BO -  1 * SIZE], b4
2970
2971	FADD	c03, t1, c03
2972	nop
2973	FMUL	a1, b1, t1
2974	LDF	[AO -  5 * SIZE], a4
2975
2976	FADD	c07, t2, c07
2977	nop
2978	FMUL	a1, b2, t2
2979	LDF	[AO -  4 * SIZE], a1
2980
2981	FADD	c04, t3, c04
2982	nop
2983	FMUL	a2, b1, t3
2984	nop
2985
2986	FADD	c08, t4, c08
2987	nop
2988	FMUL	a2, b2, t4
2989	LDF	[AO -  3 * SIZE], a2
2990
2991	FADD	c01, t1, c01
2992	nop
2993	FMUL	a3, b1, t1
2994	nop
2995
2996	FADD	c05, t2, c05
2997	nop
2998	FMUL	a3, b2, t2
2999	LDF	[AO -  2 * SIZE], a3
3000
3001	FADD	c02, t3, c02
3002	nop
3003	FMUL	a4, b1, t3
3004	LDF	[BO +  0 * SIZE], b1
3005
3006	FADD	c06, t4, c06
3007	nop
3008	FMUL	a4, b2, t4
3009	LDF	[BO +  1 * SIZE], b2
3010
3011	FADD	c03, t1, c03
3012	nop
3013	FMUL	a1, b3, t1
3014	LDF	[AO -  1 * SIZE], a4
3015
3016	FADD	c07, t2, c07
3017	nop
3018	FMUL	a1, b4, t2
3019	LDF	[AO +  0 * SIZE], a1
3020
3021	FADD	c04, t3, c04
3022	nop
3023	FMUL	a2, b3, t3
3024	nop
3025
3026	FADD	c08, t4, c08
3027	nop
3028	FMUL	a2, b4, t4
3029	LDF	[AO +  1 * SIZE], a2
3030
3031	FADD	c01, t1, c01
3032	nop
3033	FMUL	a3, b3, t1
3034	nop
3035
3036	FADD	c05, t2, c05
3037	nop
3038	FMUL	a3, b4, t2
3039	LDF	[AO +  2 * SIZE], a3
3040
3041	FADD	c02, t3, c02
3042	nop
3043	FMUL	a4, b3, t3
3044	LDF	[BO +  2 * SIZE], b3
3045
3046	FADD	c06, t4, c06
3047	FMUL	a4, b4, t4
3048	LDF	[AO +  3 * SIZE], a4
3049
3050	bg,pt	%icc, .LL122
3051	LDF	[BO +  3 * SIZE], b4
3052
3053.LL125:  /* leftover steps after .LL122: L = count & 3 */
3054#if defined(LT) || defined(RN)
3055	and	KK,  3, L
3056#else
3057	and	TEMP1, 3, L	/* TEMP1 must still hold the K - KK computed at .LL121 */
3058#endif
3059	cmp	L,  0
3060	ble,a,pn %icc, .LL129	/* nothing left: go straight to .LL129 */
3061	nop	/* delay slot of an annulling branch */
3062
3063.LL126:  /* single-step loop: each pass steps AO by 4 * SIZE and BO by 2 * SIZE */
3064	FADD	c03, t1, c03	/* add pending products; t1..t4 rotate over c03/c07/c04/c08 and c01/c05/c02/c06 */
3065	add	AO, 4 * SIZE, AO
3066	FMUL	a1, b1, t1
3067	add	BO, 2 * SIZE, BO
3068
3069	FADD	c07, t2, c07
3070	add	L, -1, L	/* L-- */
3071	FMUL	a1, b2, t2
3072	LDF	[AO + 0 * SIZE], a1	/* a-loads here already address the advanced AO */
3073
3074	FADD	c04, t3, c04
3075	cmp	L, 0	/* sets %icc for the bg below */
3076	FMUL	a2, b1, t3
3077
3078	FADD	c08, t4, c08
3079	FMUL	a2, b2, t4
3080	LDF	[AO + 1 * SIZE], a2
3081
3082	FADD	c01, t1, c01
3083	FMUL	a3, b1, t1
3084	FADD	c05, t2, c05
3085	FMUL	a3, b2, t2
3086	LDF	[AO + 2 * SIZE], a3
3087
3088	FADD	c02, t3, c02
3089	FMUL	a4, b1, t3
3090	LDF	[BO + 0 * SIZE], b1
3091	FADD	c06, t4, c06
3092	FMUL	a4, b2, t4
3093	LDF	[BO + 1 * SIZE], b2
3094	bg,pt	%icc, .LL126	/* repeat while L > 0 */
3095	LDF	[AO + 3 * SIZE], a4	/* branch delay slot */
3096
3097.LL129:
3098	FADD	c03, t1, c03
3099	FADD	c07, t2, c07
3100	FADD	c04, t3, c04
3101	FADD	c08, t4, c08
3102
3103#if defined(LN) || defined(RT)
3104#ifdef LN
3105	sub	KK, 4, TEMP1
3106#else
3107	sub	KK, 2, TEMP1
3108#endif
3109	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
3110	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
3111	add	AORIG, TEMP2, AO
3112	add	B,     TEMP1, BO
3113#endif
3114
3115#if defined(LN) || defined(LT)
3116	LDF	[BO +  0 * SIZE], a1
3117	LDF	[BO +  1 * SIZE], a2
3118	LDF	[BO +  2 * SIZE], a3
3119	LDF	[BO +  3 * SIZE], a4
3120
3121	LDF	[BO +  4 * SIZE], b1
3122	LDF	[BO +  5 * SIZE], b2
3123	LDF	[BO +  6 * SIZE], b3
3124	LDF	[BO +  7 * SIZE], b4
3125
3126	FSUB	a1, c01, c01
3127	FSUB	a2, c05, c05
3128	FSUB	a3, c02, c02
3129	FSUB	a4, c06, c06
3130
3131	FSUB	b1, c03, c03
3132	FSUB	b2, c07, c07
3133	FSUB	b3, c04, c04
3134	FSUB	b4, c08, c08
3135#else
3136	LDF	[AO +  0 * SIZE], a1
3137	LDF	[AO +  1 * SIZE], a2
3138	LDF	[AO +  2 * SIZE], a3
3139	LDF	[AO +  3 * SIZE], a4
3140
3141	LDF	[AO +  4 * SIZE], b1
3142	LDF	[AO +  5 * SIZE], b2
3143	LDF	[AO +  6 * SIZE], b3
3144	LDF	[AO +  7 * SIZE], b4
3145
3146	FSUB	a1, c01, c01
3147	FSUB	a2, c02, c02
3148	FSUB	a3, c03, c03
3149	FSUB	a4, c04, c04
3150
3151	FSUB	b1, c05, c05
3152	FSUB	b2, c06, c06
3153	FSUB	b3, c07, c07
3154	FSUB	b4, c08, c08
3155#endif
3156
3157#ifdef LN
3158	LDF	[AO + 15 * SIZE], a1
3159	LDF	[AO + 14 * SIZE], a2
3160	LDF	[AO + 13 * SIZE], a3
3161	LDF	[AO + 12 * SIZE], a4
3162
3163	FMUL	a1, c04, c04
3164	FMUL	a1, c08, c08
3165	FMUL	a2, c04, t1
3166	FMUL	a2, c08, t2
3167
3168	FSUB	c03, t1, c03
3169	FSUB	c07, t2, c07
3170	FMUL	a3, c04, t1
3171	FMUL	a3, c08, t2
3172
3173	FSUB	c02, t1, c02
3174	FSUB	c06, t2, c06
3175	FMUL	a4, c04, t1
3176	FMUL	a4, c08, t2
3177
3178	FSUB	c01, t1, c01
3179	FSUB	c05, t2, c05
3180
3181	LDF	[AO + 10 * SIZE], a1
3182	LDF	[AO +  9 * SIZE], a2
3183	LDF	[AO +  8 * SIZE], a3
3184
3185	FMUL	a1, c03, c03
3186	FMUL	a1, c07, c07
3187	FMUL	a2, c03, t1
3188	FMUL	a2, c07, t2
3189
3190	FSUB	c02, t1, c02
3191	FSUB	c06, t2, c06
3192	FMUL	a3, c03, t1
3193	FMUL	a3, c07, t2
3194
3195	FSUB	c01, t1, c01
3196	FSUB	c05, t2, c05
3197
3198	LDF	[AO +  5 * SIZE], a1
3199	LDF	[AO +  4 * SIZE], a2
3200
3201	FMUL	a1, c02, c02
3202	FMUL	a1, c06, c06
3203	FMUL	a2, c02, t1
3204	FMUL	a2, c06, t2
3205
3206	FSUB	c01, t1, c01
3207	FSUB	c05, t2, c05
3208
3209	LDF	[AO +  0 * SIZE], a1
3210
3211	FMUL	a1, c01, c01
3212	FMUL	a1, c05, c05
3213#endif
3214
3215#ifdef LT
3216	LDF	[AO +  0 * SIZE], a1
3217	LDF	[AO +  1 * SIZE], a2
3218	LDF	[AO +  2 * SIZE], a3
3219	LDF	[AO +  3 * SIZE], a4
3220
3221	FMUL	a1, c01, c01
3222	FMUL	a1, c05, c05
3223	FMUL	a2, c01, t1
3224	FMUL	a2, c05, t2
3225
3226	FSUB	c02, t1, c02
3227	FSUB	c06, t2, c06
3228	FMUL	a3, c01, t1
3229	FMUL	a3, c05, t2
3230
3231	FSUB	c03, t1, c03
3232	FSUB	c07, t2, c07
3233	FMUL	a4, c01, t1
3234	FMUL	a4, c05, t2
3235
3236	FSUB	c04, t1, c04
3237	FSUB	c08, t2, c08
3238
3239	LDF	[AO +  5 * SIZE], a1
3240	LDF	[AO +  6 * SIZE], a2
3241	LDF	[AO +  7 * SIZE], a3
3242
3243	FMUL	a1, c02, c02
3244	FMUL	a1, c06, c06
3245	FMUL	a2, c02, t1
3246	FMUL	a2, c06, t2
3247
3248	FSUB	c03, t1, c03
3249	FSUB	c07, t2, c07
3250	FMUL	a3, c02, t1
3251	FMUL	a3, c06, t2
3252	FSUB	c04, t1, c04
3253	FSUB	c08, t2, c08
3254
3255	LDF	[AO + 10 * SIZE], a1
3256	LDF	[AO + 11 * SIZE], a2
3257
3258	FMUL	a1, c03, c03
3259	FMUL	a1, c07, c07
3260	FMUL	a2, c03, t1
3261	FMUL	a2, c07, t2
3262
3263	FSUB	c04, t1, c04
3264	FSUB	c08, t2, c08
3265
3266	LDF	[AO + 15 * SIZE], a1
3267
3268	FMUL	a1, c04, c04
3269	FMUL	a1, c08, c08
3270#endif
3271
3272#ifdef RN
3273	LDF	[BO +  0 * SIZE], a1
3274	LDF	[BO +  1 * SIZE], a2
3275	LDF	[BO +  3 * SIZE], a3
3276
3277	FMUL	a1, c01, c01
3278	FMUL	a1, c02, c02
3279	FMUL	a1, c03, c03
3280	FMUL	a1, c04, c04
3281
3282	FMUL	a2, c01, t1
3283	FMUL	a2, c02, t2
3284	FMUL	a2, c03, t3
3285	FMUL	a2, c04, t4
3286
3287	FSUB	c05, t1, c05
3288	FSUB	c06, t2, c06
3289	FSUB	c07, t3, c07
3290	FSUB	c08, t4, c08
3291
3292	FMUL	a3, c05, c05
3293	FMUL	a3, c06, c06
3294	FMUL	a3, c07, c07
3295	FMUL	a3, c08, c08
3296#endif
3297
3298#ifdef RT
3299	LDF	[BO +  3 * SIZE], a1
3300	LDF	[BO +  2 * SIZE], a2
3301	LDF	[BO +  0 * SIZE], a3
3302
3303	FMUL	a1, c05, c05
3304	FMUL	a1, c06, c06
3305	FMUL	a1, c07, c07
3306	FMUL	a1, c08, c08
3307
3308	FMUL	a2, c05, t1
3309	FMUL	a2, c06, t2
3310	FMUL	a2, c07, t3
3311	FMUL	a2, c08, t4
3312
3313	FSUB	c01, t1, c01
3314	FSUB	c02, t2, c02
3315	FSUB	c03, t3, c03
3316	FSUB	c04, t4, c04
3317
3318	FMUL	a3, c01, c01
3319	FMUL	a3, c02, c02
3320	FMUL	a3, c03, c03
3321	FMUL	a3, c04, c04
3322#endif
3323
3324#ifdef LN
3325	add	C1, -4 * SIZE, C1
3326	add	C2, -4 * SIZE, C2
3327#endif
3328
3329#if defined(LN) || defined(LT)
3330	STF	c01, [BO +  0 * SIZE]
3331	STF	c05, [BO +  1 * SIZE]
3332	STF	c02, [BO +  2 * SIZE]
3333	STF	c06, [BO +  3 * SIZE]
3334
3335	STF	c03, [BO +  4 * SIZE]
3336	STF	c07, [BO +  5 * SIZE]
3337	STF	c04, [BO +  6 * SIZE]
3338	STF	c08, [BO +  7 * SIZE]
3339#else
3340	STF	c01, [AO +  0 * SIZE]
3341	STF	c02, [AO +  1 * SIZE]
3342	STF	c03, [AO +  2 * SIZE]
3343	STF	c04, [AO +  3 * SIZE]
3344
3345	STF	c05, [AO +  4 * SIZE]
3346	STF	c06, [AO +  5 * SIZE]
3347	STF	c07, [AO +  6 * SIZE]
3348	STF	c08, [AO +  7 * SIZE]
3349#endif
3350
3351	STF	c01, [C1 + 0 * SIZE]
3352	STF	c02, [C1 + 1 * SIZE]
3353	STF	c03, [C1 + 2 * SIZE]
3354	STF	c04, [C1 + 3 * SIZE]
3355
3356	STF	c05, [C2 + 0 * SIZE]
3357	STF	c06, [C2 + 1 * SIZE]
3358	STF	c07, [C2 + 2 * SIZE]
3359	STF	c08, [C2 + 3 * SIZE]
3360
3361	FMOV	FZERO, t1
3362	FMOV	FZERO, t2
3363	FMOV	FZERO, t3
3364	FMOV	FZERO, t4
3365
3366#ifndef LN
3367	add	C1, 4 * SIZE, C1
3368	add	C2, 4 * SIZE, C2
3369#endif
3370
3371#ifdef RT
3372	sll	K, 2 + BASE_SHIFT, TEMP1
3373	add	AORIG, TEMP1, AORIG
3374#endif
3375
3376#if defined(LT) || defined(RN)
3377	sub	K, KK, TEMP1
3378	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
3379	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
3380	add	AO, TEMP2, AO
3381	add	BO, TEMP1, BO
3382#endif
3383
3384#ifdef LT
3385	add	KK, 4, KK
3386#endif
3387
3388#ifdef LN
3389	sub	KK, 4, KK
3390#endif
3391
3392	add	I, -1, I
3393	cmp	I, 0
3394
3395	bg,pt	%icc, .LL121
3396	FMOV	FZERO, c03
3397
3398.LL199:
	/* Fall-through point after the .LL121 loop: adjust B and KK before
	   the N & 1 section at .LL200.  Which adjustment is assembled
	   depends on the LN / LT / RN / RT macro in effect.
	   NOTE(review): BASE_SHIFT is presumably log2 of the element size
	   SIZE - confirm against the common header. */
3399#ifdef LN
3400	sll	K, 1 + BASE_SHIFT, TEMP1	/* TEMP1 = K << (1 + BASE_SHIFT) */
3401	add	B, TEMP1, B	/* B += TEMP1 */
3402#endif
3403
3404#if defined(LT) || defined(RN)
3405	mov	BO, B	/* B continues from where BO ended up */
3406#endif
3407
3408#ifdef RN
3409	add	KK, 2, KK	/* KK += 2 */
3410#endif
3411
3412#ifdef RT
3413	sub	KK, 2, KK	/* KK -= 2 */
3414#endif
3415
3416.LL200:
	/* Section executed only when N & 1 is non-zero.  Sets up C1, KK and
	   the A pointer (AORIG or AO) for the M & 1, M & 2 and M >> 2 parts
	   that follow. */
3417	and	N, 1, J	/* J = N & 1 */
3418
3419	cmp	J, 0
3420	ble,pn	%icc, .LL999	/* J <= 0: nothing left to do, go to the return */
3421	nop	/* branch delay slot */
3422
3423#ifdef RT
3424	sll	K, 0 + BASE_SHIFT, TEMP1	/* TEMP1 = K << BASE_SHIFT */
3425	sub	B, TEMP1, B	/* RT: B -= TEMP1 */
3426
3427	sub	C, LDC, C	/* RT: C -= LDC */
3428#endif
3429
3430	mov	C, C1	/* C1 = C; this section stores results through C1 only */
3431
3432#ifdef LN
3433	add	M, OFFSET, KK	/* LN: KK = M + OFFSET */
3434#endif
3435
3436#ifdef LT
3437	mov	OFFSET, KK	/* LT: KK = OFFSET */
3438#endif
3439
3440#if defined(LN) || defined(RT)
3441	mov	A, AORIG	/* keep A in AORIG; AO is derived from it in each tile */
3442#else
3443	mov	A, AO	/* otherwise AO starts at A and is advanced directly */
3444#endif
3445
3446#ifndef RT
3447	add	C, LDC, C	/* non-RT: C += LDC */
3448#endif
3449
	/* Tile for the case M & 1 != 0 (one element of C1).
	   Steps: (1) accumulate sum(AO[k] * BO[k]) into c01, four terms per
	   pass in .LL272 and one per pass in .LL276; (2) replace c01 by
	   (stored value - c01) and multiply by one element of AO or BO;
	   (3) store the result to BO or AO and to C1; (4) update the
	   pointers and KK for the next tile. */
3450	and	M, 1, I	/* I = M & 1 */
3451	cmp	I, 0
3452	ble,pn	%icc, .LL250	/* bit 0 of M clear: skip this tile */
3453	nop	/* branch delay slot */
3454
3455#if defined(LT) || defined(RN)
3456	sra	KK, 2, L	/* L = KK >> 2: passes of the 4-term loop */
3457
3458	mov	B, BO	/* BO starts at B */
3459	cmp	L,  0	/* icc is consumed by the ble before .LL272 */
3460#else
3461
3462#ifdef LN
3463	sll	K,  0 + BASE_SHIFT, TEMP1
3464	sub	AORIG, TEMP1, AORIG	/* LN: AORIG -= K << BASE_SHIFT */
3465#endif
3466
3467	sll	KK, 0 + BASE_SHIFT, TEMP1	/* TEMP1 = KK << BASE_SHIFT */
3468
3469	add	AORIG, TEMP1, AO	/* AO = AORIG + TEMP1 */
3470	add	B,     TEMP1, BO	/* BO = B + TEMP1 */
3471
3472	sub	K, KK, TEMP1	/* TEMP1 = K - KK (reused at .LL275) */
3473	sra	TEMP1, 2, L	/* L = (K - KK) >> 2 */
3474	cmp	L,  0	/* icc is consumed by the ble before .LL272 */
3475#endif
3476
	/* Preload four A and four B values and clear the accumulators
	   (c01, c02) and the product temporaries (t1-t4). */
3477	LDF	[AO + 0 * SIZE], a1
3478	FMOV	FZERO, t1
3479 	LDF	[AO + 1 * SIZE], a2
3480	FMOV	FZERO, c01
3481
3482	LDF	[AO + 2 * SIZE], a3
3483	FMOV	FZERO, t2
3484	LDF	[AO + 3 * SIZE], a4
3485	FMOV	FZERO, c02
3486
3487	LDF	[BO + 0 * SIZE], b1
3488	FMOV	FZERO, t3
3489	LDF	[BO + 1 * SIZE], b2
3490	FMOV	FZERO, t4
3491	LDF	[BO + 2 * SIZE], b3
3492
3493	ble,pn	%icc, .LL275	/* L <= 0: no 4-term passes */
3494	LDF	[BO + 3 * SIZE], b4	/* delay slot: executed on both paths */
3495
3496.LL272:
	/* 4-term pass: t1 = a1*b1, t2 = a2*b2, t3 = a3*b3, t4 = a4*b4.
	   Each product is added into c01 (t1, t3) or c02 (t2, t4) by the
	   FADD at the start of the following pass, or at .LL279.  The loads
	   refill a1-a4 / b1-b4 from the already advanced AO / BO. */
3497	FADD	c01, t1, c01	/* c01 += previous t1 */
3498	add	L, -1, L
3499	add	AO,  4 * SIZE, AO
3500
3501	FMUL	a1, b1, t1
3502	add	BO,  4 * SIZE, BO
3503	LDF	[AO + 0 * SIZE], a1
3504
3505	FADD	c02, t2, c02	/* c02 += previous t2 */
3506	cmp	L, 0	/* sets icc for the bg at the end of the pass */
3507	LDF	[BO + 0 * SIZE], b1
3508	FMUL	a2, b2, t2
3509
3510	LDF	[AO + 1 * SIZE], a2
3511	FADD	c01, t3, c01	/* c01 += previous t3 */
3512	LDF	[BO + 1 * SIZE], b2
3513	FMUL	a3, b3, t3
3514
3515	LDF	[AO + 2 * SIZE], a3
3516	FADD	c02, t4, c02	/* c02 += previous t4 */
3517	LDF	[BO + 2 * SIZE], b3
3518	FMUL	a4, b4, t4
3519	LDF	[AO + 3 * SIZE], a4
3520
3521	bg,pt	%icc, .LL272	/* repeat while L > 0 */
3522	LDF	[BO + 3 * SIZE], b4	/* delay slot */
3523
3524.LL275:
	/* Remainder count: low two bits of the same length used above. */
3525#if defined(LT) || defined(RN)
3526	and	KK,  3, L
3527#else
3528	and	TEMP1, 3, L	/* TEMP1 still holds K - KK */
3529#endif
3530	cmp	L,  0
3531	ble,a,pn %icc, .LL279	/* no remainder: go straight to the final sums */
3532	nop	/* delay slot (annul bit set on the branch) */
3533
3534.LL276:
	/* Single-term pass: c01 += previous t1, t1 = a1*b1, then advance
	   AO and BO by one element. */
3535	FADD	c01, t1, c01
3536	add	L, -1, L
3537	FMUL	a1, b1, t1
3538	LDF	[AO + 1 * SIZE], a1
3539
3540	LDF	[BO + 1 * SIZE], b1
3541	add	BO, 1 * SIZE, BO
3542	cmp	L, 0
3543	bg,pt	%icc, .LL276	/* repeat while L > 0 */
3544	add	AO, 1 * SIZE, AO	/* delay slot */
3545
3546.LL279:
	/* Add the products still pending in t1-t4, then fold c02 into c01. */
3547	FADD	c01, t1, c01
3548	FADD	c02, t2, c02
3549	FADD	c01, t3, c01
3550	FADD	c02, t4, c02
3551
3552	FADD	c01, c02, c01	/* c01 = complete sum */
3553
3554#if defined(LN) || defined(RT)
	/* Re-point AO and BO at offset (KK - 1) << BASE_SHIFT from
	   AORIG and B. */
3555	sub	KK, 1, TEMP1
3556	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
3557	add	AORIG, TEMP1, AO
3558	add	B,     TEMP1, BO
3559#endif
3560
3561#if defined(LN) || defined(LT)
3562	LDF	[BO +  0 * SIZE], a1
3563	FSUB	a1, c01, c01	/* c01 = BO[0] - c01 */
3564#else
3565	LDF	[AO +  0 * SIZE], a1
3566	FSUB	a1, c01, c01	/* c01 = AO[0] - c01 */
3567#endif
3568
	/* Scale by a single element of the other operand.
	   NOTE(review): the code multiplies rather than divides, so that
	   element is presumably stored pre-inverted - confirm against the
	   packing routine. */
3569#ifdef LN
3570	LDF	[AO +  0 * SIZE], a1
3571	FMUL	a1, c01, c01	/* c01 *= AO[0] */
3572#endif
3573
3574#ifdef LT
3575	LDF	[AO +  0 * SIZE], a1
3576	FMUL	a1, c01, c01	/* c01 *= AO[0] */
3577#endif
3578
3579#ifdef RN
3580	LDF	[BO +  0 * SIZE], a1
3581	FMUL	a1, c01, c01	/* c01 *= BO[0] */
3582#endif
3583
3584#ifdef RT
3585	LDF	[BO +  0 * SIZE], a1
3586	FMUL	a1, c01, c01	/* c01 *= BO[0] */
3587#endif
3588
3589#ifdef LN
3590	add	C1, -1 * SIZE, C1	/* LN: step C1 back one element before storing */
3591#endif
3592
	/* Write the result back into the buffer it was subtracted from... */
3593#if defined(LN) || defined(LT)
3594	STF	c01, [BO +  0 * SIZE]
3595#else
3596	STF	c01, [AO +  0 * SIZE]
3597#endif
3598
	/* ...and into C. */
3599	STF	c01, [C1 + 0 * SIZE]
3600
	/* Clear the product temporaries. */
3601	FMOV	FZERO, t1
3602	FMOV	FZERO, t2
3603	FMOV	FZERO, t3
3604	FMOV	FZERO, t4
3605
3606#ifndef LN
3607	add	C1, 1 * SIZE, C1	/* non-LN: step C1 forward one element */
3608#endif
3609
3610#ifdef RT
3611	sll	K, 0 + BASE_SHIFT, TEMP1
3612	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << BASE_SHIFT */
3613#endif
3614
3615#if defined(LT) || defined(RN)
	/* Advance AO and BO by (K - KK) << BASE_SHIFT. */
3616	sub	K, KK, TEMP1
3617	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
3618	add	AO, TEMP1, AO
3619	add	BO, TEMP1, BO
3620#endif
3621
3622#ifdef LT
3623	add	KK, 1, KK	/* LT: KK += 1 */
3624#endif
3625
3626#ifdef LN
3627	sub	KK, 1, KK	/* LN: KK -= 1 */
3628#endif
3629
3630.LL250:
	/* Tile for the case M & 2 != 0 (two elements of C1).
	   Accumulates c01 = sum(AO[2k] * BO[k]) and c02 = sum(AO[2k+1] * BO[k])
	   (four k per pass in .LL252, one per pass in .LL256), forms
	   (stored value - sum), applies the variant-specific scaling /
	   substitution, stores to BO or AO and to C1, then updates the
	   pointers and KK. */
3631	and	M, 2, I	/* I = M & 2 */
3632	cmp	I, 0
3633	ble,pn	%icc, .LL270	/* bit 1 of M clear: skip this tile */
3634	nop	/* branch delay slot */
3635
3636#if defined(LT) || defined(RN)
3637	sra	KK, 2, L	/* L = KK >> 2: passes of the unrolled loop */
3638
3639	mov	B, BO	/* BO starts at B */
3640	cmp	L,  0	/* icc is consumed by the ble before .LL252 */
3641#else
3642
3643#ifdef LN
3644	sll	K,  1 + BASE_SHIFT, TEMP1
3645	sub	AORIG, TEMP1, AORIG	/* LN: AORIG -= K << (1 + BASE_SHIFT) */
3646#endif
3647
3648	sll	KK, 1 + BASE_SHIFT, TEMP1	/* A offset: KK << (1 + BASE_SHIFT) */
3649	sll	KK, 0 + BASE_SHIFT, TEMP2	/* B offset: KK << BASE_SHIFT */
3650
3651	add	AORIG, TEMP1, AO
3652	add	B,     TEMP2, BO
3653
3654	sub	K, KK, TEMP1	/* TEMP1 = K - KK (reused at .LL255) */
3655	sra	TEMP1, 2, L	/* L = (K - KK) >> 2 */
3656	cmp	L,  0	/* icc is consumed by the ble before .LL252 */
3657#endif
3658
	/* Preload a1-a4 / b1-b4 and clear c01-c04 and t1-t4. */
3659	LDF	[AO + 0 * SIZE], a1
3660	FMOV	FZERO, c01
3661	LDF	[BO + 0 * SIZE], b1
3662	FMOV	FZERO, t1
3663
3664	LDF	[AO + 1 * SIZE], a2
3665	FMOV	FZERO, c02
3666	LDF	[BO + 1 * SIZE], b2
3667	FMOV	FZERO, t2
3668
3669	LDF	[AO + 2 * SIZE], a3
3670	FMOV	FZERO, c03
3671	LDF	[BO + 2 * SIZE], b3
3672	FMOV	FZERO, t3
3673
3674	LDF	[AO + 3 * SIZE], a4
3675	FMOV	FZERO, c04
3676	LDF	[BO + 3 * SIZE], b4
3677	FMOV	FZERO, t4
3678
3679	ble,pn	%icc, .LL255	/* L <= 0: no unrolled passes */
3680	nop	/* branch delay slot */
3681
3682.LL252:
	/* Unrolled pass covering four B values (b1-b4) and eight A values.
	   Products with b1 and b3 go through t1/t2 into c01/c02; products
	   with b2 and b4 go through t3/t4 into c03/c04.  Each FADD adds the
	   product computed one step earlier.  AO advances by 8 elements and
	   BO by 4 per pass. */
3683	FADD	c01, t1, c01
3684	add	L, -1, L
3685	FMUL	a1, b1, t1
3686	LDF	[AO + 4 * SIZE], a1
3687
3688	FADD	c02, t2, c02
3689	FMUL	a2, b1, t2
3690	LDF	[AO +  5 * SIZE], a2
3691	LDF	[BO +  4 * SIZE], b1
3692
3693	FADD	c03, t3, c03
3694	cmp	L, 0	/* sets icc for the bg at the end of the pass */
3695	FMUL	a3, b2, t3
3696	LDF	[AO +  6 * SIZE], a3
3697
3698	FADD	c04, t4, c04
3699	FMUL	a4, b2, t4
3700	LDF	[AO +  7 * SIZE], a4
3701	LDF	[BO +  5 * SIZE], b2
3702
3703	FADD	c01, t1, c01
3704	FMUL	a1, b3, t1
3705	LDF	[AO +  8 * SIZE], a1
3706
3707	FADD	c02, t2, c02
3708	FMUL	a2, b3, t2
3709	LDF	[AO +  9 * SIZE], a2
3710	LDF	[BO +  6 * SIZE], b3
3711
3712	FADD	c03, t3, c03
3713	FMUL	a3, b4, t3
3714	LDF	[AO + 10 * SIZE], a3
3715
3716	FADD	c04, t4, c04
3717	FMUL	a4, b4, t4
3718	LDF	[AO + 11 * SIZE], a4
3719	add	AO,  8 * SIZE, AO
3720
3721	LDF	[BO +  7 * SIZE], b4
3722	bg,pt	%icc, .LL252	/* repeat while L > 0 */
3723	add	BO,  4 * SIZE, BO	/* delay slot */
3724
3725.LL255:
	/* Remainder count: low two bits of the same length used above. */
3726#if defined(LT) || defined(RN)
3727	and	KK,  3, L
3728#else
3729	and	TEMP1, 3, L	/* TEMP1 still holds K - KK */
3730#endif
3731
3732	cmp	L,  0
3733	ble,a,pn %icc, .LL259	/* no remainder: go straight to the final sums */
3734	nop	/* delay slot (annul bit set on the branch) */
3735
3736.LL256:
	/* Single-step pass: uses only a1, a2 and b1 (t1/t2 into c01/c02);
	   AO advances by 2 elements, BO by 1. */
3737	FADD	c01, t1, c01
3738	add	L, -1, L
3739	FMUL	a1, b1, t1
3740	LDF	[AO + 2 * SIZE], a1
3741
3742	FADD	c02, t2, c02
3743	cmp	L, 0
3744	FMUL	a2, b1, t2
3745	LDF	[AO + 3 * SIZE], a2
3746
3747	LDF	[BO + 1 * SIZE], b1
3748	add	AO, 2 * SIZE, AO
3749
3750	bg,pt	%icc, .LL256	/* repeat while L > 0 */
3751	add	BO, 1 * SIZE, BO	/* delay slot */
3752
3753.LL259:
	/* Add the pending products, then merge the two partial sums per row. */
3754	FADD	c01, t1, c01
3755	FADD	c02, t2, c02
3756	FADD	c03, t3, c03
3757	FADD	c04, t4, c04
3758
3759	FADD	c01, c03, c01	/* c01 = complete sum for the first element */
3760	FADD	c02, c04, c02	/* c02 = complete sum for the second element */
3761
3762#if defined(LN) || defined(RT)
	/* Re-point AO / BO: TEMP1 = KK - 2 (LN) or KK - 1 (otherwise);
	   AO = AORIG + (TEMP1 << (1 + BASE_SHIFT)),
	   BO = B + (TEMP1 << BASE_SHIFT). */
3763#ifdef LN
3764	sub	KK, 2, TEMP1
3765#else
3766	sub	KK, 1, TEMP1
3767#endif
3768	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
3769	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
3770	add	AORIG, TEMP2, AO
3771	add	B,     TEMP1, BO
3772#endif
3773
	/* c0x = stored value - accumulated sum. */
3774#if defined(LN) || defined(LT)
3775	LDF	[BO +  0 * SIZE], a1
3776	LDF	[BO +  1 * SIZE], a2
3777
3778	FSUB	a1, c01, c01
3779	FSUB	a2, c02, c02
3780#else
3781	LDF	[AO +  0 * SIZE], a1
3782	LDF	[AO +  1 * SIZE], a2
3783
3784	FSUB	a1, c01, c01
3785	FSUB	a2, c02, c02
3786#endif
3787
	/* NOTE(review): the elements used as plain multipliers below (AO[3],
	   AO[0], BO[0]) are presumably pre-inverted diagonal entries, since
	   no divide is issued - confirm against the packing routine. */
3788#ifdef LN
	/* 2x2 substitution, second unknown first. */
3789	LDF	[AO +  3 * SIZE], a1
3790	LDF	[AO +  2 * SIZE], a2
3791	LDF	[AO +  0 * SIZE], a3
3792
3793	FMUL	a1, c02, c02	/* c02 *= AO[3] */
3794	FMUL	a2, c02, t1
3795	FSUB	c01, t1, c01	/* c01 -= AO[2] * c02 */
3796	FMUL	a3, c01, c01	/* c01 *= AO[0] */
3797#endif
3798
3799#ifdef LT
	/* 2x2 substitution, first unknown first. */
3800	LDF	[AO +  0 * SIZE], a1
3801	LDF	[AO +  1 * SIZE], a2
3802	LDF	[AO +  3 * SIZE], a3
3803
3804	FMUL	a1, c01, c01	/* c01 *= AO[0] */
3805	FMUL	a2, c01, t1
3806	FSUB	c02, t1, c02	/* c02 -= AO[1] * c01 */
3807	FMUL	a3, c02, c02	/* c02 *= AO[3] */
3808#endif
3809
3810#ifdef RN
3811	LDF	[BO +  0 * SIZE], a1
3812
3813	FMUL	a1, c01, c01	/* both results scaled by BO[0] */
3814	FMUL	a1, c02, c02
3815#endif
3816
3817#ifdef RT
3818	LDF	[BO +  0 * SIZE], a1
3819
3820	FMUL	a1, c01, c01	/* both results scaled by BO[0] */
3821	FMUL	a1, c02, c02
3822#endif
3823
3824#ifdef LN
3825	add	C1, -2 * SIZE, C1	/* LN: step C1 back two elements before storing */
3826#endif
3827
	/* Write the results back into the buffer they were subtracted from... */
3828#if defined(LN) || defined(LT)
3829	STF	c01, [BO +  0 * SIZE]
3830	STF	c02, [BO +  1 * SIZE]
3831#else
3832	STF	c01, [AO +  0 * SIZE]
3833	STF	c02, [AO +  1 * SIZE]
3834#endif
3835
	/* ...and into C. */
3836	STF	c01, [C1 + 0 * SIZE]
3837	STF	c02, [C1 + 1 * SIZE]
3838
	/* Clear the product temporaries. */
3839	FMOV	FZERO, t1
3840	FMOV	FZERO, t2
3841	FMOV	FZERO, t3
3842	FMOV	FZERO, t4
3843
3844#ifndef LN
3845	add	C1, 2 * SIZE, C1	/* non-LN: step C1 forward two elements */
3846#endif
3847
3848#ifdef RT
3849	sll	K, 1 + BASE_SHIFT, TEMP1
3850	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << (1 + BASE_SHIFT) */
3851#endif
3852
3853#if defined(LT) || defined(RN)
	/* Advance AO by (K - KK) << (1 + BASE_SHIFT) and BO by
	   (K - KK) << BASE_SHIFT. */
3854	sub	K, KK, TEMP1
3855	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
3856	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
3857	add	AO, TEMP2, AO
3858	add	BO, TEMP1, BO
3859#endif
3860
3861#ifdef LT
3862	add	KK, 2, KK	/* LT: KK += 2 */
3863#endif
3864
3865#ifdef LN
3866	sub	KK, 2, KK	/* LN: KK -= 2 */
3867#endif
3868
3869.LL270:
	/* Loop over M >> 2 tiles of four C1 elements each (loop head .LL221).
	   Per tile: accumulate c01-c04 = sum(AO[4k + i] * BO[k]), i = 0..3
	   (four k per pass in .LL222, one per pass in .LL226), form
	   (stored value - sum), apply the variant-specific 4x4 substitution
	   or scaling, store to BO or AO and to C1, then update the pointers,
	   KK and the tile counter I. */
3870	sra	M, 2, I	/* I = M >> 2: number of 4-element tiles */
3871	cmp	I, 0
3872	ble,pn	%icc, .LL299	/* none: go to the section wrap-up */
3873	nop	/* branch delay slot */
3874
3875.LL221:
3876#if defined(LT) || defined(RN)
3877	sra	KK, 2, L	/* L = KK >> 2: passes of the unrolled loop */
3878
3879	mov	B, BO	/* BO starts at B */
3880	cmp	L,  0	/* icc is consumed by the ble before .LL222 */
3881#else
3882
3883#ifdef LN
3884	sll	K,  2 + BASE_SHIFT, TEMP1
3885	sub	AORIG, TEMP1, AORIG	/* LN: AORIG -= K << (2 + BASE_SHIFT) */
3886#endif
3887
3888	sll	KK, 2 + BASE_SHIFT, TEMP1	/* A offset: KK << (2 + BASE_SHIFT) */
3889	sll	KK, 0 + BASE_SHIFT, TEMP2	/* B offset: KK << BASE_SHIFT */
3890
3891	add	AORIG, TEMP1, AO
3892	add	B,     TEMP2, BO
3893
3894	sub	K, KK, TEMP1	/* TEMP1 = K - KK (reused at .LL225) */
3895	sra	TEMP1, 2, L	/* L = (K - KK) >> 2 */
3896	cmp	L,  0	/* icc is consumed by the ble before .LL222 */
3897#endif
3898
	/* Preload a1-a4 / b1-b4 and clear c01-c04 and t1-t4. */
3899	LDF	[AO + 0 * SIZE], a1
3900	FMOV	FZERO, c01
3901	LDF	[BO + 0 * SIZE], b1
3902	FMOV	FZERO, t1
3903
3904	LDF	[AO + 1 * SIZE], a2
3905	FMOV	FZERO, c02
3906	LDF	[BO + 1 * SIZE], b2
3907	FMOV	FZERO, t2
3908
3909	LDF	[AO + 2 * SIZE], a3
3910	FMOV	FZERO, c03
3911	LDF	[BO + 2 * SIZE], b3
3912	FMOV	FZERO, t3
3913
3914	LDF	[AO + 3 * SIZE], a4
3915	FMOV	FZERO, c04
3916	LDF	[BO + 3 * SIZE], b4
3917	FMOV	FZERO, t4
3918
	/* Prefetch the C1 area this tile will store to (function code 2). */
3919#ifdef LN
3920	prefetch [C1 - 3 * SIZE], 2
3921#else
3922	prefetch [C1 + 3 * SIZE], 2
3923#endif
3924
3925	ble,pn	%icc, .LL225	/* L <= 0: no unrolled passes */
3926	prefetch [C1 + 4 * SIZE], 2	/* delay slot: executed on both paths */
3927
3928.LL222:
	/* Unrolled pass: four groups, one per B value b1..b4.  In each group
	   t1-t4 = a1-a4 * bN, and every FADD adds the product computed in
	   the previous group into c01-c04.  BO is advanced by 4 elements at
	   the top, so the later BO loads use offsets 0..3; AO is advanced by
	   16 elements at the bottom. */
3929	FADD	c01, t1, c01
3930	add	BO,  4 * SIZE, BO
3931	FMUL	a1, b1, t1
3932	LDF	[AO +  4 * SIZE], a1
3933
3934	FADD	c02, t2, c02
3935	FMUL	a2, b1, t2
3936	LDF	[AO +  5 * SIZE], a2
3937
3938	FADD	c03, t3, c03
3939	add	L, -1, L
3940	FMUL	a3, b1, t3
3941	LDF	[AO +  6 * SIZE], a3
3942
3943	FADD	c04, t4, c04
3944	FMUL	a4, b1, t4
3945	LDF	[AO +  7 * SIZE], a4
3946	LDF	[BO +  0 * SIZE], b1
3947
3948	FADD	c01, t1, c01
3949	cmp	L,  0	/* sets icc for the bg at the end of the pass */
3950	FMUL	a1, b2, t1
3951	LDF	[AO +  8 * SIZE], a1
3952
3953	FADD	c02, t2, c02
3954	FMUL	a2, b2, t2
3955	LDF	[AO +  9 * SIZE], a2
3956
3957	FADD	c03, t3, c03
3958	FMUL	a3, b2, t3
3959	LDF	[AO + 10 * SIZE], a3
3960
3961	FADD	c04, t4, c04
3962	FMUL	a4, b2, t4
3963	LDF	[AO + 11 * SIZE], a4
3964	LDF	[BO +  1 * SIZE], b2
3965
3966	FADD	c01, t1, c01
3967	FMUL	a1, b3, t1
3968	LDF	[AO + 12 * SIZE], a1
3969
3970	FADD	c02, t2, c02
3971	FMUL	a2, b3, t2
3972	LDF	[AO + 13 * SIZE], a2
3973
3974	FADD	c03, t3, c03
3975	FMUL	a3, b3, t3
3976	LDF	[AO + 14 * SIZE], a3
3977
3978	FADD	c04, t4, c04
3979	FMUL	a4, b3, t4
3980	LDF	[AO + 15 * SIZE], a4
3981	LDF	[BO +  2 * SIZE], b3
3982
3983	FADD	c01, t1, c01
3984	FMUL	a1, b4, t1
3985	LDF	[AO + 16 * SIZE], a1
3986
3987	FADD	c02, t2, c02
3988	FMUL	a2, b4, t2
3989	LDF	[AO + 17 * SIZE], a2
3990
3991	FADD	c03, t3, c03
3992	FMUL	a3, b4, t3
3993	LDF	[AO + 18 * SIZE], a3
3994
3995	FADD	c04, t4, c04
3996	FMUL	a4, b4, t4
3997	LDF	[AO + 19 * SIZE], a4
3998	add	AO, 16 * SIZE, AO
3999
4000	bg,pt	%icc, .LL222	/* repeat while L > 0 */
4001	LDF	[BO +  3 * SIZE], b4	/* delay slot */
4002
4003.LL225:
	/* Remainder count: low two bits of the same length used above. */
4004#if defined(LT) || defined(RN)
4005	and	KK,  3, L
4006#else
4007	and	TEMP1, 3, L	/* TEMP1 still holds K - KK */
4008#endif
4009	cmp	L,  0
4010	ble,a,pn %icc, .LL229	/* no remainder: go straight to the final sums */
4011	nop	/* delay slot (annul bit set on the branch) */
4012
4013.LL226:
	/* Single-step pass: t1-t4 = a1-a4 * b1; AO advances by 4 elements,
	   BO by 1 (done at the top, so the reload of b1 uses offset 0). */
4014	FADD	c01, t1, c01
4015	add	BO, 1 * SIZE, BO
4016	FMUL	a1, b1, t1
4017	LDF	[AO + 4 * SIZE], a1
4018
4019	FADD	c02, t2, c02
4020	add	L, -1, L
4021	FMUL	a2, b1, t2
4022	LDF	[AO + 5 * SIZE], a2
4023
4024	FADD	c03, t3, c03
4025	cmp	L, 0
4026	FMUL	a3, b1, t3
4027	LDF	[AO + 6 * SIZE], a3
4028
4029	FADD	c04, t4, c04
4030	FMUL	a4, b1, t4
4031	LDF	[AO + 7 * SIZE], a4
4032	add	AO, 4 * SIZE, AO
4033
4034	bg,pt	%icc, .LL226	/* repeat while L > 0 */
4035	LDF	[BO + 0 * SIZE], b1	/* delay slot */
4036
4037.LL229:
	/* Add the products still pending in t1-t4. */
4038	FADD	c01, t1, c01
4039	FADD	c02, t2, c02
4040	FADD	c03, t3, c03
4041	FADD	c04, t4, c04
4042
4043#if defined(LN) || defined(RT)
	/* Re-point AO / BO: TEMP1 = KK - 4 (LN) or KK - 1 (otherwise);
	   AO = AORIG + (TEMP1 << (2 + BASE_SHIFT)),
	   BO = B + (TEMP1 << BASE_SHIFT). */
4044#ifdef LN
4045	sub	KK, 4, TEMP1
4046#else
4047	sub	KK, 1, TEMP1
4048#endif
4049	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
4050	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
4051	add	AORIG, TEMP2, AO
4052	add	B,     TEMP1, BO
4053#endif
4054
	/* c0x = stored value - accumulated sum. */
4055#if defined(LN) || defined(LT)
4056	LDF	[BO +  0 * SIZE], a1
4057	LDF	[BO +  1 * SIZE], a2
4058	LDF	[BO +  2 * SIZE], a3
4059	LDF	[BO +  3 * SIZE], a4
4060
4061	FSUB	a1, c01, c01
4062	FSUB	a2, c02, c02
4063	FSUB	a3, c03, c03
4064	FSUB	a4, c04, c04
4065#else
4066	LDF	[AO +  0 * SIZE], a1
4067	LDF	[AO +  1 * SIZE], a2
4068	LDF	[AO +  2 * SIZE], a3
4069	LDF	[AO +  3 * SIZE], a4
4070
4071	FSUB	a1, c01, c01
4072	FSUB	a2, c02, c02
4073	FSUB	a3, c03, c03
4074	FSUB	a4, c04, c04
4075#endif
4076
	/* NOTE(review): AO[0], AO[5], AO[10], AO[15] and BO[0] are used as
	   plain multipliers below, so they are presumably pre-inverted
	   diagonal entries - confirm against the packing routine. */
4077#ifdef LN
	/* 4x4 substitution from the last unknown (c04) back to the first,
	   reading AO at offsets 15..12, 10..8, 5..4 and 0. */
4078	LDF	[AO + 15 * SIZE], a1
4079	LDF	[AO + 14 * SIZE], a2
4080	LDF	[AO + 13 * SIZE], a3
4081	LDF	[AO + 12 * SIZE], a4
4082
4083	FMUL	a1, c04, c04	/* c04 *= AO[15] */
4084	FMUL	a2, c04, t1
4085
4086	FSUB	c03, t1, c03	/* c03 -= AO[14] * c04 */
4087	FMUL	a3, c04, t1
4088
4089	FSUB	c02, t1, c02	/* c02 -= AO[13] * c04 */
4090	FMUL	a4, c04, t1
4091
4092	FSUB	c01, t1, c01	/* c01 -= AO[12] * c04 */
4093
4094	LDF	[AO + 10 * SIZE], a1
4095	LDF	[AO +  9 * SIZE], a2
4096	LDF	[AO +  8 * SIZE], a3
4097
4098	FMUL	a1, c03, c03	/* c03 *= AO[10] */
4099	FMUL	a2, c03, t1
4100
4101	FSUB	c02, t1, c02	/* c02 -= AO[9] * c03 */
4102	FMUL	a3, c03, t1
4103	FSUB	c01, t1, c01	/* c01 -= AO[8] * c03 */
4104
4105	LDF	[AO +  5 * SIZE], a1
4106	LDF	[AO +  4 * SIZE], a2
4107
4108	FMUL	a1, c02, c02	/* c02 *= AO[5] */
4109	FMUL	a2, c02, t1
4110	FSUB	c01, t1, c01	/* c01 -= AO[4] * c02 */
4111
4112	LDF	[AO +  0 * SIZE], a1
4113
4114	FMUL	a1, c01, c01	/* c01 *= AO[0] */
4115#endif
4116
4117#ifdef LT
	/* 4x4 substitution from the first unknown (c01) forward to the last,
	   reading AO at offsets 0..3, 5..7, 10..11 and 15. */
4118	LDF	[AO +  0 * SIZE], a1
4119	LDF	[AO +  1 * SIZE], a2
4120	LDF	[AO +  2 * SIZE], a3
4121	LDF	[AO +  3 * SIZE], a4
4122
4123	FMUL	a1, c01, c01	/* c01 *= AO[0] */
4124	FMUL	a2, c01, t1
4125	FSUB	c02, t1, c02	/* c02 -= AO[1] * c01 */
4126	FMUL	a3, c01, t1
4127	FSUB	c03, t1, c03	/* c03 -= AO[2] * c01 */
4128	FMUL	a4, c01, t1
4129	FSUB	c04, t1, c04	/* c04 -= AO[3] * c01 */
4130
4131	LDF	[AO +  5 * SIZE], a1
4132	LDF	[AO +  6 * SIZE], a2
4133	LDF	[AO +  7 * SIZE], a3
4134
4135	FMUL	a1, c02, c02	/* c02 *= AO[5] */
4136	FMUL	a2, c02, t1
4137	FSUB	c03, t1, c03	/* c03 -= AO[6] * c02 */
4138	FMUL	a3, c02, t1
4139	FSUB	c04, t1, c04	/* c04 -= AO[7] * c02 */
4140
4141	LDF	[AO + 10 * SIZE], a1
4142	LDF	[AO + 11 * SIZE], a2
4143
4144	FMUL	a1, c03, c03	/* c03 *= AO[10] */
4145	FMUL	a2, c03, t1
4146
4147	FSUB	c04, t1, c04	/* c04 -= AO[11] * c03 */
4148
4149	LDF	[AO + 15 * SIZE], a1
4150
4151	FMUL	a1, c04, c04	/* c04 *= AO[15] */
4152#endif
4153
4154#ifdef RN
4155	LDF	[BO +  0 * SIZE], a1
4156
4157	FMUL	a1, c01, c01	/* all four results scaled by BO[0] */
4158	FMUL	a1, c02, c02
4159	FMUL	a1, c03, c03
4160	FMUL	a1, c04, c04
4161#endif
4162
4163#ifdef RT
4164	LDF	[BO +  0 * SIZE], a1
4165
4166	FMUL	a1, c01, c01	/* all four results scaled by BO[0] */
4167	FMUL	a1, c02, c02
4168	FMUL	a1, c03, c03
4169	FMUL	a1, c04, c04
4170#endif
4171
4172#ifdef LN
4173	add	C1, -4 * SIZE, C1	/* LN: step C1 back four elements before storing */
4174#endif
4175
	/* Write the results back into the buffer they were subtracted from... */
4176#if defined(LN) || defined(LT)
4177	STF	c01, [BO +  0 * SIZE]
4178	STF	c02, [BO +  1 * SIZE]
4179	STF	c03, [BO +  2 * SIZE]
4180	STF	c04, [BO +  3 * SIZE]
4181#else
4182	STF	c01, [AO +  0 * SIZE]
4183	STF	c02, [AO +  1 * SIZE]
4184	STF	c03, [AO +  2 * SIZE]
4185	STF	c04, [AO +  3 * SIZE]
4186#endif
4187
	/* ...and into C. */
4188	STF	c01, [C1 + 0 * SIZE]
4189	STF	c02, [C1 + 1 * SIZE]
4190	STF	c03, [C1 + 2 * SIZE]
4191	STF	c04, [C1 + 3 * SIZE]
4192
	/* Clear the product temporaries. */
4193	FMOV	FZERO, t1
4194	FMOV	FZERO, t2
4195	FMOV	FZERO, t3
4196	FMOV	FZERO, t4
4197
4198#ifndef LN
4199	add	C1, 4 * SIZE, C1	/* non-LN: step C1 forward four elements */
4200#endif
4201
4202#ifdef RT
4203	sll	K, 2 + BASE_SHIFT, TEMP1
4204	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << (2 + BASE_SHIFT) */
4205#endif
4206
4207#if defined(LT) || defined(RN)
	/* Advance AO by (K - KK) << (2 + BASE_SHIFT) and BO by
	   (K - KK) << BASE_SHIFT. */
4208	sub	K, KK, TEMP1
4209	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
4210	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
4211	add	AO, TEMP2, AO
4212	add	BO, TEMP1, BO
4213#endif
4214
4215#ifdef LT
4216	add	KK, 4, KK	/* LT: KK += 4 */
4217#endif
4218
4219#ifdef LN
4220	sub	KK, 4, KK	/* LN: KK -= 4 */
4221#endif
4222
4223	add	I, -1, I	/* one tile done */
4224	cmp	I, 0
4225
4226	bg,pt	%icc, .LL221	/* next tile while I > 0 */
4227	nop	/* branch delay slot */
4228
4229
4230
4231.LL299:
	/* Wrap-up of the N & 1 section: final B and KK adjustment before
	   falling through to the return at .LL999. */
4232#ifdef LN
4233	sll	K, 0 + BASE_SHIFT, TEMP1
4234	add	B, TEMP1, B	/* LN: B += K << BASE_SHIFT */
4235#endif
4236
4237#if defined(LT) || defined(RN)
4238	mov	BO, B	/* B continues from where BO ended up */
4239#endif
4240
4241#ifdef RN
4242	add	KK, 1, KK	/* RN: KK += 1 */
4243#endif
4244
4245#ifdef RT
4246	sub	KK, 1, KK	/* RT: KK -= 1 */
4247#endif
4248
4249
4250.LL999:
	/* Common exit: return to the caller with 0 in %o0. */
4251	return	%i7 + 8	/* jump to the caller's return address (%i7 + 8) */
4252	clr	%o0	/* delay slot: %o0 = 0; per SPARC V9, RETURN has already restored the caller's window here */
4253
	/* NOTE(review): EPILOGUE is a macro defined outside this chunk,
	   presumably closing the function (size/type directives) - confirm. */
4254	EPILOGUE
4255