1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Incoming-argument register map for Linux builds.
 *
 *   M, IS       first two integer arguments (r3, r4); the prologue turns
 *               IS into M - IS before it is used
 *   A, LDA      matrix pointer and its leading dimension
 *   X, INCX     x vector and its stride
 *   Y, INCY     y vector and its stride
 *   BUFFER      scratch area, always read from the caller's frame into r14
 *
 * alpha itself arrives in floating-point registers (alpha_r / alpha_i).
 * In the 64-bit layout A starts at r7 rather than r5, and Y / INCY are
 * loaded from the caller's frame by the prologue into r5 / r6.
 * Presumably the two alpha values reserve r5 / r6 in that ABI -- TODO
 * confirm against the ABI document.
 */
#ifdef linux
#ifdef __64BIT__

/* 64-bit: Y, INCY and BUFFER come from the stack. */
#define M       r3
#define IS      r4
#define A       r7
#define LDA     r8
#define X       r9
#define INCX    r10
#define Y       r5
#define INCY    r6
#define BUFFER  r14

#else

/* 32-bit: everything except BUFFER is passed in r3-r10. */
#define M       r3
#define IS      r4
#define A       r5
#define LDA     r6
#define X       r7
#define INCX    r8
#define Y       r9
#define INCY    r10
#define BUFFER  r14

#endif  /* __64BIT__ */
#endif  /* linux */
48
/*
 * Incoming-argument register map for AIX and Darwin builds.
 *
 * Two layouts exist:
 *   - 32-bit with DOUBLE: only A and LDA are left in registers (r9, r10);
 *     X, INCX, Y, INCY and BUFFER are read from the caller's frame by the
 *     prologue into r5-r8 and r14.
 *   - every other combination: A, LDA, X, INCX are in r7-r10; Y, INCY and
 *     BUFFER are read from the caller's frame into r5, r6 and r14.
 *
 * Presumably the difference comes from how many integer-register slots
 * the two floating-point alpha arguments occupy in each ABI -- TODO
 * confirm against the ABI document.
 */
#if defined(_AIX) || defined(__APPLE__)
#if defined(__64BIT__) || !defined(DOUBLE)

/* 64-bit, or 32-bit single precision. */
#define M       r3
#define IS      r4
#define A       r7
#define LDA     r8
#define X       r9
#define INCX    r10
#define Y       r5
#define INCY    r6
#define BUFFER  r14

#else

/* 32-bit double precision. */
#define M       r3
#define IS      r4
#define A       r9
#define LDA     r10
#define X       r5
#define INCX    r6
#define Y       r7
#define INCY    r8
#define BUFFER  r14

#endif  /* __64BIT__ || !DOUBLE */
#endif  /* _AIX || __APPLE__ */
72
/*
 * Integer work registers.
 */

/* Index names (presumably loop counters -- TODO confirm where they are used). */
#define I       r11
#define J       r12

/* AO1 / AO2: pointers to the two matrix columns handled per outer pass. */
#define AO1     r15
#define AO2     r16

/* XX / YY: running pointers that walk x and y inside the inner loops. */
#define XX      r19
#define YY      r20

/* NEW_Y: the y actually updated -- Y itself, or a zeroed area in BUFFER
   when INCY is not the contiguous stride. */
#define NEW_Y   r21

/* TEMP: general scratch (address arithmetic, loop-bound comparisons). */
#define TEMP    r22

/* PREA: byte offset handed to the prefetch hints (PREFETCHSIZE_A * SIZE). */
#define PREA    r24
83
/*
 * Floating-point register aliases.
 *
 *   y01..y08     f0-f7     elements of y being updated (loaded through YY
 *                          and stored back); y05 / y06 double as temporaries
 *                          that hold alpha while atemp* and the final sums
 *                          are formed
 *   xtemp1..8    f8-f15    elements of x (xtemp1..4 are the ones loaded
 *                          through XX / TEMP)
 *   atemp1..4    f16-f19   alpha times the x entries belonging to the
 *                          columns currently processed
 *   xsum1..4     f20-f23   running sums for those columns
 *   a1..a8       f24-f31   matrix elements: a1..a4 from AO1, a5..a8 from AO2
 */
#define y01     f0
#define y02     f1
#define y03     f2
#define y04     f3
#define y05     f4
#define y06     f5
#define y07     f6
#define y08     f7

#define xtemp1  f8
#define xtemp2  f9
#define xtemp3  f10
#define xtemp4  f11
#define xtemp5  f12
#define xtemp6  f13
#define xtemp7  f14
#define xtemp8  f15

#define atemp1  f16
#define atemp2  f17
#define atemp3  f18
#define atemp4  f19

#define xsum1   f20
#define xsum2   f21
#define xsum3   f22
#define xsum4   f23

#define a1      f24
#define a2      f25
#define a3      f26
#define a4      f27
#define a5      f28
#define a6      f29
#define a7      f30
#define a8      f31

/*
 * Registers in which alpha (real, imaginary) arrives.  They are the same
 * registers as y02 / y03, so the prologue spills both to ALPHA_R / ALPHA_I
 * before any y element is loaded.
 */
#define alpha_r f1
#define alpha_i f2
123
/*
 * PREFETCHSIZE_A: look-ahead distance, in elements, for the prefetch hints
 * issued on the matrix columns.  The routine multiplies it by SIZE and keeps
 * the product in PREA, which is the index operand of the dcbt / DCBT hints.
 *
 * The value is tuned per core.  It only steers cache hints, so it affects
 * speed, never the computed result.
 */
#if defined(PPCG4) || defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  32
#endif

#ifdef CELL
#define PREFETCHSIZE_A  72
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  96
#endif

#ifdef POWER6
#define PREFETCHSIZE_A  112
#endif

/*
 * Fallback for cores that are not listed above.  Without it the symbol
 * stays undefined and "li PREA, PREFETCHSIZE_A * SIZE" cannot be assembled.
 * 24 is the generic PowerPC value used above; the product with SIZE still
 * fits the signed 16-bit immediate of li.
 */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A  24
#endif
151
/*
 * NOP1 / NOP2: filler slots placed between the floating-point operations of
 * the unrolled loops.  On POWER4, POWER5, POWER6 and PPC970 they expand to
 * nothing.  Everywhere else they expand to a register moved onto itself,
 * which leaves all state unchanged (presumably to help instruction
 * pairing on those cores -- TODO confirm).
 */
#if !defined(POWER4) && !defined(POWER5) && !defined(POWER6) && !defined(PPC970)
#define NOP1    mr      LDA, LDA
#define NOP2    mr      INCX, INCX
#else
#define NOP1
#define NOP2
#endif
159
160#ifndef NEEDPARAM
161
/*
 * Stack frame layout (offsets from SP after the prologue lowers it by
 * STACKSIZE):
 *
 *                          32-bit        64-bit
 *   f14-f31 save area      0 .. 143      0 .. 143
 *   r14-r27 save area    144 .. 199    144 .. 255
 *   ALPHA_R                   200           256
 *   ALPHA_I                   208           264
 *   FZERO                     216           272
 *   STACKSIZE                 224           280
 *
 * ALPHA_R / ALPHA_I keep alpha across the routine.  FZERO is a slot the
 * prologue fills with zero bits so a floating-point 0.0 can be loaded
 * from it.
 *
 * NOTE(review): 280 is not a multiple of 16 -- confirm whether the stack
 * alignment required by the target ABI matters for this routine.
 */
#ifdef __64BIT__
#define STACKSIZE   280
#define ALPHA_R     256(SP)
#define ALPHA_I     264(SP)
#define FZERO       272(SP)
#else
#define STACKSIZE   224
#define ALPHA_R     200(SP)
#define ALPHA_I     208(SP)
#define FZERO       216(SP)
#endif
173
/*
 * FMADD1 / FMADD2 choose the sign of the two imaginary-part products that
 * are accumulated into xsum*:
 *
 *   FMADD1  xsumRe, x_im, a_im, xsumRe
 *   FMADD2  xsumIm, x_re, a_im, xsumIm
 *
 * Without HEMV (symmetric case) the first product is subtracted and the
 * second added, i.e. a plain complex multiply a * x.  With HEMV the signs
 * are swapped, which corresponds to multiplying by the conjugate of a.
 * (This assumes FNMSUB d, a, c, b yields b - a * c like the PowerPC fnmsub
 * instruction; the macro itself comes from common.h.)
 */
#ifdef HEMV
#define FMADD1  FMADD
#define FMADD2  FNMSUB
#else
#define FMADD1  FNMSUB
#define FMADD2  FMADD
#endif
181
182	PROLOGUE
183	PROFCODE
184
185	addi	SP,   SP, -STACKSIZE
186	li	r0,   0
187
188	stfd	f14,     0(SP)
189	stfd	f15,     8(SP)
190	stfd	f16,    16(SP)
191	stfd	f17,    24(SP)
192	stfd	f18,    32(SP)
193	stfd	f19,    40(SP)
194	stfd	f20,    48(SP)
195	stfd	f21,    56(SP)
196	stfd	f22,    64(SP)
197	stfd	f23,    72(SP)
198	stfd	f24,    80(SP)
199	stfd	f25,    88(SP)
200	stfd	f26,    96(SP)
201	stfd	f27,   104(SP)
202	stfd	f28,   112(SP)
203	stfd	f29,   120(SP)
204	stfd	f30,   128(SP)
205	stfd	f31,   136(SP)
206
207#ifdef __64BIT__
208	std	r0,    FZERO
209	std	r14,   144(SP)
210	std	r15,   152(SP)
211	std	r16,   160(SP)
212	std	r17,   168(SP)
213	std	r18,   176(SP)
214	std	r19,   184(SP)
215	std	r20,   192(SP)
216	std	r21,   200(SP)
217	std	r22,   208(SP)
218	std	r23,   216(SP)
219	std	r24,   224(SP)
220	std	r25,   232(SP)
221	std	r26,   240(SP)
222	std	r27,   248(SP)
223#else
224	stw	r0,    0 + FZERO
225	stw	r0,    4 + FZERO
226	stw	r14,   144(SP)
227	stw	r15,   148(SP)
228	stw	r16,   152(SP)
229	stw	r17,   156(SP)
230	stw	r18,   160(SP)
231	stw	r19,   164(SP)
232	stw	r20,   168(SP)
233	stw	r21,   172(SP)
234	stw	r22,   176(SP)
235	stw	r23,   180(SP)
236	stw	r24,   184(SP)
237	stw	r25,   188(SP)
238	stw	r26,   192(SP)
239	stw	r27,   196(SP)
240#endif
241
242#ifdef linux
243#ifndef __64BIT__
244	lwz	BUFFER,   56 + STACKSIZE(SP)
245#else
246	ld	Y,       112 + STACKSIZE(SP)
247	ld	INCY,    120 + STACKSIZE(SP)
248	ld	BUFFER,  128 + STACKSIZE(SP)
249#endif
250#endif
251
252#if defined(_AIX) || defined(__APPLE__)
253#ifndef __64BIT__
254#ifdef DOUBLE
255	lwz	X,       56 + STACKSIZE(SP)
256	lwz	INCX,    60 + STACKSIZE(SP)
257	lwz	Y,       64 + STACKSIZE(SP)
258	lwz	INCY,    68 + STACKSIZE(SP)
259	lwz	BUFFER,  72 + STACKSIZE(SP)
260#else
261	lwz	Y,       56 + STACKSIZE(SP)
262	lwz	INCY,    60 + STACKSIZE(SP)
263	lwz	BUFFER,  64 + STACKSIZE(SP)
264#endif
265#else
266	ld	Y,       112 + STACKSIZE(SP)
267	ld	INCY,    120 + STACKSIZE(SP)
268	ld	BUFFER,  128 + STACKSIZE(SP)
269#endif
270#endif
271
272	STFD	alpha_r, ALPHA_R
273	STFD	alpha_i, ALPHA_I
274
275	slwi	LDA,  LDA,  ZBASE_SHIFT
276	slwi	INCX, INCX, ZBASE_SHIFT
277	slwi	INCY, INCY, ZBASE_SHIFT
278
279	li	PREA, PREFETCHSIZE_A * SIZE
280	sub	IS, M, IS
281
282	cmpwi	cr0, M, 0
283	ble-	LL(999)
284
285	mullw	TEMP, IS, LDA
286	add	A, A, TEMP
287
288	cmpwi	cr0, INCX, 2 * SIZE
289	beq	LL(05)
290
291	mr	XX, X
292	mr	X, BUFFER
293
294	srawi.	r0, M, 2
295	mtspr	CTR, r0
296	ble	LL(03)
297	.align 4
298
299LL(01):
300	LFD	a1, 0 * SIZE(XX)
301	LFD	a2, 1 * SIZE(XX)
302	add	XX, XX, INCX
303	LFD	a3, 0 * SIZE(XX)
304	LFD	a4, 1 * SIZE(XX)
305	add	XX, XX, INCX
306	LFD	a5, 0 * SIZE(XX)
307	LFD	a6, 1 * SIZE(XX)
308	add	XX, XX, INCX
309	LFD	a7, 0 * SIZE(XX)
310	LFD	a8, 1 * SIZE(XX)
311	add	XX, XX, INCX
312
313	dcbt	XX, PREA
314	dcbtst	BUFFER, PREA
315
316	STFD	a1, 0 * SIZE(BUFFER)
317	STFD	a2, 1 * SIZE(BUFFER)
318	STFD	a3, 2 * SIZE(BUFFER)
319	STFD	a4, 3 * SIZE(BUFFER)
320	STFD	a5, 4 * SIZE(BUFFER)
321	STFD	a6, 5 * SIZE(BUFFER)
322	STFD	a7, 6 * SIZE(BUFFER)
323	STFD	a8, 7 * SIZE(BUFFER)
324
325	addi	BUFFER, BUFFER, 8 * SIZE
326	bdnz	LL(01)
327	.align 4
328
329LL(03):
330	andi.	r0, M, 3
331	mtspr	CTR, r0
332	ble	LL(05)
333	.align 4
334
335LL(04):
336	LFD	a1, 0 * SIZE(XX)
337	LFD	a2, 1 * SIZE(XX)
338	add	XX, XX, INCX
339
340	STFD	a1, 0 * SIZE(BUFFER)
341	STFD	a2, 1 * SIZE(BUFFER)
342
343	addi	BUFFER, BUFFER, 2 * SIZE
344	bdnz	LL(04)
345	.align 4
346
347LL(05):
348	mr	NEW_Y, Y
349	lfd	f0, FZERO
350
351	cmpwi	cr0, INCY, 2 * SIZE
352	beq	LL(10)
353
354	mr	NEW_Y, BUFFER
355
356	addi	r0, M,  3
357	srawi.	r0, r0, 2
358	mtspr	CTR, r0
359	.align 4
360
361LL(06):
362	STFD	f0, 0 * SIZE(BUFFER)
363	STFD	f0, 1 * SIZE(BUFFER)
364	STFD	f0, 2 * SIZE(BUFFER)
365	STFD	f0, 3 * SIZE(BUFFER)
366	STFD	f0, 4 * SIZE(BUFFER)
367	STFD	f0, 5 * SIZE(BUFFER)
368	STFD	f0, 6 * SIZE(BUFFER)
369	STFD	f0, 7 * SIZE(BUFFER)
370	addi	BUFFER, BUFFER, 8 * SIZE
371	bdnz	LL(06)
372	.align 4
373
374LL(10):
375	addi	TEMP, IS, 2
376	cmpw	cr0, TEMP, M
377	bgt	LL(20)
378	.align 4
379
380LL(11):
381	mr	AO1, A
382	add	AO2, A,   LDA
383	add	A,   AO2, LDA
384
385	slwi	TEMP,  IS,  ZBASE_SHIFT
386	add	TEMP, X, TEMP
387
388	LFD	y05, ALPHA_R
389	LFD	y06, ALPHA_I
390
391	LFD	xtemp1, 0 * SIZE(TEMP)
392	LFD	xtemp2, 1 * SIZE(TEMP)
393	LFD	xtemp3, 2 * SIZE(TEMP)
394	LFD	xtemp4, 3 * SIZE(TEMP)
395
396	FMUL	atemp1, y05, xtemp1
397	FMUL	atemp2, y06, xtemp1
398	FMUL	atemp3, y05, xtemp3
399	FMUL	atemp4, y06, xtemp3
400
401	FNMSUB	atemp1, y06, xtemp2, atemp1
402	FMADD	atemp2, y05, xtemp2, atemp2
403	FNMSUB	atemp3, y06, xtemp4, atemp3
404	FMADD	atemp4, y05, xtemp4, atemp4
405
406	lfd	xsum1, FZERO
407	fmr	xsum2, xsum1
408	fmr	xsum3, xsum1
409	fmr	xsum4, xsum1
410
411	mr	XX, X
412	mr	YY, NEW_Y
413
414	LFD	a1,  0 * SIZE(AO1)
415	LFD	a2,  1 * SIZE(AO1)
416	LFD	a3,  2 * SIZE(AO1)
417	LFD	a4,  3 * SIZE(AO1)
418
419	LFD	a5,  0 * SIZE(AO2)
420	LFD	a6,  1 * SIZE(AO2)
421	LFD	a7,  2 * SIZE(AO2)
422	LFD	a8,  3 * SIZE(AO2)
423
424	LFD	xtemp1,  0 * SIZE(XX)
425	LFD	xtemp2,  1 * SIZE(XX)
426	LFD	xtemp3,  2 * SIZE(XX)
427	LFD	xtemp4,  3 * SIZE(XX)
428
429	LFD	y01,  0 * SIZE(YY)
430	LFD	y02,  1 * SIZE(YY)
431	LFD	y03,  2 * SIZE(YY)
432	LFD	y04,  3 * SIZE(YY)
433
434	srawi.	r0,  IS, 3
435	mtspr	CTR, r0
436	ble	LL(15)
437
438	FMADD	xsum1, xtemp1, a1,  xsum1
439	DCBT(AO1, PREA)
440	FMADD	y01, atemp1, a1, y01
441	NOP2
442
443	FMADD	xsum2, xtemp2, a1,  xsum2
444	NOP1
445	FMADD	y02, atemp2, a1, y02
446	LFD	a1,  4 * SIZE(AO1)
447
448	FMADD	xsum3, xtemp1, a5,  xsum3
449	NOP1
450	FMADD	y03, atemp1, a3, y03
451	NOP2
452
453	FMADD	xsum4, xtemp2, a5,  xsum4
454	NOP1
455	FMADD	y04, atemp2, a3, y04
456	NOP2
457
458	FMADD1	xsum1, xtemp2, a2,  xsum1
459	LFD	y05,  4 * SIZE(YY)
460	FNMSUB	y01, atemp2, a2, y01
461	NOP2
462
463	FMADD2	xsum2, xtemp1, a2,  xsum2
464	LFD	y06,  5 * SIZE(YY)
465	FMADD	y02, atemp1, a2, y02
466	LFD	a2,  5 * SIZE(AO1)
467
468	FMADD1	xsum3, xtemp2, a6,  xsum3
469	LFD	xtemp2,  5 * SIZE(XX)
470	FNMSUB	y03, atemp2, a4, y03
471	NOP2
472
473	FMADD2	xsum4, xtemp1, a6,  xsum4
474	LFD	xtemp1,  4 * SIZE(XX)
475	FMADD	y04, atemp1, a4, y04
476	NOP2
477
478	FMADD	xsum1, xtemp3, a3,  xsum1
479	LFD	y07,  6 * SIZE(YY)
480	FMADD	y01, atemp3, a5, y01
481	NOP2
482
483	FMADD	xsum2, xtemp4, a3,  xsum2
484	LFD	a3,  6 * SIZE(AO1)
485	FMADD	y02, atemp4, a5, y02
486	LFD	a5,  4 * SIZE(AO2)
487
488	FMADD	xsum3, xtemp3, a7,  xsum3
489	LFD	y08,  7 * SIZE(YY)
490	FMADD	y03, atemp3, a7, y03
491	NOP2
492
493	FMADD	xsum4, xtemp4, a7,  xsum4
494	NOP1
495	FMADD	y04, atemp4, a7, y04
496	LFD	a7,  6 * SIZE(AO2)
497
498	FMADD1	xsum1, xtemp4, a4,  xsum1
499	NOP1
500	FNMSUB	y01, atemp4, a6, y01
501#	DCBT(X, PREX)
502	NOP2
503
504	FMADD2	xsum2, xtemp3, a4,  xsum2
505	LFD	a4,  7 * SIZE(AO1)
506	FMADD	y02, atemp3, a6, y02
507	LFD	a6,  5 * SIZE(AO2)
508
509	FMADD1	xsum3, xtemp4, a8,  xsum3
510	LFD	xtemp4,  7 * SIZE(XX)
511	FNMSUB	y03, atemp4, a8, y03
512	NOP2
513
514	FMADD2	xsum4, xtemp3, a8,  xsum4
515	LFD	xtemp3,  6 * SIZE(XX)
516	FMADD	y04, atemp3, a8, y04
517	LFD	a8,  7 * SIZE(AO2)
518
519	FMADD	xsum1, xtemp1, a1,  xsum1
520	STFD	y01,  0 * SIZE(YY)
521	FMADD	y05, atemp1, a1, y05
522	NOP2
523
524	FMADD	xsum2, xtemp2, a1,  xsum2
525	STFD	y02,  1 * SIZE(YY)
526	FMADD	y06, atemp2, a1, y06
527	LFD	a1,  8 * SIZE(AO1)
528
529	FMADD	xsum3, xtemp1, a5,  xsum3
530	STFD	y03,  2 * SIZE(YY)
531	FMADD	y07, atemp1, a3, y07
532	NOP2
533
534	FMADD	xsum4, xtemp2, a5,  xsum4
535	STFD	y04,  3 * SIZE(YY)
536	FMADD	y08, atemp2, a3, y08
537	NOP2
538
539	FMADD1	xsum1, xtemp2, a2,  xsum1
540	LFD	y01,  8 * SIZE(YY)
541	FNMSUB	y05, atemp2, a2, y05
542	NOP2
543
544	FMADD2	xsum2, xtemp1, a2,  xsum2
545	LFD	y02,  9 * SIZE(YY)
546	FMADD	y06, atemp1, a2, y06
547	LFD	a2,  9 * SIZE(AO1)
548
549	FMADD1	xsum3, xtemp2, a6,  xsum3
550	LFD	xtemp2,  9 * SIZE(XX)
551	FNMSUB	y07, atemp2, a4, y07
552	NOP2
553
554	FMADD2	xsum4, xtemp1, a6,  xsum4
555	LFD	xtemp1,  8 * SIZE(XX)
556	FMADD	y08, atemp1, a4, y08
557	NOP2
558
559	FMADD	xsum1, xtemp3, a3,  xsum1
560	LFD	y03, 10 * SIZE(YY)
561	FMADD	y05, atemp3, a5, y05
562	NOP2
563
564	FMADD	xsum2, xtemp4, a3,  xsum2
565	LFD	a3, 10 * SIZE(AO1)
566	FMADD	y06, atemp4, a5, y06
567	LFD	a5,  8 * SIZE(AO2)
568
569	FMADD	xsum3, xtemp3, a7,  xsum3
570	LFD	y04, 11 * SIZE(YY)
571	FMADD	y07, atemp3, a7, y07
572	NOP2
573
574	FMADD	xsum4, xtemp4, a7,  xsum4
575	NOP1
576	FMADD	y08, atemp4, a7, y08
577	LFD	a7, 10 * SIZE(AO2)
578
579	FMADD1	xsum1, xtemp4, a4,  xsum1
580	NOP1
581	FNMSUB	y05, atemp4, a6, y05
582	NOP2
583
584	FMADD2	xsum2, xtemp3, a4,  xsum2
585	LFD	a4, 11 * SIZE(AO1)
586	FMADD	y06, atemp3, a6, y06
587	LFD	a6,  9 * SIZE(AO2)
588
589	FMADD1	xsum3, xtemp4, a8,  xsum3
590	LFD	xtemp4, 11 * SIZE(XX)
591	FNMSUB	y07, atemp4, a8, y07
592	bdz	LL(13)
593	.align 4
594
595LL(12):
596	FMADD2	xsum4, xtemp3, a8,  xsum4
597	LFD	xtemp3, 10 * SIZE(XX)
598	FMADD	y08, atemp3, a8, y08
599	LFD	a8, 11 * SIZE(AO2)
600
601	FMADD	xsum1, xtemp1, a1,  xsum1
602	STFD	y05,  4 * SIZE(YY)
603	FMADD	y01, atemp1, a1, y01
604	DCBT(AO2, PREA)
605
606	FMADD	xsum2, xtemp2, a1,  xsum2
607	STFD	y06,  5 * SIZE(YY)
608	FMADD	y02, atemp2, a1, y02
609	LFD	a1, 12 * SIZE(AO1)
610
611	FMADD	xsum3, xtemp1, a5,  xsum3
612	STFD	y07,  6 * SIZE(YY)
613	FMADD	y03, atemp1, a3, y03
614	NOP2
615
616	FMADD	xsum4, xtemp2, a5,  xsum4
617	STFD	y08,  7 * SIZE(YY)
618	FMADD	y04, atemp2, a3, y04
619	NOP2
620
621	FMADD1	xsum1, xtemp2, a2,  xsum1
622	LFD	y05, 12 * SIZE(YY)
623	FNMSUB	y01, atemp2, a2, y01
624	NOP2
625
626	FMADD2	xsum2, xtemp1, a2,  xsum2
627	LFD	y06, 13 * SIZE(YY)
628	FMADD	y02, atemp1, a2, y02
629	LFD	a2, 13 * SIZE(AO1)
630
631	FMADD1	xsum3, xtemp2, a6,  xsum3
632	LFD	xtemp2, 13 * SIZE(XX)
633	FNMSUB	y03, atemp2, a4, y03
634	NOP2
635
636	FMADD2	xsum4, xtemp1, a6,  xsum4
637	LFD	xtemp1, 12 * SIZE(XX)
638	FMADD	y04, atemp1, a4, y04
639	NOP2
640
641	FMADD	xsum1, xtemp3, a3,  xsum1
642	LFD	y07, 14 * SIZE(YY)
643	FMADD	y01, atemp3, a5, y01
644	NOP2
645
646	FMADD	xsum2, xtemp4, a3,  xsum2
647	LFD	a3, 14 * SIZE(AO1)
648	FMADD	y02, atemp4, a5, y02
649	LFD	a5, 12 * SIZE(AO2)
650
651	FMADD	xsum3, xtemp3, a7,  xsum3
652	LFD	y08, 15 * SIZE(YY)
653	FMADD	y03, atemp3, a7, y03
654	NOP2
655
656	FMADD	xsum4, xtemp4, a7,  xsum4
657	NOP1
658	FMADD	y04, atemp4, a7, y04
659	LFD	a7, 14 * SIZE(AO2)
660
661	FMADD1	xsum1, xtemp4, a4,  xsum1
662	NOP1
663	FNMSUB	y01, atemp4, a6, y01
664#	DCBT(Y1, PREY)
665	NOP2
666
667	FMADD2	xsum2, xtemp3, a4,  xsum2
668	LFD	a4, 15 * SIZE(AO1)
669	FMADD	y02, atemp3, a6, y02
670	LFD	a6, 13 * SIZE(AO2)
671
672	FMADD1	xsum3, xtemp4, a8,  xsum3
673	LFD	xtemp4, 15 * SIZE(XX)
674	FNMSUB	y03, atemp4, a8, y03
675	NOP2
676
677	FMADD2	xsum4, xtemp3, a8,  xsum4
678	LFD	xtemp3, 14 * SIZE(XX)
679	FMADD	y04, atemp3, a8, y04
680	LFD	a8, 15 * SIZE(AO2)
681
682	FMADD	xsum1, xtemp1, a1,  xsum1
683	STFD	y01,  8 * SIZE(YY)
684	FMADD	y05, atemp1, a1, y05
685	NOP2
686
687	FMADD	xsum2, xtemp2, a1,  xsum2
688	STFD	y02,  9 * SIZE(YY)
689	FMADD	y06, atemp2, a1, y06
690	LFD	a1, 16 * SIZE(AO1)
691
692	FMADD	xsum3, xtemp1, a5,  xsum3
693	STFD	y03, 10 * SIZE(YY)
694	FMADD	y07, atemp1, a3, y07
695	NOP2
696
697	FMADD	xsum4, xtemp2, a5,  xsum4
698	STFD	y04, 11 * SIZE(YY)
699	FMADD	y08, atemp2, a3, y08
700	NOP2
701
702	FMADD1	xsum1, xtemp2, a2,  xsum1
703	LFD	y01, 16 * SIZE(YY)
704	FNMSUB	y05, atemp2, a2, y05
705	NOP2
706
707	FMADD2	xsum2, xtemp1, a2,  xsum2
708	LFD	y02, 17 * SIZE(YY)
709	FMADD	y06, atemp1, a2, y06
710	LFD	a2, 17 * SIZE(AO1)
711
712	FMADD1	xsum3, xtemp2, a6,  xsum3
713	LFD	xtemp2, 17 * SIZE(XX)
714	FNMSUB	y07, atemp2, a4, y07
715	NOP2
716
717	FMADD2	xsum4, xtemp1, a6,  xsum4
718	LFD	xtemp1, 16 * SIZE(XX)
719	FMADD	y08, atemp1, a4, y08
720	addi	AO2, AO2, 16 * SIZE
721
722	FMADD	xsum1, xtemp3, a3,  xsum1
723	LFD	y03, 18 * SIZE(YY)
724	FMADD	y05, atemp3, a5, y05
725	addi	XX, XX, 16 * SIZE
726
727	FMADD	xsum2, xtemp4, a3,  xsum2
728	LFD	a3, 18 * SIZE(AO1)
729	FMADD	y06, atemp4, a5, y06
730	LFD	a5,  0 * SIZE(AO2)
731
732	FMADD	xsum3, xtemp3, a7,  xsum3
733	LFD	y04, 19 * SIZE(YY)
734	FMADD	y07, atemp3, a7, y07
735	NOP2
736
737	FMADD	xsum4, xtemp4, a7,  xsum4
738	addi	AO1, AO1, 16 * SIZE
739	FMADD	y08, atemp4, a7, y08
740	LFD	a7,  2 * SIZE(AO2)
741
742	FMADD1	xsum1, xtemp4, a4,  xsum1
743	addi	YY, YY, 16 * SIZE
744	FNMSUB	y05, atemp4, a6, y05
745	NOP2
746
747	FMADD2	xsum2, xtemp3, a4,  xsum2
748	LFD	a4,  3 * SIZE(AO1)
749	FMADD	y06, atemp3, a6, y06
750	LFD	a6,  1 * SIZE(AO2)
751
752	FMADD1	xsum3, xtemp4, a8,  xsum3
753	LFD	xtemp4,  3 * SIZE(XX)
754	FNMSUB	y07, atemp4, a8, y07
755	NOP2
756
757	FMADD2	xsum4, xtemp3, a8,  xsum4
758	LFD	xtemp3,  2 * SIZE(XX)
759	FMADD	y08, atemp3, a8, y08
760	LFD	a8,  3 * SIZE(AO2)
761
762	FMADD	xsum1, xtemp1, a1,  xsum1
763	STFD	y05, -4 * SIZE(YY)
764	FMADD	y01, atemp1, a1, y01
765	DCBT(AO1, PREA)
766
767	FMADD	xsum2, xtemp2, a1,  xsum2
768	STFD	y06, -3 * SIZE(YY)
769	FMADD	y02, atemp2, a1, y02
770	LFD	a1,  4 * SIZE(AO1)
771
772	FMADD	xsum3, xtemp1, a5,  xsum3
773	STFD	y07, -2 * SIZE(YY)
774	FMADD	y03, atemp1, a3, y03
775	NOP2
776
777	FMADD	xsum4, xtemp2, a5,  xsum4
778	STFD	y08, -1 * SIZE(YY)
779	FMADD	y04, atemp2, a3, y04
780	NOP2
781
782	FMADD1	xsum1, xtemp2, a2,  xsum1
783	LFD	y05,  4 * SIZE(YY)
784	FNMSUB	y01, atemp2, a2, y01
785	NOP2
786
787	FMADD2	xsum2, xtemp1, a2,  xsum2
788	LFD	y06,  5 * SIZE(YY)
789	FMADD	y02, atemp1, a2, y02
790	LFD	a2,  5 * SIZE(AO1)
791
792	FMADD1	xsum3, xtemp2, a6,  xsum3
793	LFD	xtemp2,  5 * SIZE(XX)
794	FNMSUB	y03, atemp2, a4, y03
795	NOP2
796
797	FMADD2	xsum4, xtemp1, a6,  xsum4
798	LFD	xtemp1,  4 * SIZE(XX)
799	FMADD	y04, atemp1, a4, y04
800	NOP2
801
802	FMADD	xsum1, xtemp3, a3,  xsum1
803	LFD	y07,  6 * SIZE(YY)
804	FMADD	y01, atemp3, a5, y01
805	NOP2
806
807	FMADD	xsum2, xtemp4, a3,  xsum2
808	LFD	a3,  6 * SIZE(AO1)
809	FMADD	y02, atemp4, a5, y02
810	LFD	a5,  4 * SIZE(AO2)
811
812	FMADD	xsum3, xtemp3, a7,  xsum3
813	LFD	y08,  7 * SIZE(YY)
814	FMADD	y03, atemp3, a7, y03
815	NOP2
816
817	FMADD	xsum4, xtemp4, a7,  xsum4
818	NOP1
819	FMADD	y04, atemp4, a7, y04
820	LFD	a7,  6 * SIZE(AO2)
821
822	FMADD1	xsum1, xtemp4, a4,  xsum1
823	NOP1
824	FNMSUB	y01, atemp4, a6, y01
825#	DCBT(X, PREX)
826	NOP2
827
828	FMADD2	xsum2, xtemp3, a4,  xsum2
829	LFD	a4,  7 * SIZE(AO1)
830	FMADD	y02, atemp3, a6, y02
831	LFD	a6,  5 * SIZE(AO2)
832
833	FMADD1	xsum3, xtemp4, a8,  xsum3
834	LFD	xtemp4,  7 * SIZE(XX)
835	FNMSUB	y03, atemp4, a8, y03
836	NOP2
837
838	FMADD2	xsum4, xtemp3, a8,  xsum4
839	LFD	xtemp3,  6 * SIZE(XX)
840	FMADD	y04, atemp3, a8, y04
841	LFD	a8,  7 * SIZE(AO2)
842
843	FMADD	xsum1, xtemp1, a1,  xsum1
844	STFD	y01,  0 * SIZE(YY)
845	FMADD	y05, atemp1, a1, y05
846	NOP2
847
848	FMADD	xsum2, xtemp2, a1,  xsum2
849	STFD	y02,  1 * SIZE(YY)
850	FMADD	y06, atemp2, a1, y06
851	LFD	a1,  8 * SIZE(AO1)
852
853	FMADD	xsum3, xtemp1, a5,  xsum3
854	STFD	y03,  2 * SIZE(YY)
855	FMADD	y07, atemp1, a3, y07
856	NOP2
857
858	FMADD	xsum4, xtemp2, a5,  xsum4
859	STFD	y04,  3 * SIZE(YY)
860	FMADD	y08, atemp2, a3, y08
861	NOP2
862
863	FMADD1	xsum1, xtemp2, a2,  xsum1
864	LFD	y01,  8 * SIZE(YY)
865	FNMSUB	y05, atemp2, a2, y05
866	NOP2
867
868	FMADD2	xsum2, xtemp1, a2,  xsum2
869	LFD	y02,  9 * SIZE(YY)
870	FMADD	y06, atemp1, a2, y06
871	LFD	a2,  9 * SIZE(AO1)
872
873	FMADD1	xsum3, xtemp2, a6,  xsum3
874	LFD	xtemp2,  9 * SIZE(XX)
875	FNMSUB	y07, atemp2, a4, y07
876	NOP2
877
878	FMADD2	xsum4, xtemp1, a6,  xsum4
879	LFD	xtemp1,  8 * SIZE(XX)
880	FMADD	y08, atemp1, a4, y08
881	NOP2
882
883	FMADD	xsum1, xtemp3, a3,  xsum1
884	LFD	y03, 10 * SIZE(YY)
885	FMADD	y05, atemp3, a5, y05
886	NOP2
887
888	FMADD	xsum2, xtemp4, a3,  xsum2
889	LFD	a3, 10 * SIZE(AO1)
890	FMADD	y06, atemp4, a5, y06
891	LFD	a5,  8 * SIZE(AO2)
892
893	FMADD	xsum3, xtemp3, a7,  xsum3
894	LFD	y04, 11 * SIZE(YY)
895	FMADD	y07, atemp3, a7, y07
896	NOP2
897
898	FMADD	xsum4, xtemp4, a7,  xsum4
899	NOP1
900	FMADD	y08, atemp4, a7, y08
901	LFD	a7, 10 * SIZE(AO2)
902
903	FMADD1	xsum1, xtemp4, a4,  xsum1
904	NOP1
905	FNMSUB	y05, atemp4, a6, y05
906	NOP2
907
908	FMADD2	xsum2, xtemp3, a4,  xsum2
909	LFD	a4, 11 * SIZE(AO1)
910	FMADD	y06, atemp3, a6, y06
911	LFD	a6,  9 * SIZE(AO2)
912
913	FMADD1	xsum3, xtemp4, a8,  xsum3
914	LFD	xtemp4, 11 * SIZE(XX)
915	FNMSUB	y07, atemp4, a8, y07
916	bdnz	LL(12)
917	.align 4
918
919LL(13):
920	FMADD2	xsum4, xtemp3, a8,  xsum4
921	LFD	xtemp3, 10 * SIZE(XX)
922	FMADD	y08, atemp3, a8, y08
923	LFD	a8, 11 * SIZE(AO2)
924
925	FMADD	xsum1, xtemp1, a1,  xsum1
926	STFD	y05,  4 * SIZE(YY)
927	FMADD	y01, atemp1, a1, y01
928	NOP2
929
930	FMADD	xsum2, xtemp2, a1,  xsum2
931	STFD	y06,  5 * SIZE(YY)
932	FMADD	y02, atemp2, a1, y02
933	LFD	a1, 12 * SIZE(AO1)
934
935	FMADD	xsum3, xtemp1, a5,  xsum3
936	STFD	y07,  6 * SIZE(YY)
937	FMADD	y03, atemp1, a3, y03
938	NOP2
939
940	FMADD	xsum4, xtemp2, a5,  xsum4
941	STFD	y08,  7 * SIZE(YY)
942	FMADD	y04, atemp2, a3, y04
943	NOP2
944
945	FMADD1	xsum1, xtemp2, a2,  xsum1
946	LFD	y05, 12 * SIZE(YY)
947	FNMSUB	y01, atemp2, a2, y01
948	NOP2
949
950	FMADD2	xsum2, xtemp1, a2,  xsum2
951	LFD	y06, 13 * SIZE(YY)
952	FMADD	y02, atemp1, a2, y02
953	LFD	a2, 13 * SIZE(AO1)
954
955	FMADD1	xsum3, xtemp2, a6,  xsum3
956	LFD	xtemp2, 13 * SIZE(XX)
957	FNMSUB	y03, atemp2, a4, y03
958	NOP2
959
960	FMADD2	xsum4, xtemp1, a6,  xsum4
961	LFD	xtemp1, 12 * SIZE(XX)
962	FMADD	y04, atemp1, a4, y04
963	NOP2
964
965	FMADD	xsum1, xtemp3, a3,  xsum1
966	LFD	y07, 14 * SIZE(YY)
967	FMADD	y01, atemp3, a5, y01
968	NOP2
969
970	FMADD	xsum2, xtemp4, a3,  xsum2
971	LFD	a3, 14 * SIZE(AO1)
972	FMADD	y02, atemp4, a5, y02
973	LFD	a5, 12 * SIZE(AO2)
974
975	FMADD	xsum3, xtemp3, a7,  xsum3
976	LFD	y08, 15 * SIZE(YY)
977	FMADD	y03, atemp3, a7, y03
978	NOP2
979
980	FMADD	xsum4, xtemp4, a7,  xsum4
981	NOP1
982	FMADD	y04, atemp4, a7, y04
983	LFD	a7, 14 * SIZE(AO2)
984
985	FMADD1	xsum1, xtemp4, a4,  xsum1
986	NOP1
987	FNMSUB	y01, atemp4, a6, y01
988	NOP2
989
990	FMADD2	xsum2, xtemp3, a4,  xsum2
991	LFD	a4, 15 * SIZE(AO1)
992	FMADD	y02, atemp3, a6, y02
993	LFD	a6, 13 * SIZE(AO2)
994
995	FMADD1	xsum3, xtemp4, a8,  xsum3
996	LFD	xtemp4, 15 * SIZE(XX)
997	FNMSUB	y03, atemp4, a8, y03
998	NOP2
999
1000	FMADD2	xsum4, xtemp3, a8,  xsum4
1001	LFD	xtemp3, 14 * SIZE(XX)
1002	FMADD	y04, atemp3, a8, y04
1003	LFD	a8, 15 * SIZE(AO2)
1004
1005	FMADD	xsum1, xtemp1, a1,  xsum1
1006	STFD	y01,  8 * SIZE(YY)
1007	FMADD	y05, atemp1, a1, y05
1008	NOP2
1009
1010	FMADD	xsum2, xtemp2, a1,  xsum2
1011	STFD	y02,  9 * SIZE(YY)
1012	FMADD	y06, atemp2, a1, y06
1013	LFD	a1, 16 * SIZE(AO1)
1014
1015	FMADD	xsum3, xtemp1, a5,  xsum3
1016	STFD	y03, 10 * SIZE(YY)
1017	FMADD	y07, atemp1, a3, y07
1018	NOP2
1019
1020	FMADD	xsum4, xtemp2, a5,  xsum4
1021	STFD	y04, 11 * SIZE(YY)
1022	FMADD	y08, atemp2, a3, y08
1023	NOP2
1024
1025	FMADD1	xsum1, xtemp2, a2,  xsum1
1026	LFD	y01, 16 * SIZE(YY)
1027	FNMSUB	y05, atemp2, a2, y05
1028	NOP2
1029
1030	FMADD2	xsum2, xtemp1, a2,  xsum2
1031	LFD	y02, 17 * SIZE(YY)
1032	FMADD	y06, atemp1, a2, y06
1033	LFD	a2, 17 * SIZE(AO1)
1034
1035	FMADD1	xsum3, xtemp2, a6,  xsum3
1036	LFD	xtemp2, 17 * SIZE(XX)
1037	FNMSUB	y07, atemp2, a4, y07
1038	NOP2
1039
1040	FMADD2	xsum4, xtemp1, a6,  xsum4
1041	LFD	xtemp1, 16 * SIZE(XX)
1042	FMADD	y08, atemp1, a4, y08
1043	addi	AO2, AO2, 16 * SIZE
1044
1045	FMADD	xsum1, xtemp3, a3,  xsum1
1046	LFD	y03, 18 * SIZE(YY)
1047	FMADD	y05, atemp3, a5, y05
1048	addi	XX, XX, 16 * SIZE
1049
1050	FMADD	xsum2, xtemp4, a3,  xsum2
1051	LFD	a3, 18 * SIZE(AO1)
1052	FMADD	y06, atemp4, a5, y06
1053	LFD	a5,  0 * SIZE(AO2)
1054
1055	FMADD	xsum3, xtemp3, a7,  xsum3
1056	LFD	y04, 19 * SIZE(YY)
1057	FMADD	y07, atemp3, a7, y07
1058	NOP2
1059
1060	FMADD	xsum4, xtemp4, a7,  xsum4
1061	addi	AO1, AO1, 16 * SIZE
1062	FMADD	y08, atemp4, a7, y08
1063	LFD	a7,  2 * SIZE(AO2)
1064
1065	FMADD1	xsum1, xtemp4, a4,  xsum1
1066	addi	YY, YY, 16 * SIZE
1067	FNMSUB	y05, atemp4, a6, y05
1068	NOP2
1069
1070	FMADD2	xsum2, xtemp3, a4,  xsum2
1071	LFD	a4,  3 * SIZE(AO1)
1072	FMADD	y06, atemp3, a6, y06
1073	LFD	a6,  1 * SIZE(AO2)
1074
1075	FMADD1	xsum3, xtemp4, a8,  xsum3
1076	LFD	xtemp4,  3 * SIZE(XX)
1077	FNMSUB	y07, atemp4, a8, y07
1078	NOP2
1079
1080	FMADD2	xsum4, xtemp3, a8,  xsum4
1081	LFD	xtemp3,  2 * SIZE(XX)
1082	FMADD	y08, atemp3, a8, y08
1083	LFD	a8,  3 * SIZE(AO2)
1084
1085	STFD	y05, -4 * SIZE(YY)
1086	STFD	y06, -3 * SIZE(YY)
1087	STFD	y07, -2 * SIZE(YY)
1088	STFD	y08, -1 * SIZE(YY)
1089	.align 4
1090
1091LL(15):
1092	andi.	r0,  IS, 4
1093	ble	LL(16)
1094
1095	FMADD	xsum1, xtemp1, a1,  xsum1
1096	NOP1
1097	FMADD	y01, atemp1, a1, y01
1098	NOP2
1099
1100	FMADD	xsum2, xtemp2, a1,  xsum2
1101	NOP1
1102	FMADD	y02, atemp2, a1, y02
1103	LFD	a1,  4 * SIZE(AO1)
1104
1105	FMADD	xsum3, xtemp1, a5,  xsum3
1106	NOP1
1107	FMADD	y03, atemp1, a3, y03
1108	NOP2
1109
1110	FMADD	xsum4, xtemp2, a5,  xsum4
1111	NOP1
1112	FMADD	y04, atemp2, a3, y04
1113	NOP2
1114
1115	FMADD1	xsum1, xtemp2, a2,  xsum1
1116	LFD	y05,  4 * SIZE(YY)
1117	FNMSUB	y01, atemp2, a2, y01
1118	NOP2
1119
1120	FMADD2	xsum2, xtemp1, a2,  xsum2
1121	LFD	y06,  5 * SIZE(YY)
1122	FMADD	y02, atemp1, a2, y02
1123	LFD	a2,  5 * SIZE(AO1)
1124
1125	FMADD1	xsum3, xtemp2, a6,  xsum3
1126	LFD	xtemp2,  5 * SIZE(XX)
1127	FNMSUB	y03, atemp2, a4, y03
1128	NOP2
1129
1130	FMADD2	xsum4, xtemp1, a6,  xsum4
1131	LFD	xtemp1,  4 * SIZE(XX)
1132	FMADD	y04, atemp1, a4, y04
1133	NOP2
1134
1135	FMADD	xsum1, xtemp3, a3,  xsum1
1136	LFD	y07,  6 * SIZE(YY)
1137	FMADD	y01, atemp3, a5, y01
1138	NOP2
1139
1140	FMADD	xsum2, xtemp4, a3,  xsum2
1141	LFD	a3,  6 * SIZE(AO1)
1142	FMADD	y02, atemp4, a5, y02
1143	LFD	a5,  4 * SIZE(AO2)
1144
1145	FMADD	xsum3, xtemp3, a7,  xsum3
1146	LFD	y08,  7 * SIZE(YY)
1147	FMADD	y03, atemp3, a7, y03
1148	NOP2
1149
1150	FMADD	xsum4, xtemp4, a7,  xsum4
1151	NOP1
1152	FMADD	y04, atemp4, a7, y04
1153	LFD	a7,  6 * SIZE(AO2)
1154
1155	FMADD1	xsum1, xtemp4, a4,  xsum1
1156	NOP1
1157	FNMSUB	y01, atemp4, a6, y01
1158	NOP2
1159
1160	FMADD2	xsum2, xtemp3, a4,  xsum2
1161	LFD	a4,  7 * SIZE(AO1)
1162	FMADD	y02, atemp3, a6, y02
1163	LFD	a6,  5 * SIZE(AO2)
1164
1165	FMADD1	xsum3, xtemp4, a8,  xsum3
1166	LFD	xtemp4,  7 * SIZE(XX)
1167	FNMSUB	y03, atemp4, a8, y03
1168	NOP2
1169
1170	FMADD2	xsum4, xtemp3, a8,  xsum4
1171	LFD	xtemp3,  6 * SIZE(XX)
1172	FMADD	y04, atemp3, a8, y04
1173	LFD	a8,  7 * SIZE(AO2)
1174
1175	FMADD	xsum1, xtemp1, a1,  xsum1
1176	STFD	y01,  0 * SIZE(YY)
1177	FMADD	y05, atemp1, a1, y05
1178	NOP2
1179
1180	FMADD	xsum2, xtemp2, a1,  xsum2
1181	STFD	y02,  1 * SIZE(YY)
1182	FMADD	y06, atemp2, a1, y06
1183	LFD	a1,  8 * SIZE(AO1)
1184
1185	FMADD	xsum3, xtemp1, a5,  xsum3
1186	STFD	y03,  2 * SIZE(YY)
1187	FMADD	y07, atemp1, a3, y07
1188	NOP2
1189
1190	FMADD	xsum4, xtemp2, a5,  xsum4
1191	STFD	y04,  3 * SIZE(YY)
1192	FMADD	y08, atemp2, a3, y08
1193	NOP2
1194
1195	FMADD1	xsum1, xtemp2, a2,  xsum1
1196	LFD	y01,  8 * SIZE(YY)
1197	FNMSUB	y05, atemp2, a2, y05
1198	NOP2
1199
1200	FMADD2	xsum2, xtemp1, a2,  xsum2
1201	LFD	y02,  9 * SIZE(YY)
1202	FMADD	y06, atemp1, a2, y06
1203	LFD	a2,  9 * SIZE(AO1)
1204
1205	FMADD1	xsum3, xtemp2, a6,  xsum3
1206	LFD	xtemp2,  9 * SIZE(XX)
1207	FNMSUB	y07, atemp2, a4, y07
1208	NOP2
1209
1210	FMADD2	xsum4, xtemp1, a6,  xsum4
1211	LFD	xtemp1,  8 * SIZE(XX)
1212	FMADD	y08, atemp1, a4, y08
1213	NOP2
1214
1215	FMADD	xsum1, xtemp3, a3,  xsum1
1216	LFD	y03, 10 * SIZE(YY)
1217	FMADD	y05, atemp3, a5, y05
1218	NOP2
1219
1220	FMADD	xsum2, xtemp4, a3,  xsum2
1221	LFD	a3, 10 * SIZE(AO1)
1222	FMADD	y06, atemp4, a5, y06
1223	LFD	a5,  8 * SIZE(AO2)
1224
1225	FMADD	xsum3, xtemp3, a7,  xsum3
1226	LFD	y04, 11 * SIZE(YY)
1227	FMADD	y07, atemp3, a7, y07
1228	NOP2
1229
1230	FMADD	xsum4, xtemp4, a7,  xsum4
1231	NOP1
1232	FMADD	y08, atemp4, a7, y08
1233	LFD	a7, 10 * SIZE(AO2)
1234
1235	FMADD1	xsum1, xtemp4, a4,  xsum1
1236	NOP1
1237	FNMSUB	y05, atemp4, a6, y05
1238	NOP2
1239
1240	FMADD2	xsum2, xtemp3, a4,  xsum2
1241	LFD	a4, 11 * SIZE(AO1)
1242	FMADD	y06, atemp3, a6, y06
1243	LFD	a6,  9 * SIZE(AO2)
1244
1245	FMADD1	xsum3, xtemp4, a8,  xsum3
1246	LFD	xtemp4, 11 * SIZE(XX)
1247	FNMSUB	y07, atemp4, a8, y07
1248
1249	FMADD2	xsum4, xtemp3, a8,  xsum4
1250	LFD	xtemp3, 10 * SIZE(XX)
1251	FMADD	y08, atemp3, a8, y08
1252	LFD	a8, 11 * SIZE(AO2)
1253
1254	STFD	y05,  4 * SIZE(YY)
1255	STFD	y06,  5 * SIZE(YY)
1256	STFD	y07,  6 * SIZE(YY)
1257	STFD	y08,  7 * SIZE(YY)
1258
1259	addi	AO1, AO1, 8 * SIZE
1260	addi	AO2, AO2, 8 * SIZE
1261
1262	addi	XX, XX, 8 * SIZE
1263	addi	YY, YY, 8 * SIZE
1264	.align 4
1265
1266LL(16):
1267	andi.	r0,  IS, 2
1268	ble	LL(18)
1269
1270	FMADD	xsum1, xtemp1, a1,  xsum1
1271	FMADD	y01, atemp1, a1, y01
1272	FMADD	xsum2, xtemp2, a1,  xsum2
1273	FMADD	y02, atemp2, a1, y02
1274	FMADD	xsum3, xtemp1, a5,  xsum3
1275	FMADD	y03, atemp1, a3, y03
1276	FMADD	xsum4, xtemp2, a5,  xsum4
1277	FMADD	y04, atemp2, a3, y04
1278
1279	FMADD1	xsum1, xtemp2, a2,  xsum1
1280	FNMSUB	y01, atemp2, a2, y01
1281	FMADD2	xsum2, xtemp1, a2,  xsum2
1282	FMADD	y02, atemp1, a2, y02
1283	FMADD1	xsum3, xtemp2, a6,  xsum3
1284	FNMSUB	y03, atemp2, a4, y03
1285	FMADD2	xsum4, xtemp1, a6,  xsum4
1286	FMADD	y04, atemp1, a4, y04
1287
1288	FMADD	xsum1, xtemp3, a3,  xsum1
1289	FMADD	y01, atemp3, a5, y01
1290	FMADD	xsum2, xtemp4, a3,  xsum2
1291	FMADD	y02, atemp4, a5, y02
1292	FMADD	xsum3, xtemp3, a7,  xsum3
1293	FMADD	y03, atemp3, a7, y03
1294	FMADD	xsum4, xtemp4, a7,  xsum4
1295	FMADD	y04, atemp4, a7, y04
1296
1297	FMADD1	xsum1, xtemp4, a4,  xsum1
1298	FNMSUB	y01, atemp4, a6, y01
1299	FMADD2	xsum2, xtemp3, a4,  xsum2
1300	FMADD	y02, atemp3, a6, y02
1301	FMADD1	xsum3, xtemp4, a8,  xsum3
1302	FNMSUB	y03, atemp4, a8, y03
1303	FMADD2	xsum4, xtemp3, a8,  xsum4
1304	FMADD	y04, atemp3, a8, y04
1305
1306	STFD	y01,  0 * SIZE(YY)
1307	STFD	y02,  1 * SIZE(YY)
1308	STFD	y03,  2 * SIZE(YY)
1309	STFD	y04,  3 * SIZE(YY)
1310
1311	LFD	a1,  4 * SIZE(AO1)
1312	LFD	a2,  5 * SIZE(AO1)
1313
1314	LFD	a5,  4 * SIZE(AO2)
1315	LFD	a6,  5 * SIZE(AO2)
1316	LFD	a7,  6 * SIZE(AO2)
1317	LFD	a8,  7 * SIZE(AO2)
1318
1319	LFD	y01,  4 * SIZE(YY)
1320	LFD	y02,  5 * SIZE(YY)
1321	LFD	y03,  6 * SIZE(YY)
1322	LFD	y04,  7 * SIZE(YY)
1323
1324	addi	YY, YY, 4 * SIZE
1325	.align 4
1326
1327LL(18):
1328	LFD	y05, ALPHA_R
1329	LFD	y06, ALPHA_I
1330
1331	FMUL	xtemp1, y05, xsum1
1332	FMUL	xtemp2, y06, xsum1
1333	FMUL	xtemp3, y05, xsum3
1334	FMUL	xtemp4, y06, xsum3
1335
1336	FNMSUB	xsum1, y06, xsum2, xtemp1
1337	FMADD	xsum2, y05, xsum2, xtemp2
1338	FNMSUB	xsum3, y06, xsum4, xtemp3
1339	FMADD	xsum4, y05, xsum4, xtemp4
1340
1341	FMADD	xsum1, atemp1, a1, xsum1
1342	FMADD	xsum2, atemp2, a1, xsum2
1343	FMADD	xsum3, atemp1, a5, xsum3
1344	FMADD	xsum4, atemp2, a5, xsum4
1345
1346#ifndef HEMV
1347	FMADD1	xsum1, atemp2, a2, xsum1
1348	FMADD2	xsum2, atemp1, a2, xsum2
1349#endif
1350	FMADD1	xsum3, atemp2, a6, xsum3
1351	FMADD2	xsum4, atemp1, a6, xsum4
1352
1353	FMADD	xsum1, atemp3, a5, xsum1
1354	FMADD	xsum2, atemp4, a5, xsum2
1355	FMADD	xsum3, atemp3, a7, xsum3
1356	FMADD	xsum4, atemp4, a7, xsum4
1357
1358	FNMSUB	xsum1, atemp4, a6, xsum1
1359	FMADD	xsum2, atemp3, a6, xsum2
1360#ifndef HEMV
1361	FNMSUB	xsum3, atemp4, a8, xsum3
1362	FMADD	xsum4, atemp3, a8, xsum4
1363#endif
1364
1365	FADD	y01, y01, xsum1
1366	FADD	y02, y02, xsum2
1367	FADD	y03, y03, xsum3
1368	FADD	y04, y04, xsum4
1369
1370	STFD	y01,  0 * SIZE(YY)
1371	addi	TEMP, IS, 4
1372	STFD	y02,  1 * SIZE(YY)
1373	addi	IS,   IS, 2
1374	STFD	y03,  2 * SIZE(YY)
1375	cmpw	cr0, TEMP, M
1376	STFD	y04,  3 * SIZE(YY)
1377	ble	LL(11)
1378	.align 4
1379
/* LL(20): remainder path, taken only when M is odd.  Sets up the
   pointers, the alpha-scaled x value and zeroed accumulators for the
   loop at LL(22), and preloads the first operands. */
1380LL(20):
	/* M & 1 == 0: nothing left to do, go to the copy-back stage. */
1381	andi.	TEMP, M, 1
1382	ble	LL(990)
1383
	/* AO1 walks the matrix data starting at A (A is presumably already
	   positioned by the code before this view -- confirm). */
1384	mr	AO1, A
1385
	/* TEMP = X + (IS << ZBASE_SHIFT): address of entry IS of X. */
1386	slwi	TEMP,  IS,  ZBASE_SHIFT
1387	add	TEMP, X, TEMP
1388
1389	LFD	y05, ALPHA_R
1390	LFD	y06, ALPHA_I
1391
1392	LFD	xtemp1, 0 * SIZE(TEMP)
1393	LFD	xtemp2, 1 * SIZE(TEMP)
1394
	/* (atemp1, atemp2) = alpha * X[IS]  (complex multiply) */
1395	FMUL	atemp1, y05, xtemp1
1396	FMUL	atemp2, y06, xtemp1
1397
1398	FNMSUB	atemp1, y06, xtemp2, atemp1
1399	FMADD	atemp2, y05, xtemp2, atemp2
1400
	/* Start both accumulators from the value stored at FZERO. */
1401	lfd	xsum1, FZERO
1402	fmr	xsum2, xsum1
1403
	/* XX and YY restart from the beginning of X and NEW_Y. */
1404	mr	XX, X
1405	mr	YY, NEW_Y
1406
	/* Preload the first matrix, x and y values used by LL(22) / LL(28). */
1407	LFD	a1,  0 * SIZE(AO1)
1408	LFD	a2,  1 * SIZE(AO1)
1409
1410	LFD	xtemp1,  0 * SIZE(XX)
1411	LFD	xtemp2,  1 * SIZE(XX)
1412
1413	LFD	y01,  0 * SIZE(YY)
1414	LFD	y02,  1 * SIZE(YY)
1415
	/* Loop count = IS; skip the loop entirely when IS <= 0. */
1416	mtspr	CTR, IS
1417	cmpwi	cr0, IS, 0
1418	ble	LL(28)
1419	.align 4
1420
/* LL(22): remainder inner loop, one (re, im) pair per iteration, CTR
   iterations.  Each pass accumulates x * a into (xsum1, xsum2) and
   adds (atemp1, atemp2) * (a1, a2) to the y pair at YY.  Loads for the
   next iteration are interleaved with the arithmetic, so the
   instruction order matters. */
1421LL(22):
1422	FMADD	xsum1, xtemp1, a1,  xsum1
1423	FMADD	y01, atemp1, a1, y01
1424	FMADD	xsum2, xtemp2, a1,  xsum2
1425	FMADD	y02, atemp2, a1, y02
	/* a1 is dead for this iteration: fetch the next one. */
1426	LFD	a1,  2 * SIZE(AO1)
1427
	/* Cross terms with a2.  FMADD1/FMADD2 are macros defined outside
	   this view (presumably sign selection for the HEMV build --
	   confirm).  xtemp2 and xtemp1 are reloaded as soon as their last
	   use has issued. */
1428	FMADD1	xsum1, xtemp2, a2,  xsum1
1429	LFD	xtemp2,  3 * SIZE(XX)
1430	FNMSUB	y01, atemp2, a2, y01
1431	FMADD2	xsum2, xtemp1, a2,  xsum2
1432	LFD	xtemp1,  2 * SIZE(XX)
1433	FMADD	y02, atemp1, a2, y02
1434	LFD	a2,  3 * SIZE(AO1)
1435
	/* Advance all three pointers by one (re, im) pair. */
1436	addi	AO1, AO1, 2 * SIZE
1437	addi	XX, XX, 2 * SIZE
1438	addi	YY, YY, 2 * SIZE
1439
	/* YY has already moved on: store the finished pair at the negative
	   offsets and load the next pair at the new position. */
1440	STFD	y01, -2 * SIZE(YY)
1441	LFD	y01,  0 * SIZE(YY)
1442	STFD	y02, -1 * SIZE(YY)
1443	LFD	y02,  1 * SIZE(YY)
1444	bdnz	LL(22)
1445	.align 4
1446
/* LL(28): final update for the remainder path.  Scales the
   accumulated (xsum1, xsum2) by alpha, adds the product of
   (atemp1, atemp2) with the last loaded (a1, a2), and adds the result
   to the y pair currently held in (y01, y02). */
1447LL(28):
1448	LFD	y05, ALPHA_R
1449	LFD	y06, ALPHA_I
1450
	/* (xsum1, xsum2) = alpha * (xsum1, xsum2)  (complex multiply) */
1451	FMUL	xtemp1, y05, xsum1
1452	FMUL	xtemp2, y06, xsum1
1453
1454	FNMSUB	xsum1, y06, xsum2, xtemp1
1455	FMADD	xsum2, y05, xsum2, xtemp2
1456
	/* Terms with a1 are always added. */
1457	FMADD	xsum1, atemp1, a1, xsum1
1458	FMADD	xsum2, atemp2, a1, xsum2
1459
	/* Terms with a2 are compiled out when HEMV is defined (presumably
	   a2 is the imaginary part of a diagonal entry, which a Hermitian
	   matrix treats as zero -- confirm). */
1460#ifndef HEMV
1461	FNMSUB	xsum1, atemp2, a2, xsum1
1462	FMADD	xsum2, atemp1, a2, xsum2
1463#endif
1464
1465	FADD	y01, y01, xsum1
1466	FADD	y02, y02, xsum2
1467
1468	STFD	y01,  0 * SIZE(YY)
1469	STFD	y02,  1 * SIZE(YY)
1470	.align 4
1471
/* LL(990): decide whether the results in NEW_Y have to be merged into
   the caller-visible Y.  When INCY equals 2 * SIZE the merge stage is
   skipped and control goes straight to the exit at LL(999). */
1472LL(990):
1473	cmpwi	cr0, INCY, 2 * SIZE
1474	beq	LL(999)
1475
	/* YY is the store cursor into Y; Y itself is used as the load
	   cursor by the code that follows. */
1476	mr	YY, Y
1477
	/* CTR = M >> 2 (arithmetic): number of four-element groups.
	   Skip the unrolled loop when that count is <= 0. */
1478	srawi.	r0, M, 2
1479	mtspr	CTR, r0
1480	ble	LL(995)
1481	.align 4
1482
/* LL(991): merge loop, four (re, im) pairs per iteration, CTR
   iterations.  Reads four pairs from Y at stride INCY, eight
   consecutive values from NEW_Y, adds them and writes the sums back
   through YY at stride INCY. */
1483LL(991):
	/* f0..f7 = four pairs from Y; Y advances by INCY after each. */
1484	LFD	f0,  0 * SIZE(Y)
1485	LFD	f1,  1 * SIZE(Y)
1486	add	Y, Y, INCY
1487	LFD	f2,  0 * SIZE(Y)
1488	LFD	f3,  1 * SIZE(Y)
1489	add	Y, Y, INCY
1490	LFD	f4,  0 * SIZE(Y)
1491	LFD	f5,  1 * SIZE(Y)
1492	add	Y, Y, INCY
1493	LFD	f6,  0 * SIZE(Y)
1494	LFD	f7,  1 * SIZE(Y)
1495	add	Y, Y, INCY
1496
	/* f8..f15 = the matching eight values from NEW_Y (contiguous). */
1497	LFD	f8,   0 * SIZE(NEW_Y)
1498	LFD	f9,   1 * SIZE(NEW_Y)
1499	LFD	f10,  2 * SIZE(NEW_Y)
1500	LFD	f11,  3 * SIZE(NEW_Y)
1501	LFD	f12,  4 * SIZE(NEW_Y)
1502	LFD	f13,  5 * SIZE(NEW_Y)
1503	LFD	f14,  6 * SIZE(NEW_Y)
1504	LFD	f15,  7 * SIZE(NEW_Y)
1505	addi	NEW_Y, NEW_Y, 8 * SIZE
1506
	/* Element-wise sum: f8..f15 += f0..f7 */
1507	FADD	f8,  f8,  f0
1508	FADD	f9,  f9,  f1
1509	FADD	f10, f10, f2
1510	FADD	f11, f11, f3
1511	FADD	f12, f12, f4
1512	FADD	f13, f13, f5
1513	FADD	f14, f14, f6
1514	FADD	f15, f15, f7
1515
	/* Write the sums back through YY, advancing by INCY per pair. */
1516	STFD	f8,  0 * SIZE(YY)
1517	STFD	f9,  1 * SIZE(YY)
1518	add	YY, YY, INCY
1519	STFD	f10, 0 * SIZE(YY)
1520	STFD	f11, 1 * SIZE(YY)
1521	add	YY, YY, INCY
1522	STFD	f12, 0 * SIZE(YY)
1523	STFD	f13, 1 * SIZE(YY)
1524	add	YY, YY, INCY
1525	STFD	f14, 0 * SIZE(YY)
1526	STFD	f15, 1 * SIZE(YY)
1527	add	YY, YY, INCY
1528	bdnz	LL(991)
1529	.align 4
1530
/* LL(995): merge tail for two (re, im) pairs, executed only when bit 1
   of M is set.  Same add-and-store scheme as LL(991), for two pairs. */
1531LL(995):
	/* (M & 2) == 0: no two-pair tail, go on to the one-pair tail. */
1532	andi.	J, M, 2
1533	ble	LL(996)
1534
1535	LFD	f0,  0 * SIZE(Y)
1536	LFD	f1,  1 * SIZE(Y)
1537	add	Y, Y, INCY
1538	LFD	f2,  0 * SIZE(Y)
1539	LFD	f3,  1 * SIZE(Y)
1540	add	Y, Y, INCY
1541
1542	LFD	f8,   0 * SIZE(NEW_Y)
1543	LFD	f9,   1 * SIZE(NEW_Y)
1544	LFD	f10,  2 * SIZE(NEW_Y)
1545	LFD	f11,  3 * SIZE(NEW_Y)
1546	addi	NEW_Y, NEW_Y, 4 * SIZE
1547
1548	FADD	f8,  f8,  f0
1549	FADD	f9,  f9,  f1
1550	FADD	f10, f10, f2
1551	FADD	f11, f11, f3
1552
1553	STFD	f8,  0 * SIZE(YY)
1554	STFD	f9,  1 * SIZE(YY)
1555	add	YY, YY, INCY
1556	STFD	f10, 0 * SIZE(YY)
1557	STFD	f11, 1 * SIZE(YY)
1558	add	YY, YY, INCY
1559	.align 4
1560
/* LL(996): merge tail for the last (re, im) pair, executed only when
   M is odd.  The pointers are not advanced afterwards because nothing
   reads them again before the exit. */
1561LL(996):
	/* (M & 1) == 0: nothing left, go to the exit. */
1562	andi.	J, M, 1
1563	ble	LL(999)
1564
1565	LFD	f0,  0 * SIZE(Y)
1566	LFD	f1,  1 * SIZE(Y)
1567
1568	LFD	f8,   0 * SIZE(NEW_Y)
1569	LFD	f9,   1 * SIZE(NEW_Y)
1570
1571	FADD	f8,  f8,  f0
1572	FADD	f9,  f9,  f1
1573
1574	STFD	f8,  0 * SIZE(YY)
1575	STFD	f9,  1 * SIZE(YY)
1576	.align 4
1577
/* LL(999): common exit.  Sets r3 to 0, reloads f14..f31 and r14..r27
   from the stack frame, releases the frame and returns.  The matching
   stores are presumably in the prologue, which is outside this view. */
1578LL(999):
	/* r3 = 0 (r3 is the integer return register in the PowerPC ABIs). */
1579	li	r3, 0
1580
	/* Floating-point registers f14..f31: 8 bytes each at SP+0..SP+136. */
1581	lfd	f14,     0(SP)
1582	lfd	f15,     8(SP)
1583	lfd	f16,    16(SP)
1584	lfd	f17,    24(SP)
1585	lfd	f18,    32(SP)
1586	lfd	f19,    40(SP)
1587	lfd	f20,    48(SP)
1588	lfd	f21,    56(SP)
1589	lfd	f22,    64(SP)
1590	lfd	f23,    72(SP)
1591	lfd	f24,    80(SP)
1592	lfd	f25,    88(SP)
1593	lfd	f26,    96(SP)
1594	lfd	f27,   104(SP)
1595	lfd	f28,   112(SP)
1596	lfd	f29,   120(SP)
1597	lfd	f30,   128(SP)
1598	lfd	f31,   136(SP)
1599
	/* General registers r14..r27 start at SP+144: 8-byte slots loaded
	   with ld in the 64-bit build, 4-byte slots loaded with lwz
	   otherwise. */
1600#ifdef __64BIT__
1601	ld	r14,   144(SP)
1602	ld	r15,   152(SP)
1603	ld	r16,   160(SP)
1604	ld	r17,   168(SP)
1605	ld	r18,   176(SP)
1606	ld	r19,   184(SP)
1607	ld	r20,   192(SP)
1608	ld	r21,   200(SP)
1609	ld	r22,   208(SP)
1610	ld	r23,   216(SP)
1611	ld	r24,   224(SP)
1612	ld	r25,   232(SP)
1613	ld	r26,   240(SP)
1614	ld	r27,   248(SP)
1615#else
1616	lwz	r14,   144(SP)
1617	lwz	r15,   148(SP)
1618	lwz	r16,   152(SP)
1619	lwz	r17,   156(SP)
1620	lwz	r18,   160(SP)
1621	lwz	r19,   164(SP)
1622	lwz	r20,   168(SP)
1623	lwz	r21,   172(SP)
1624	lwz	r22,   176(SP)
1625	lwz	r23,   180(SP)
1626	lwz	r24,   184(SP)
1627	lwz	r25,   188(SP)
1628	lwz	r26,   192(SP)
1629	lwz	r27,   196(SP)
1630#endif
1631
	/* Pop the STACKSIZE-byte frame and return to the caller. */
1632	addi	SP, SP, STACKSIZE
1633	blr
1634
	/* EPILOGUE is a macro defined outside this view. */
1635	EPILOGUE
1636#endif
1637