/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* Assembler-mode build switch, then the shared project header.  The
   PROLOGUE/EPILOGUE/PROFCODE, LL(), LFD/STFD, FMADD/FMUL/FADD, DCBT,
   SIZE, BASE_SHIFT and SP names used in this file are not defined here
   and are presumably supplied by common.h. */
#define ASSEMBLER
#include "common.h"

/* Integer argument registers when building for Linux.  BUFFER has no
   argument register: the prologue loads it from the caller frame into
   r14 (and, in the 64-bit layout, reloads INCY from the frame too). */
#ifdef linux
#ifndef __64BIT__
#define M	r3
#define IS	r4
#define A	r5
#define LDA	r6
#define X	r7
#define	INCX	r8
#define	Y	r9
#define	INCY	r10
#define BUFFER	r14
#else
#define M	r3
#define IS	r4
#define A	r6
#define LDA	r7
#define X	r8
#define	INCX	r9
#define	Y	r10
#define	INCY	r5
#define BUFFER	r14
#endif
#endif

/* Integer argument registers when building for AIX or Apple.  In the
   32-bit DOUBLE layout Y, INCY and BUFFER are all fetched from the
   caller frame by the prologue. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define IS	r4
#define A	r7
#define LDA	r8
#define X	r9
#define	INCX	r10
#define	Y	r5
#define	INCY	r6
#define BUFFER	r14
#else
#define M	r3
#define IS	r4
#define A	r6
#define LDA	r7
#define X	r8
#define	INCX	r9
#define	Y	r10
#define	INCY	r5
#define BUFFER	r14
#endif
#endif

/* Scratch integer registers (I and J are volatile r11/r12). */
#define I	r11
#define	J	r12

/* Working pointers.  r15-r24 are saved and restored by the prologue and
   epilogue of the routine below. */
#define AO1	r15
#define AO2	r16
#define AO3	r17
#define AO4	r18
#define XX	r19
#define YY	r20
#define	NEW_Y	r21
#define TEMP	r22
#define	PREA	r24

/* Four y elements being updated in the current row group. */
#define y01 f0
#define y02 f1
#define y03 f2
#define y04 f3

/* alpha * x[IS .. IS+3] for the panels being processed. */
#define atemp1 f4
#define atemp2 f5
#define atemp3 f6
#define atemp4 f7

/* Four x elements of the current row group. */
#define xtemp1 f8
#define xtemp2 f9
#define xtemp3 f10
#define xtemp4 f11

/* Per-panel dot-product accumulators. */
#define xsum1  f12
#define xsum2  f13
#define xsum3  f14
#define xsum4  f15

/* Matrix elements: a1-a4 come from AO1, a5-a8 from AO2, a9-a12 from AO3
   and a13-a16 from AO4.  f14-f31 are saved/restored by the routine. */
#define a1     f16
#define a2     f17
#define a3     f18
#define a4     f19
#define a5     f20
#define a6     f21
#define a7     f22
#define a8     f23
#define a9     f24
#define a10    f25
#define a11    f26
#define a12    f27
#define a13    f28
#define a14    f29
#define a15    f30
#define a16    f31

/* alpha is received in f1, which is also y02; the prologue spills it to
   the ALPHA stack slot before f1 is reused. */
#define alpha  f1

/* Prefetch distance in elements, per CPU target.  It is multiplied by
   SIZE when loaded into PREA and is only used as a cache-touch offset. */
#if defined(PPCG4)
#define PREFETCHSIZE_A  24
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  64
#endif

#ifdef CELL
#define PREFETCHSIZE_A  72
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  96
#endif

#ifdef POWER6
#define PREFETCHSIZE_A  40
#endif

/* Fallback so the file still assembles when none of the CPU macros above
   is defined.  The value only affects cache-touch hints, not results;
   NOTE(review): 24 is an untuned default. */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A  24
#endif

/* Filler slots used inside the unrolled loops: empty on POWER4/5/6 and
   PPC970, otherwise a register-to-itself move that changes no state. */
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else
#define NOP1   mr	LDA, LDA
#define NOP2   mr	INCX, INCX
#endif

/* Everything from here to the matching #endif at the end of the file is
   skipped when NEEDPARAM is defined. */
#ifndef NEEDPARAM

/* Stack frame (byte offsets from SP once the frame is allocated):
     0 .. 143   saved f14-f31
     144 ..     saved r14-r27 (4 bytes each in 32-bit, 8 bytes in 64-bit)
     ALPHA      spilled copy of alpha
     FZERO      8 zero bytes, reloaded with lfd as floating-point zero */
#ifndef __64BIT__
#define STACKSIZE 224
#define ALPHA     200(SP)
#define	FZERO	  208(SP)
#else
#define STACKSIZE 280
#define ALPHA     256(SP)
#define FZERO	  264(SP)
#endif

	PROLOGUE
	PROFCODE

	/* Allocate the frame; r0 = 0 is the bit pattern stored to FZERO. */
	addi	SP,   SP, -STACKSIZE
	li	r0,   0

	/* Save the floating-point registers this routine overwrites. */
	stfd	f14,     0(SP)
	stfd	f15,     8(SP)
	stfd	f16,    16(SP)
	stfd	f17,    24(SP)
	stfd	f18,    32(SP)
	stfd	f19,    40(SP)
	stfd	f20,    48(SP)
	stfd	f21,    56(SP)
	stfd	f22,    64(SP)
	stfd	f23,    72(SP)
	stfd	f24,    80(SP)
	stfd	f25,    88(SP)
	stfd	f26,    96(SP)
	stfd	f27,   104(SP)
	stfd	f28,   112(SP)
	stfd	f29,   120(SP)
	stfd	f30,   128(SP)
	stfd	f31,   136(SP)

	/* Zero the FZERO slot and save r14-r27. */
#ifdef __64BIT__
	std	r0,    FZERO
	std	r14,   144(SP)
	std	r15,   152(SP)
	std	r16,   160(SP)
	std	r17,   168(SP)
	std	r18,   176(SP)
	std	r19,   184(SP)
	std	r20,   192(SP)
	std	r21,   200(SP)
	std	r22,   208(SP)
	std	r23,   216(SP)
	std	r24,   224(SP)
	std	r25,   232(SP)
	std	r26,   240(SP)
	std	r27,   248(SP)
#else
	stw	r0,    0 + FZERO
	stw	r0,    4 + FZERO
	stw	r14,   144(SP)
	stw	r15,   148(SP)
	stw	r16,   152(SP)
	stw	r17,   156(SP)
	stw	r18,   160(SP)
	stw	r19,   164(SP)
	stw	r20,   168(SP)
	stw	r21,   172(SP)
	stw	r22,   176(SP)
	stw	r23,   180(SP)
	stw	r24,   184(SP)
	stw	r25,   188(SP)
	stw	r26,   192(SP)
	stw	r27,   196(SP)
#endif

	/* Fetch the arguments that did not fit in registers from the caller
	   frame (hence the + STACKSIZE).
	   NOTE(review): these offsets are ABI-specific; verify them against
	   the target ABI before porting. */
#ifdef linux
#ifndef __64BIT__
	lwz	BUFFER,   56 + STACKSIZE(SP)
#else
	ld	INCY,    112 + STACKSIZE(SP)
	ld	BUFFER,  120 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	Y,       56 + STACKSIZE(SP)
	lwz	INCY,    60 + STACKSIZE(SP)
	lwz	BUFFER,  64 + STACKSIZE(SP)
#else
	lwz	  INCY,  56 + STACKSIZE(SP)
	lwz	BUFFER,  60 + STACKSIZE(SP)
#endif
#else
	ld	INCY,    112 + STACKSIZE(SP)
	ld	BUFFER,  120 + STACKSIZE(SP)
#endif
#endif

	/* Spill alpha: its register (f1) is reused as y02 later on. */
	STFD	alpha, ALPHA

	/* Convert LDA and both increments from elements to bytes. */
	slwi	LDA,  LDA,  BASE_SHIFT
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	/* PREA = prefetch distance in bytes; IS becomes M - IS, the index
	   of the first panel to process. */
	li	PREA, PREFETCHSIZE_A * SIZE
	sub	IS, M, IS

	/* Nothing to do when M <= 0. */
	cmpwi	cr0, M, 0
	ble-	LL(999)

	/* Advance A by IS panels (IS * LDA bytes). */
	mullw	TEMP, IS, LDA
	add	A, A, TEMP

290	cmpwi	cr0, INCX, SIZE
291	beq	LL(05)
292
293	mr	XX, X
294	mr	X, BUFFER
295
296	srawi.	r0, M, 3
297	mtspr	CTR, r0
298	ble	LL(03)
299	.align 4
300
301LL(01):
302	LFD	a1, 0 * SIZE(XX)
303	add	XX, XX, INCX
304	LFD	a2, 0 * SIZE(XX)
305	add	XX, XX, INCX
306	LFD	a3, 0 * SIZE(XX)
307	add	XX, XX, INCX
308	LFD	a4, 0 * SIZE(XX)
309	add	XX, XX, INCX
310	LFD	a5, 0 * SIZE(XX)
311	add	XX, XX, INCX
312	LFD	a6, 0 * SIZE(XX)
313	add	XX, XX, INCX
314	LFD	a7, 0 * SIZE(XX)
315	add	XX, XX, INCX
316	LFD	a8, 0 * SIZE(XX)
317	add	XX, XX, INCX
318
319	dcbt	XX, PREA
320	dcbtst	BUFFER, PREA
321
322	STFD	a1, 0 * SIZE(BUFFER)
323	STFD	a2, 1 * SIZE(BUFFER)
324	STFD	a3, 2 * SIZE(BUFFER)
325	STFD	a4, 3 * SIZE(BUFFER)
326	STFD	a5, 4 * SIZE(BUFFER)
327	STFD	a6, 5 * SIZE(BUFFER)
328	STFD	a7, 6 * SIZE(BUFFER)
329	STFD	a8, 7 * SIZE(BUFFER)
330
331	addi	BUFFER, BUFFER, 8 * SIZE
332	bdnz	LL(01)
333	.align 4
334
335LL(03):
336	andi.	r0, M, 7
337	mtspr	CTR, r0
338	ble	LL(05)
339	.align 4
340
341LL(04):
342	LFD	a1, 0 * SIZE(XX)
343	add	XX, XX, INCX
344
345	STFD	a1, 0 * SIZE(BUFFER)
346	addi	BUFFER, BUFFER, 1 * SIZE
347	bdnz	LL(04)
348	.align 4
349
/* Choose where y results are accumulated.  With a byte stride equal to
   SIZE the caller's y is updated in place (NEW_Y = Y).  Otherwise NEW_Y
   points into BUFFER (after any packed x), which is cleared here and
   added into the strided y at LL(990). */
LL(05):
	mr	NEW_Y, Y
	lfd	f0, FZERO

	cmpwi	cr0, INCY, SIZE
	beq	LL(10)

	mr	NEW_Y, BUFFER

	/* CTR = ceil(M / 8): the clear runs in whole groups of eight, so up
	   to seven elements beyond M are also written. */
	addi	r0, M,  7
	srawi.	r0, r0, 3
	mtspr	CTR, r0
	.align 4

LL(06):
	STFD	f0, 0 * SIZE(BUFFER)
	STFD	f0, 1 * SIZE(BUFFER)
	STFD	f0, 2 * SIZE(BUFFER)
	STFD	f0, 3 * SIZE(BUFFER)
	STFD	f0, 4 * SIZE(BUFFER)
	STFD	f0, 5 * SIZE(BUFFER)
	STFD	f0, 6 * SIZE(BUFFER)
	STFD	f0, 7 * SIZE(BUFFER)
	addi	BUFFER, BUFFER, 8 * SIZE
	bdnz	LL(06)
	.align 4

/* Main pass: handle four panels (AO1..AO4, LDA bytes apart) at a time
   while IS + 4 <= M; fewer remaining panels go to LL(20). */
LL(10):
	addi	TEMP, IS, 4
	cmpw	cr0, TEMP, M
	bgt	LL(20)
	.align 4

LL(11):
	/* Panel base pointers; A is advanced past the four panels. */
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	/* TEMP = address of x[IS]. */
	slwi	TEMP,  IS,  BASE_SHIFT
	add	TEMP, X, TEMP

	/* a16 briefly holds alpha; it is reloaded with a matrix element
	   once the atemp scaling below is done. */
	LFD	a16, ALPHA
	lfd	xsum1, FZERO

	LFD	atemp1, 0 * SIZE(TEMP)
	LFD	atemp2, 1 * SIZE(TEMP)
	LFD	atemp3, 2 * SIZE(TEMP)
	LFD	atemp4, 3 * SIZE(TEMP)

	/* Preload the first four rows of x and y. */
	LFD	xtemp1,  0 * SIZE(X)
	LFD	xtemp2,  1 * SIZE(X)
	LFD	xtemp3,  2 * SIZE(X)
	LFD	xtemp4,  3 * SIZE(X)

	LFD	y01,  0 * SIZE(NEW_Y)
	LFD	y02,  1 * SIZE(NEW_Y)
	LFD	y03,  2 * SIZE(NEW_Y)
	LFD	y04,  3 * SIZE(NEW_Y)

	/* Preload rows 0-3 of panel 1 while forming atempK = alpha*x[IS+K-1]. */
	LFD	a1,  0 * SIZE(AO1)
	FMUL	atemp1, a16, atemp1
	LFD	a2,  1 * SIZE(AO1)
	FMUL	atemp2, a16, atemp2
	LFD	a3,  2 * SIZE(AO1)
	FMUL	atemp3, a16, atemp3
	LFD	a4,  3 * SIZE(AO1)
	FMUL	atemp4, a16, atemp4

	/* Preload panel 2 and clear the other three accumulators. */
	LFD	a5,  0 * SIZE(AO2)
	fmr	xsum2, xsum1
	LFD	a6,  1 * SIZE(AO2)
	fmr	xsum3, xsum1
	LFD	a7,  2 * SIZE(AO2)
	fmr	xsum4, xsum1
	LFD	a8,  3 * SIZE(AO2)

	LFD	a9,  0 * SIZE(AO3)
	LFD	a10, 1 * SIZE(AO3)
	LFD	a11, 2 * SIZE(AO3)
	LFD	a12, 3 * SIZE(AO3)

	LFD	a13, 0 * SIZE(AO4)
	LFD	a14, 1 * SIZE(AO4)
	LFD	a15, 2 * SIZE(AO4)
	LFD	a16, 3 * SIZE(AO4)

	/* Row cursors for x and y. */
	mr	XX, X
	mr	YY, NEW_Y

	/* CTR = IS / 16 iterations of the 16-row loop; skip it when zero. */
	srawi.	r0,  IS, 4
	mtspr	CTR, r0
	ble	LL(14)
	.align 4

/* 16-row unrolled loop over the rows above the diagonal block.
   Each four-row step does, for rows r..r+3 and panels 1..4:
     xsumK  += x[r+j] * panelK[r+j]        (dot product down the panel)
     y[r+j] += atempK * panelK[r+j]        (scaled update of y)
   Loads for the next step are interleaved with the arithmetic, so on
   entry and exit a1..a16, xtemp1..4 and y01..y04 already hold the next
   four rows. */
LL(12):
	/* Rows +0 .. +3. */
	FMADD	xsum1, xtemp1, a1,  xsum1
	DCBT(AO1, PREA)
	FMADD	y01, atemp1, a1,  y01
	LFD	a1,  4 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1,  4 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	NOP2

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2,  5 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5,  4 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	NOP1
	FMADD	y02, atemp2, a6,  y02
	LFD	a6,  5 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7,  y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2,  5 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	/* DCBT(X, PREX) -- disabled; PREX is not defined in this file */
	NOP2

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3,  6 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9,  4 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7,  6 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 5 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 6 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3,  6 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4,  7 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 4 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8,  7 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 5 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 7 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 6 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4,  7 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 7 * SIZE(AO4)

	/* Write back rows +0..+3 of y and fetch rows +4..+7. */
	STFD	y01,  0 * SIZE(YY)
	LFD	y01,  4 * SIZE(YY)
	STFD	y02,  1 * SIZE(YY)
	LFD	y02,  5 * SIZE(YY)

	STFD	y03,  2 * SIZE(YY)
	LFD	y03,  6 * SIZE(YY)
	STFD	y04,  3 * SIZE(YY)
	LFD	y04,  7 * SIZE(YY)

	/* Rows +4 .. +7. */
	FMADD	xsum1, xtemp1, a1,  xsum1
	DCBT(AO2, PREA)
	FMADD	y01, atemp1, a1,  y01
	LFD	a1,  8 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1,  8 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	NOP2

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2,  9 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5,  8 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	NOP1
	FMADD	y02, atemp2, a6,  y02
	LFD	a6,  9 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7,  y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2,  9 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	NOP2

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9,  8 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7, 10 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 9 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 10 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 8 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8, 11 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 9 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 11 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 10 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 11 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 11 * SIZE(AO4)

	/* Write back rows +4..+7 of y and fetch rows +8..+11. */
	STFD	y01,  4 * SIZE(YY)
	LFD	y01,  8 * SIZE(YY)
	STFD	y02,  5 * SIZE(YY)
	LFD	y02,  9 * SIZE(YY)

	STFD	y03,  6 * SIZE(YY)
	LFD	y03, 10 * SIZE(YY)
	STFD	y04,  7 * SIZE(YY)
	LFD	y04, 11 * SIZE(YY)


	/* Rows +8 .. +11. */
	FMADD	xsum1, xtemp1, a1,  xsum1
	DCBT(AO3, PREA)
	FMADD	y01, atemp1, a1,  y01
	LFD	a1, 12 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 12 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	NOP2

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2, 13 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5, 12 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	NOP1
	FMADD	y02, atemp2, a6,  y02
	LFD	a6, 13 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7,  y03
	/* DCBT(Y1, PREY) -- disabled; Y1 and PREY are not defined in this file */
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 13 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	NOP2

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3, 14 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9, 12 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7, 14 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10,13 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 14 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 14 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4, 15 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13,12 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8, 15 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 13 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 15 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 14 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4,  15 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 15 * SIZE(AO4)

	/* Write back rows +8..+11 of y and fetch rows +12..+15. */
	STFD	y01,  8 * SIZE(YY)
	LFD	y01, 12 * SIZE(YY)
	STFD	y02,  9 * SIZE(YY)
	LFD	y02, 13 * SIZE(YY)

	STFD	y03, 10 * SIZE(YY)
	LFD	y03, 14 * SIZE(YY)
	STFD	y04, 11 * SIZE(YY)
	LFD	y04, 15 * SIZE(YY)

	/* Rows +12 .. +15.  YY, AO1..AO4 and XX are advanced by 16 elements
	   part-way through this step, so loads issued after each addi use
	   offsets relative to the new base (for example 2 * SIZE(AO1) is
	   row +18 of the old base). */
	FMADD	xsum1, xtemp1, a1,  xsum1
	DCBT(AO4, PREA)
	FMADD	y01, atemp1, a1,  y01
	LFD	a1, 16 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 16 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	addi	YY, YY, 16 * SIZE

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2, 17 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5, 16 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	addi	AO3, AO3, 16 * SIZE
	FMADD	y02, atemp2, a6,  y02
	LFD	a6, 17 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	addi	AO1, AO1, 16 * SIZE
	FMADD	y03, atemp2, a7,  y03
	addi	AO2, AO2, 16 * SIZE

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 17 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	addi	AO4, AO4, 16 * SIZE

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3,  2 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9,  0 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7,  2 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10,  1 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11,  2 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 18 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	addi	XX, XX, 16 * SIZE

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4,  3 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13,  0 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8,  3 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14,  1 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12,  3 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15,  2 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4,  3 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16,  3 * SIZE(AO4)

	/* YY has already moved on: write back old rows +12..+15 at negative
	   offsets and fetch the first four rows of the next iteration. */
	STFD	y01, -4 * SIZE(YY)
	LFD	y01,  0 * SIZE(YY)
	STFD	y02, -3 * SIZE(YY)
	LFD	y02,  1 * SIZE(YY)

	STFD	y03, -2 * SIZE(YY)
	LFD	y03,  2 * SIZE(YY)
	STFD	y04, -1 * SIZE(YY)
	LFD	y04,  3 * SIZE(YY)
	bdnz	LL(12)
	.align 4

/* Eight-row tail: taken when bit 3 of IS is set.  It is the same
   per-row arithmetic as the 16-row loop (two four-row steps) but without
   the cache-touch hints, and the pointers are advanced by 8 elements at
   the end. */
LL(14):
	andi.	r0,  IS, 8
	ble	LL(15)

	/* Rows +0 .. +3. */
	FMADD	xsum1, xtemp1, a1,  xsum1
	NOP1
	FMADD	y01, atemp1, a1,  y01
	LFD	a1,  4 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1,  4 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	NOP2

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2,  5 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5,  4 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	NOP1
	FMADD	y02, atemp2, a6,  y02
	LFD	a6,  5 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7,  y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2,  5 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	NOP2

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3,  6 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9,  4 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7,  6 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 5 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 6 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3,  6 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4,  7 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 4 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8,  7 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 5 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 7 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 6 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4,  7 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 7 * SIZE(AO4)

	/* Write back rows +0..+3 of y and fetch rows +4..+7. */
	STFD	y01,  0 * SIZE(YY)
	LFD	y01,  4 * SIZE(YY)
	STFD	y02,  1 * SIZE(YY)
	LFD	y02,  5 * SIZE(YY)

	STFD	y03,  2 * SIZE(YY)
	LFD	y03,  6 * SIZE(YY)
	STFD	y04,  3 * SIZE(YY)
	LFD	y04,  7 * SIZE(YY)

	/* Rows +4 .. +7. */
	FMADD	xsum1, xtemp1, a1,  xsum1
	NOP1
	FMADD	y01, atemp1, a1,  y01
	LFD	a1,  8 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1,  8 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	NOP2

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2,  9 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5,  8 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	NOP1
	FMADD	y02, atemp2, a6,  y02
	LFD	a6,  9 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7,  y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2,  9 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	NOP2

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9,  8 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7, 10 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 9 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 10 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 8 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8, 11 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 9 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 11 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 10 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 11 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 11 * SIZE(AO4)

	/* Step the panel pointers past the eight rows just consumed. */
	addi	AO1, AO1, 8 * SIZE
	addi	AO2, AO2, 8 * SIZE
	addi	AO3, AO3, 8 * SIZE
	addi	AO4, AO4, 8 * SIZE

	/* Write back rows +4..+7 of y and fetch the next four rows. */
	STFD	y01,  4 * SIZE(YY)
	LFD	y01,  8 * SIZE(YY)
	STFD	y02,  5 * SIZE(YY)
	LFD	y02,  9 * SIZE(YY)

	STFD	y03,  6 * SIZE(YY)
	LFD	y03, 10 * SIZE(YY)
	STFD	y04,  7 * SIZE(YY)
	LFD	y04, 11 * SIZE(YY)

	addi	XX, XX, 8 * SIZE
	addi	YY, YY, 8 * SIZE
	.align 4

/* Four-row tail: taken when bit 2 of IS is set.  One four-row step of
   the same arithmetic as the unrolled loop; the preloads leave a1..a16
   holding the four rows that follow. */
LL(15):
	andi.	r0,  IS, 4
	ble	LL(18)

	FMADD	xsum1, xtemp1, a1,  xsum1
	NOP1
	FMADD	y01, atemp1, a1,  y01
	LFD	a1,  4 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5,  xsum2
	NOP1
	FMADD	y02, atemp1, a2,  y02
	NOP2

	FMADD	xsum3, xtemp1, a9,  xsum3
	NOP1
	FMADD	y03, atemp1, a3,  y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1,  4 * SIZE(XX)
	FMADD	y04, atemp1, a4,  y04
	NOP2

	FMADD	xsum1, xtemp2, a2,  xsum1
	LFD	a2,  5 * SIZE(AO1)
	FMADD	y01, atemp2, a5,  y01
	LFD	a5,  4 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6,  xsum2
	NOP1
	FMADD	y02, atemp2, a6,  y02
	LFD	a6,  5 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7,  y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2,  5 * SIZE(XX)
	FMADD	y04, atemp2, a8,  y04
	NOP2

	FMADD	xsum1, xtemp3, a3,  xsum1
	LFD	a3,  6 * SIZE(AO1)
	FMADD	y01, atemp3, a9,  y01
	LFD	a9,  4 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7,  xsum2
	LFD	a7,  6 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 5 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 6 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3,  6 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4,  xsum1
	LFD	a4,  7 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 4 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8,  xsum2
	LFD	a8,  7 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 5 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 7 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 6 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4,  7 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 7 * SIZE(AO4)

	/* Step the panel pointers past the four rows just consumed. */
	addi	AO1, AO1, 4 * SIZE
	addi	AO2, AO2, 4 * SIZE
	addi	AO3, AO3, 4 * SIZE
	addi	AO4, AO4, 4 * SIZE

	/* Write back this row group of y and fetch the next four rows. */
	STFD	y01,  0 * SIZE(YY)
	LFD	y01,  4 * SIZE(YY)
	STFD	y02,  1 * SIZE(YY)
	LFD	y02,  5 * SIZE(YY)

	STFD	y03,  2 * SIZE(YY)
	LFD	y03,  6 * SIZE(YY)
	STFD	y04,  3 * SIZE(YY)
	LFD	y04,  7 * SIZE(YY)

	addi	XX, XX, 4 * SIZE
	addi	YY, YY, 4 * SIZE
	.align 4

/* Diagonal 4x4 block.  The preloads have left a1..a16 holding the four
   rows at the current row cursor of the four panels, and y01..y04 the
   matching y elements.  Only a1, a5, a6, a9, a10, a11 and a13..a16 are
   read; each off-diagonal one is used twice, so the block is treated as
   symmetric and a2-a4, a7, a8 and a12 are never used here. */
LL(18):
	LFD	xtemp1, ALPHA

	/* Scale the accumulated dot products by alpha. */
	FMUL	xsum1, xtemp1, xsum1
	FMUL	xsum2, xtemp1, xsum2
	FMUL	xsum3, xtemp1, xsum3
	FMUL	xsum4, xtemp1, xsum4

	/* Add the diagonal-block products; atemp1..4 already include alpha. */
	FMADD	xsum1, atemp1, a1,  xsum1
	FMADD	xsum2, atemp1, a5,  xsum2
	FMADD	xsum3, atemp1, a9,  xsum3
	FMADD	xsum4, atemp1, a13, xsum4

	FMADD	xsum1, atemp2, a5,  xsum1
	FMADD	xsum2, atemp2, a6,  xsum2
	FMADD	xsum3, atemp2, a10, xsum3
	FMADD	xsum4, atemp2, a14, xsum4

	FMADD	xsum1, atemp3, a9,  xsum1
	FMADD	xsum2, atemp3, a10, xsum2
	FMADD	xsum3, atemp3, a11, xsum3
	FMADD	xsum4, atemp3, a15, xsum4

	FMADD	xsum1, atemp4, a13, xsum1
	FMADD	xsum2, atemp4, a14, xsum2
	FMADD	xsum3, atemp4, a15, xsum3
	FMADD	xsum4, atemp4, a16, xsum4

	/* Fold the results into the four y elements at YY. */
	FADD	y01, y01, xsum1
	FADD	y02, y02, xsum2
	FADD	y03, y03, xsum3
	FADD	y04, y04, xsum4

	STFD	y01,  0 * SIZE(YY)
	STFD	y02,  1 * SIZE(YY)
	STFD	y03,  2 * SIZE(YY)
	STFD	y04,  3 * SIZE(YY)

	/* IS += 4; repeat while another full group of four panels fits
	   (new IS + 4 <= M). */
	addi	TEMP, IS, 8
	addi	IS,   IS, 4
	cmpw	cr0, TEMP, M
	ble	LL(11)
	.align 4

/* Two-panel remainder, taken when bit 1 of M is set.
   NOTE(review): the test looks at M rather than M - IS, so it presumably
   relies on the starting IS being a multiple of 4 -- confirm with the
   callers. */
LL(20):
	andi.	TEMP, M, 2
	ble	LL(30)

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	/* TEMP = address of x[IS]. */
	slwi	TEMP,  IS,  BASE_SHIFT
	add	TEMP, X, TEMP

	LFD	atemp1, 0 * SIZE(TEMP)
	LFD	atemp2, 1 * SIZE(TEMP)

	/* a1 briefly holds alpha; atempK = alpha * x[IS+K-1]. */
	LFD	a1, ALPHA

	FMUL	atemp1, a1, atemp1
	FMUL	atemp2, a1, atemp2

	lfd	xsum1, FZERO
	fmr	xsum2, xsum1

	mr	XX, X
	mr	YY, NEW_Y

	/* Preload the first two rows. */
	LFD	xtemp1,  0 * SIZE(XX)
	LFD	xtemp2,  1 * SIZE(XX)

	LFD	y01,  0 * SIZE(YY)
	LFD	y02,  1 * SIZE(YY)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)

	LFD	a5,  0 * SIZE(AO2)
	LFD	a6,  1 * SIZE(AO2)

	/* CTR = IS / 2 two-row steps above the diagonal block. */
	srawi.	r0,  IS, 1
	mtspr	CTR, r0
	ble	LL(28)
	.align 4

LL(22):
	/* Dot products down both panels. */
	FMADD	xsum1, xtemp1, a1,  xsum1
	FMADD	xsum2, xtemp1, a5,  xsum2

	FMADD	xsum1, xtemp2, a2,  xsum1
	FMADD	xsum2, xtemp2, a6,  xsum2

	/* Scaled update of the two y rows. */
	FMADD	y01, atemp1, a1,  y01
	FMADD	y02, atemp1, a2,  y02
	FMADD	y01, atemp2, a5,  y01
	FMADD	y02, atemp2, a6,  y02

	/* Fetch the next two rows. */
	LFD	xtemp1,  2 * SIZE(XX)
	LFD	xtemp2,  3 * SIZE(XX)

	LFD	a1,  2 * SIZE(AO1)
	LFD	a2,  3 * SIZE(AO1)

	LFD	a5,  2 * SIZE(AO2)
	LFD	a6,  3 * SIZE(AO2)

	STFD	y01,  0 * SIZE(YY)
	STFD	y02,  1 * SIZE(YY)

	LFD	y01,  2 * SIZE(YY)
	LFD	y02,  3 * SIZE(YY)

	addi	AO1, AO1, 2 * SIZE
	addi	AO2, AO2, 2 * SIZE

	addi	XX, XX, 2 * SIZE
	addi	YY, YY, 2 * SIZE

	bdnz	LL(22)
	.align 4

LL(28):
	/* Diagonal 2x2 block: a1, a5 and a6 are read, with a5 used for both
	   off-diagonal positions; a2 is not used here. */
	LFD	xtemp1, ALPHA

	FMUL	xsum1, xtemp1, xsum1
	FMUL	xsum2, xtemp1, xsum2

	FMADD	xsum1, atemp1, a1,  xsum1
	FMADD	xsum2, atemp1, a5,  xsum2
	FMADD	xsum1, atemp2, a5,  xsum1
	FMADD	xsum2, atemp2, a6,  xsum2

	FADD	y01, y01, xsum1
	FADD	y02, y02, xsum2

	STFD	y01,  0 * SIZE(YY)
	STFD	y02,  1 * SIZE(YY)

	addi	IS, IS, 2
	.align 4

/* Single-panel remainder, taken when bit 0 of M is set. */
LL(30):
	andi.	TEMP, M, 1
	ble	LL(990)

	mr	AO1, A

	/* TEMP = address of x[IS]. */
	slwi	TEMP,  IS,  BASE_SHIFT
	add	TEMP, X, TEMP

	LFD	atemp1, 0 * SIZE(TEMP)

	/* a1 briefly holds alpha; atemp1 = alpha * x[IS]. */
	LFD	a1, ALPHA

	FMUL	atemp1, a1, atemp1

	lfd	xsum1, FZERO

	mr	XX, X
	mr	YY, NEW_Y

	/* Preload the first row. */
	LFD	xtemp1,  0 * SIZE(XX)
	LFD	y01,  0 * SIZE(YY)

	LFD	a1,  0 * SIZE(AO1)

	/* One iteration per row above the diagonal; skipped when IS <= 0. */
	mtspr	CTR, IS
	cmpwi	cr0, IS, 0
	ble	LL(38)
	.align 4

LL(32):
	FMADD	xsum1, xtemp1, a1,  xsum1

	FMADD	y01, atemp1, a1,  y01

	LFD	xtemp1,  1 * SIZE(XX)

	LFD	a1,   1 * SIZE(AO1)

	STFD	y01,  0 * SIZE(YY)

	LFD	y01,  1 * SIZE(YY)

	addi	AO1, AO1, 1 * SIZE

	addi	XX, XX, 1 * SIZE
	addi	YY, YY, 1 * SIZE

	bdnz	LL(32)
	.align 4

LL(38):
	/* Diagonal element: y += alpha * xsum1 + atemp1 * a1. */
	LFD	xtemp1, ALPHA

	FMUL	xsum1, xtemp1, xsum1

	FMADD	xsum1, atemp1, a1,  xsum1

	FADD	y01, y01, xsum1

	STFD	y01,  0 * SIZE(YY)
	.align 4

/* Write-back for strided y.  When the byte stride of y equals SIZE the
   results were accumulated in place and nothing remains to do.
   Otherwise the contiguous results at NEW_Y are added to the strided
   caller vector: Y is the read cursor, YY the write cursor, both stepped
   by INCY. */
LL(990):
	cmpwi	cr0, INCY, SIZE
	beq	LL(999)

	mr	YY, Y

	/* CTR = M / 8 groups of eight elements. */
	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(995)
	.align 4

LL(991):
	/* Gather eight strided y values. */
	LFD	f0,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f1,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f2,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f3,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f4,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f5,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f6,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f7,  0 * SIZE(Y)
	add	Y, Y, INCY

	/* Load the eight matching buffered results. */
	LFD	f8,   0 * SIZE(NEW_Y)
	LFD	f9,   1 * SIZE(NEW_Y)
	LFD	f10,  2 * SIZE(NEW_Y)
	LFD	f11,  3 * SIZE(NEW_Y)
	LFD	f12,  4 * SIZE(NEW_Y)
	LFD	f13,  5 * SIZE(NEW_Y)
	LFD	f14,  6 * SIZE(NEW_Y)
	LFD	f15,  7 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 8 * SIZE

	FADD	f8,  f8,  f0
	FADD	f9,  f9,  f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3
	FADD	f12, f12, f4
	FADD	f13, f13, f5
	FADD	f14, f14, f6
	FADD	f15, f15, f7

	/* Scatter the sums back to the strided y. */
	STFD	f8,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f9,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f10, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f11, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f12, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f13, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f14, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f15, 0 * SIZE(YY)
	add	YY, YY, INCY
	bdnz	LL(991)
	.align 4

LL(995):
	/* Four more elements when bit 2 of M is set. */
	andi.	J, M, 4
	ble	LL(996)

	LFD	f0,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f1,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f2,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f3,  0 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8,   0 * SIZE(NEW_Y)
	LFD	f9,   1 * SIZE(NEW_Y)
	LFD	f10,  2 * SIZE(NEW_Y)
	LFD	f11,  3 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 4 * SIZE

	FADD	f8,  f8,  f0
	FADD	f9,  f9,  f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3

	STFD	f8,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f9,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f10, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f11, 0 * SIZE(YY)
	add	YY, YY, INCY
	.align 4

LL(996):
	/* Two more elements when bit 1 of M is set. */
	andi.	J, M, 2
	ble	LL(997)

	LFD	f0,  0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f1,  0 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8,   0 * SIZE(NEW_Y)
	LFD	f9,   1 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 2 * SIZE

	FADD	f8,  f8,  f0
	FADD	f9,  f9,  f1

	STFD	f8,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f9,  0 * SIZE(YY)
	add	YY, YY, INCY
	.align 4

LL(997):
	/* Final element when M is odd. */
	andi.	J, M, 1
	ble	LL(999)

	LFD	f0,  0 * SIZE(Y)
	LFD	f8,   0 * SIZE(NEW_Y)

	FADD	f8,  f8,  f0

	STFD	f8,  0 * SIZE(YY)
	.align 4

/* Common exit: return 0 in r3, restore every register saved by the
   prologue from the same frame offsets, release the frame and return. */
LL(999):
	li	r3, 0

	lfd	f14,     0(SP)
	lfd	f15,     8(SP)
	lfd	f16,    16(SP)
	lfd	f17,    24(SP)
	lfd	f18,    32(SP)
	lfd	f19,    40(SP)
	lfd	f20,    48(SP)
	lfd	f21,    56(SP)
	lfd	f22,    64(SP)
	lfd	f23,    72(SP)
	lfd	f24,    80(SP)
	lfd	f25,    88(SP)
	lfd	f26,    96(SP)
	lfd	f27,   104(SP)
	lfd	f28,   112(SP)
	lfd	f29,   120(SP)
	lfd	f30,   128(SP)
	lfd	f31,   136(SP)

#ifdef __64BIT__
	ld	r14,   144(SP)
	ld	r15,   152(SP)
	ld	r16,   160(SP)
	ld	r17,   168(SP)
	ld	r18,   176(SP)
	ld	r19,   184(SP)
	ld	r20,   192(SP)
	ld	r21,   200(SP)
	ld	r22,   208(SP)
	ld	r23,   216(SP)
	ld	r24,   224(SP)
	ld	r25,   232(SP)
	ld	r26,   240(SP)
	ld	r27,   248(SP)
#else
	lwz	r14,   144(SP)
	lwz	r15,   148(SP)
	lwz	r16,   152(SP)
	lwz	r17,   156(SP)
	lwz	r18,   160(SP)
	lwz	r19,   164(SP)
	lwz	r20,   168(SP)
	lwz	r21,   172(SP)
	lwz	r22,   176(SP)
	lwz	r23,   180(SP)
	lwz	r24,   184(SP)
	lwz	r25,   188(SP)
	lwz	r26,   192(SP)
	lwz	r27,   196(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif
