1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#ifndef NEEDPARAM
43#ifndef DOUBLE
44#include "sparam.h"
45#else
46#include "dparam.h"
47#endif
48#endif
49
50#ifdef linux
51#ifndef __64BIT__
52#define M	r3
53#define	N	r4
54#define X	r6
55#define INCX	r7
56#define Y	r8
57#define	INCY	r9
58#define	A	r10
59#define	LDA	r5
60#else
61#define M	r3
62#define	N	r4
63#define X	r7
64#define INCX	r8
65#define Y	r9
66#define	INCY	r10
67#define	A	r5
68#define	LDA	r6
69#endif
70#endif
71
72#if defined(_AIX) || defined(__APPLE__)
73#if !defined(__64BIT__) && defined(DOUBLE)
74#define M	r3
75#define	N	r4
76#define X	r8
77#define INCX	r9
78#define Y	r10
79#define	INCY	r5
80#define	A	r6
81#define	LDA	r7
82#else
83#define M	r3
84#define	N	r4
85#define X	r7
86#define INCX	r8
87#define Y	r9
88#define	INCY	r10
89#define	A	r5
90#define	LDA	r6
91#endif
92#endif
93
94#define I	r11
95#define	J	r12
96
97#define AO1	r14
98#define AO2	r15
99#define AO3	r16
100#define AO4	r17
101#define AO5	r18
102#define AO6	r19
103#define AO7	r20
104#define AO8	r21
105
106#define	X1	r22
107#define	PREA	r23
108#define	PREC	r24
109#define XX	r25
110#define BUFFER	r26
111
112#define y01 f0
113#define y02 f1
114#define y03 f2
115#define y04 f3
116#define y05 f4
117#define y06 f5
118#define y07 f6
119#define y08 f7
120
121#define alpha1 f8
122#define alpha2 f9
123
124#define a1     f12
125#define a2     f13
126#define a3     f14
127#define a4     f15
128#define a5     f16
129#define a6     f17
130#define a7     f18
131#define a8     f19
132#define a9     f20
133#define a10    f21
134#define a11    f22
135#define a12    f23
136#define a13    f24
137#define a14    f25
138#define a15    f26
139#define a16    f27
140
141#define alpha  f31
142
143#if defined(PPC440) || defined(PPC440FP2)
144#define PREFETCHSIZE_A  24
145#define PREFETCHSIZE_C  16
146#endif
147
148#ifdef PPC970
149#define PREFETCHSIZE_A  16
150#define PREFETCHSIZE_C  16
151#endif
152
153#ifdef POWER4
154#define PREFETCHSIZE_A  16
155#define PREFETCHSIZE_C  16
156#endif
157
158#ifdef POWER5
159#define PREFETCHSIZE_A  16
160#define PREFETCHSIZE_C  16
161#endif
162
163#ifndef NEEDPARAM
164
165#ifndef __64BIT__
166#define STACKSIZE 224
167#else
168#define STACKSIZE 280
169#endif
170
171	PROLOGUE
172	PROFCODE
173
174	addi	SP,   SP, -STACKSIZE
175
176	stfd	f14,     0(SP)
177	stfd	f15,     8(SP)
178	stfd	f16,    16(SP)
179	stfd	f17,    24(SP)
180	stfd	f18,    32(SP)
181	stfd	f19,    40(SP)
182	stfd	f20,    48(SP)
183	stfd	f21,    56(SP)
184	stfd	f22,    64(SP)
185	stfd	f23,    72(SP)
186	stfd	f24,    80(SP)
187	stfd	f25,    88(SP)
188	stfd	f26,    96(SP)
189	stfd	f27,   104(SP)
190	stfd	f28,   112(SP)
191	stfd	f29,   120(SP)
192	stfd	f30,   128(SP)
193	stfd	f31,   136(SP)
194
195#ifdef __64BIT__
196	std	r14,   144(SP)
197	std	r15,   152(SP)
198	std	r16,   160(SP)
199	std	r17,   168(SP)
200	std	r18,   176(SP)
201	std	r19,   184(SP)
202	std	r20,   192(SP)
203	std	r21,   200(SP)
204	std	r22,   208(SP)
205	std	r23,   216(SP)
206	std	r24,   224(SP)
207	std	r25,   232(SP)
208	std	r26,   240(SP)
209	std	r27,   248(SP)
210#else
211	stw	r14,   144(SP)
212	stw	r15,   148(SP)
213	stw	r16,   152(SP)
214	stw	r17,   156(SP)
215	stw	r18,   160(SP)
216	stw	r19,   164(SP)
217	stw	r20,   168(SP)
218	stw	r21,   172(SP)
219	stw	r22,   176(SP)
220	stw	r23,   180(SP)
221	stw	r24,   184(SP)
222	stw	r25,   188(SP)
223	stw	r26,   192(SP)
224	stw	r27,   196(SP)
225#endif
226
227#ifdef linux
228#ifndef __64BIT__
229	lwz	LDA,      8 + STACKSIZE(SP)
230	lwz	BUFFER,  12 + STACKSIZE(SP)
231#else
232	ld	A,       112 + STACKSIZE(SP)
233	ld	LDA,     120 + STACKSIZE(SP)
234	ld	BUFFER,  128 + STACKSIZE(SP)
235#endif
236#endif
237
238#if defined(_AIX) || defined(__APPLE__)
239#ifndef __64BIT__
240#ifdef DOUBLE
241	lwz	INCY,    56 + STACKSIZE(SP)
242	lwz	A,       60 + STACKSIZE(SP)
243	lwz	LDA,     64 + STACKSIZE(SP)
244	lwz	BUFFER,  68 + STACKSIZE(SP)
245#else
246	lwz	A,       56 + STACKSIZE(SP)
247	lwz	LDA,     60 + STACKSIZE(SP)
248	lwz	BUFFER,  64 + STACKSIZE(SP)
249#endif
250#else
251	ld	A,       112 + STACKSIZE(SP)
252	ld	LDA,     120 + STACKSIZE(SP)
253	ld	BUFFER,  128 + STACKSIZE(SP)
254#endif
255#endif
256
257	fmr	alpha, f1
258
259	slwi	LDA,  LDA,  BASE_SHIFT
260	slwi	INCX, INCX, BASE_SHIFT
261	slwi	INCY, INCY, BASE_SHIFT
262
263	li	PREA, PREFETCHSIZE_A * SIZE
264	li	PREC, PREFETCHSIZE_C * SIZE
265
266	cmpwi	cr0, M, 0
267	ble-	LL(999)
268
269	cmpwi	cr0, N, 0
270	ble-	LL(999)
271
272	mr	XX, X
273
274	cmpi	cr0, 0, INCX, SIZE
275	beq	LL(10)
276
277	mr	XX, BUFFER
278	mr	X1, BUFFER
279
280	srawi.	r0,  M, 3
281	mtspr	CTR, r0
282	ble	LL(05)
283	.align 4
284
285LL(01):
286	LFD	a1, 0 * SIZE(X)
287	add	X, X, INCX
288	LFD	a2, 0 * SIZE(X)
289	add	X, X, INCX
290	LFD	a3, 0 * SIZE(X)
291	add	X, X, INCX
292	LFD	a4, 0 * SIZE(X)
293	add	X, X, INCX
294	LFD	a5, 0 * SIZE(X)
295	add	X, X, INCX
296	LFD	a6, 0 * SIZE(X)
297	add	X, X, INCX
298	LFD	a7, 0 * SIZE(X)
299	add	X, X, INCX
300	LFD	a8, 0 * SIZE(X)
301	add	X, X, INCX
302
303	STFD	a1, 0 * SIZE(X1)
304	STFD	a2, 1 * SIZE(X1)
305	STFD	a3, 2 * SIZE(X1)
306	STFD	a4, 3 * SIZE(X1)
307	STFD	a5, 4 * SIZE(X1)
308	STFD	a6, 5 * SIZE(X1)
309	STFD	a7, 6 * SIZE(X1)
310	STFD	a8, 7 * SIZE(X1)
311
312	addi	X1, X1, 8 * SIZE
313	bdnz+	LL(01)
314	.align 4
315
316LL(05):
317	andi.	r0, M, 7
318	mtspr	CTR, r0
319	ble	LL(10)
320	.align 4
321
322LL(06):
323	LFD	a1, 0 * SIZE(X)
324	add	X, X, INCX
325	STFD	a1, 0 * SIZE(X1)
326	addi	X1, X1, SIZE
327	bdnz+	LL(06)
328	.align 4
329
330LL(10):
331	srawi.	J, N, 1
332	ble	LL(20)
333	.align 4
334
335LL(11):
336	LFD	alpha1, 0 * SIZE(Y)
337	add	Y, Y, INCY
338	LFD	alpha2, 0 * SIZE(Y)
339	add	Y, Y, INCY
340
341	FMUL	alpha1, alpha, alpha1
342	FMUL	alpha2, alpha, alpha2
343
344	mr	AO1, A
345	add	AO2, A,   LDA
346	add	A,   AO2, LDA
347
348	mr	X1, XX
349
350	srawi.	r0,  M, 4
351	mtspr	CTR, r0
352	ble	LL(15)
353
354	LFD	a1,   0 * SIZE(AO1)
355	LFD	a2,   1 * SIZE(AO1)
356	LFD	a3,   2 * SIZE(AO1)
357	LFD	a4,   3 * SIZE(AO1)
358
359	LFD	a5,   4 * SIZE(AO1)
360	LFD	a6,   5 * SIZE(AO1)
361	LFD	a7,   6 * SIZE(AO1)
362	LFD	a8,   7 * SIZE(AO1)
363
364	LFD	y01,  0 * SIZE(X1)
365	LFD	y02,  1 * SIZE(X1)
366	LFD	y03,  2 * SIZE(X1)
367	LFD	y04,  3 * SIZE(X1)
368
369	LFD	y05,  4 * SIZE(X1)
370	LFD	y06,  5 * SIZE(X1)
371	LFD	y07,  6 * SIZE(X1)
372	LFD	y08,  7 * SIZE(X1)
373
374	LFD	a9,   0 * SIZE(AO2)
375	LFD	a10,  1 * SIZE(AO2)
376	LFD	a11,  2 * SIZE(AO2)
377	LFD	a12,  3 * SIZE(AO2)
378
379	LFD	a13,  4 * SIZE(AO2)
380	LFD	a14,  5 * SIZE(AO2)
381	LFD	a15,  6 * SIZE(AO2)
382	LFD	a16,  7 * SIZE(AO2)
383	bdz	LL(13)
384	.align 4
385
386LL(12):
387	FMADD	a1,  alpha1, y01, a1
388	FMADD	a2,  alpha1, y02, a2
389	FMADD	a3,  alpha1, y03, a3
390	FMADD	a4,  alpha1, y04, a4
391
392	FMADD	a5,  alpha1, y05, a5
393	FMADD	a6,  alpha1, y06, a6
394	FMADD	a7,  alpha1, y07, a7
395	FMADD	a8,  alpha1, y08, a8
396
397	STFD	a1,   0 * SIZE(AO1)
398	STFD	a2,   1 * SIZE(AO1)
399	STFD	a3,   2 * SIZE(AO1)
400	STFD	a4,   3 * SIZE(AO1)
401
402	LFD	a1,   8 * SIZE(AO1)
403	LFD	a2,   9 * SIZE(AO1)
404	LFD	a3,  10 * SIZE(AO1)
405	LFD	a4,  11 * SIZE(AO1)
406
407	STFD	a5,   4 * SIZE(AO1)
408	STFD	a6,   5 * SIZE(AO1)
409	STFD	a7,   6 * SIZE(AO1)
410	STFD	a8,   7 * SIZE(AO1)
411
412	LFD	a5,  12 * SIZE(AO1)
413	LFD	a6,  13 * SIZE(AO1)
414	LFD	a7,  14 * SIZE(AO1)
415	LFD	a8,  15 * SIZE(AO1)
416
417	FMADD	a9,  alpha2, y01, a9
418	FMADD	a10, alpha2, y02, a10
419	FMADD	a11, alpha2, y03, a11
420	FMADD	a12, alpha2, y04, a12
421
422	LFD	y01,  8 * SIZE(X1)
423	LFD	y02,  9 * SIZE(X1)
424	LFD	y03, 10 * SIZE(X1)
425	LFD	y04, 11 * SIZE(X1)
426
427	FMADD	a13, alpha2, y05, a13
428	FMADD	a14, alpha2, y06, a14
429	FMADD	a15, alpha2, y07, a15
430	FMADD	a16, alpha2, y08, a16
431
432	LFD	y05, 12 * SIZE(X1)
433	LFD	y06, 13 * SIZE(X1)
434	LFD	y07, 14 * SIZE(X1)
435	LFD	y08, 15 * SIZE(X1)
436
437	STFD	a9,   0 * SIZE(AO2)
438	STFD	a10,  1 * SIZE(AO2)
439	STFD	a11,  2 * SIZE(AO2)
440	STFD	a12,  3 * SIZE(AO2)
441
442	LFD	a9,   8 * SIZE(AO2)
443	LFD	a10,  9 * SIZE(AO2)
444	LFD	a11, 10 * SIZE(AO2)
445	LFD	a12, 11 * SIZE(AO2)
446
447	STFD	a13,  4 * SIZE(AO2)
448	STFD	a14,  5 * SIZE(AO2)
449	STFD	a15,  6 * SIZE(AO2)
450	STFD	a16,  7 * SIZE(AO2)
451
452	LFD	a13, 12 * SIZE(AO2)
453	LFD	a14, 13 * SIZE(AO2)
454	LFD	a15, 14 * SIZE(AO2)
455	LFD	a16, 15 * SIZE(AO2)
456
457	FMADD	a1,  alpha1, y01, a1
458	FMADD	a2,  alpha1, y02, a2
459	FMADD	a3,  alpha1, y03, a3
460	FMADD	a4,  alpha1, y04, a4
461
462	FMADD	a5,  alpha1, y05, a5
463	FMADD	a6,  alpha1, y06, a6
464	FMADD	a7,  alpha1, y07, a7
465	FMADD	a8,  alpha1, y08, a8
466
467	STFD	a1,   8 * SIZE(AO1)
468	STFD	a2,   9 * SIZE(AO1)
469	STFD	a3,  10 * SIZE(AO1)
470	STFD	a4,  11 * SIZE(AO1)
471
472	LFD	a1,  16 * SIZE(AO1)
473	LFD	a2,  17 * SIZE(AO1)
474	LFD	a3,  18 * SIZE(AO1)
475	LFD	a4,  19 * SIZE(AO1)
476
477	STFD	a5,  12 * SIZE(AO1)
478	STFD	a6,  13 * SIZE(AO1)
479	STFD	a7,  14 * SIZE(AO1)
480	STFD	a8,  15 * SIZE(AO1)
481
482	LFD	a5,  20 * SIZE(AO1)
483	LFD	a6,  21 * SIZE(AO1)
484	LFD	a7,  22 * SIZE(AO1)
485	LFD	a8,  23 * SIZE(AO1)
486
487	FMADD	a9,  alpha2, y01, a9
488	FMADD	a10, alpha2, y02, a10
489	FMADD	a11, alpha2, y03, a11
490	FMADD	a12, alpha2, y04, a12
491
492	LFD	y01, 16 * SIZE(X1)
493	LFD	y02, 17 * SIZE(X1)
494	LFD	y03, 18 * SIZE(X1)
495	LFD	y04, 19 * SIZE(X1)
496
497	FMADD	a13, alpha2, y05, a13
498	FMADD	a14, alpha2, y06, a14
499	FMADD	a15, alpha2, y07, a15
500	FMADD	a16, alpha2, y08, a16
501
502	LFD	y05, 20 * SIZE(X1)
503	LFD	y06, 21 * SIZE(X1)
504	LFD	y07, 22 * SIZE(X1)
505	LFD	y08, 23 * SIZE(X1)
506
507	STFD	a9,   8 * SIZE(AO2)
508	STFD	a10,  9 * SIZE(AO2)
509	STFD	a11, 10 * SIZE(AO2)
510	STFD	a12, 11 * SIZE(AO2)
511
512	LFD	a9,  16 * SIZE(AO2)
513	LFD	a10, 17 * SIZE(AO2)
514	LFD	a11, 18 * SIZE(AO2)
515	LFD	a12, 19 * SIZE(AO2)
516
517	STFD	a13, 12 * SIZE(AO2)
518	STFD	a14, 13 * SIZE(AO2)
519	STFD	a15, 14 * SIZE(AO2)
520	STFD	a16, 15 * SIZE(AO2)
521
522	LFD	a13, 20 * SIZE(AO2)
523	LFD	a14, 21 * SIZE(AO2)
524	LFD	a15, 22 * SIZE(AO2)
525	LFD	a16, 23 * SIZE(AO2)
526
527	addi	AO1, AO1, 16 * SIZE
528	addi	AO2, AO2, 16 * SIZE
529	addi	X1, X1,   16 * SIZE
530
531	DCBT(AO1, PREA)
532	DCBT(AO2, PREA)
533	DCBT(Y1, PREY)
534
535	bdnz+	LL(12)
536	.align 4
537
538LL(13):
539	FMADD	a1,  alpha1, y01, a1
540	FMADD	a2,  alpha1, y02, a2
541	FMADD	a3,  alpha1, y03, a3
542	FMADD	a4,  alpha1, y04, a4
543
544	FMADD	a5,  alpha1, y05, a5
545	FMADD	a6,  alpha1, y06, a6
546	FMADD	a7,  alpha1, y07, a7
547	FMADD	a8,  alpha1, y08, a8
548
549	STFD	a1,   0 * SIZE(AO1)
550	STFD	a2,   1 * SIZE(AO1)
551	STFD	a3,   2 * SIZE(AO1)
552	STFD	a4,   3 * SIZE(AO1)
553
554	LFD	a1,   8 * SIZE(AO1)
555	LFD	a2,   9 * SIZE(AO1)
556	LFD	a3,  10 * SIZE(AO1)
557	LFD	a4,  11 * SIZE(AO1)
558
559	STFD	a5,   4 * SIZE(AO1)
560	STFD	a6,   5 * SIZE(AO1)
561	STFD	a7,   6 * SIZE(AO1)
562	STFD	a8,   7 * SIZE(AO1)
563
564	LFD	a5,  12 * SIZE(AO1)
565	LFD	a6,  13 * SIZE(AO1)
566	LFD	a7,  14 * SIZE(AO1)
567	LFD	a8,  15 * SIZE(AO1)
568
569	FMADD	a9,  alpha2, y01, a9
570	FMADD	a10, alpha2, y02, a10
571	FMADD	a11, alpha2, y03, a11
572	FMADD	a12, alpha2, y04, a12
573
574	LFD	y01,  8 * SIZE(X1)
575	LFD	y02,  9 * SIZE(X1)
576	LFD	y03, 10 * SIZE(X1)
577	LFD	y04, 11 * SIZE(X1)
578
579	FMADD	a13, alpha2, y05, a13
580	FMADD	a14, alpha2, y06, a14
581	FMADD	a15, alpha2, y07, a15
582	FMADD	a16, alpha2, y08, a16
583
584	LFD	y05, 12 * SIZE(X1)
585	LFD	y06, 13 * SIZE(X1)
586	LFD	y07, 14 * SIZE(X1)
587	LFD	y08, 15 * SIZE(X1)
588
589	STFD	a9,   0 * SIZE(AO2)
590	STFD	a10,  1 * SIZE(AO2)
591	STFD	a11,  2 * SIZE(AO2)
592	STFD	a12,  3 * SIZE(AO2)
593
594	LFD	a9,   8 * SIZE(AO2)
595	LFD	a10,  9 * SIZE(AO2)
596	LFD	a11, 10 * SIZE(AO2)
597	LFD	a12, 11 * SIZE(AO2)
598
599	STFD	a13,  4 * SIZE(AO2)
600	STFD	a14,  5 * SIZE(AO2)
601	STFD	a15,  6 * SIZE(AO2)
602	STFD	a16,  7 * SIZE(AO2)
603
604	LFD	a13, 12 * SIZE(AO2)
605	LFD	a14, 13 * SIZE(AO2)
606	LFD	a15, 14 * SIZE(AO2)
607	LFD	a16, 15 * SIZE(AO2)
608
609	FMADD	a1,  alpha1, y01, a1
610	FMADD	a2,  alpha1, y02, a2
611	FMADD	a3,  alpha1, y03, a3
612	FMADD	a4,  alpha1, y04, a4
613
614	FMADD	a5,  alpha1, y05, a5
615	FMADD	a6,  alpha1, y06, a6
616	FMADD	a7,  alpha1, y07, a7
617	FMADD	a8,  alpha1, y08, a8
618
619	STFD	a1,   8 * SIZE(AO1)
620	STFD	a2,   9 * SIZE(AO1)
621	STFD	a3,  10 * SIZE(AO1)
622	STFD	a4,  11 * SIZE(AO1)
623
624	LFD	a1,  16 * SIZE(AO1)
625	LFD	a2,  17 * SIZE(AO1)
626	LFD	a3,  18 * SIZE(AO1)
627	LFD	a4,  19 * SIZE(AO1)
628
629	STFD	a5,  12 * SIZE(AO1)
630	STFD	a6,  13 * SIZE(AO1)
631	STFD	a7,  14 * SIZE(AO1)
632	STFD	a8,  15 * SIZE(AO1)
633
634	LFD	a5,  20 * SIZE(AO1)
635	LFD	a6,  21 * SIZE(AO1)
636	LFD	a7,  22 * SIZE(AO1)
637	LFD	a8,  23 * SIZE(AO1)
638
639	FMADD	a9,  alpha2, y01, a9
640	FMADD	a10, alpha2, y02, a10
641	FMADD	a11, alpha2, y03, a11
642	FMADD	a12, alpha2, y04, a12
643
644	FMADD	a13, alpha2, y05, a13
645	FMADD	a14, alpha2, y06, a14
646	FMADD	a15, alpha2, y07, a15
647	FMADD	a16, alpha2, y08, a16
648
649	STFD	a9,   8 * SIZE(AO2)
650	STFD	a10,  9 * SIZE(AO2)
651	STFD	a11, 10 * SIZE(AO2)
652	STFD	a12, 11 * SIZE(AO2)
653
654	STFD	a13, 12 * SIZE(AO2)
655	STFD	a14, 13 * SIZE(AO2)
656	STFD	a15, 14 * SIZE(AO2)
657	STFD	a16, 15 * SIZE(AO2)
658
659	addi	AO1, AO1, 16 * SIZE
660	addi	AO2, AO2, 16 * SIZE
661	addi	X1, X1,   16 * SIZE
662	.align 4
663
664
665LL(15):
666	andi.	r0, M, 15
667	ble	LL(19)
668
669	andi.	r0, M, 8
670	ble	LL(16)
671
672	LFD	y01,  0 * SIZE(X1)
673	LFD	y02,  1 * SIZE(X1)
674	LFD	y03,  2 * SIZE(X1)
675	LFD	y04,  3 * SIZE(X1)
676	LFD	y05,  4 * SIZE(X1)
677	LFD	y06,  5 * SIZE(X1)
678	LFD	y07,  6 * SIZE(X1)
679	LFD	y08,  7 * SIZE(X1)
680
681	LFD	a1,   0 * SIZE(AO1)
682	LFD	a2,   1 * SIZE(AO1)
683	LFD	a3,   2 * SIZE(AO1)
684	LFD	a4,   3 * SIZE(AO1)
685	LFD	a5,   4 * SIZE(AO1)
686	LFD	a6,   5 * SIZE(AO1)
687	LFD	a7,   6 * SIZE(AO1)
688	LFD	a8,   7 * SIZE(AO1)
689
690	LFD	a9,   0 * SIZE(AO2)
691	LFD	a10,  1 * SIZE(AO2)
692	LFD	a11,  2 * SIZE(AO2)
693	LFD	a12,  3 * SIZE(AO2)
694	LFD	a13,  4 * SIZE(AO2)
695	LFD	a14,  5 * SIZE(AO2)
696	LFD	a15,  6 * SIZE(AO2)
697	LFD	a16,  7 * SIZE(AO2)
698
699	FMADD	a1,  alpha1, y01, a1
700	FMADD	a2,  alpha1, y02, a2
701	FMADD	a3,  alpha1, y03, a3
702	FMADD	a4,  alpha1, y04, a4
703
704	STFD	a1,   0 * SIZE(AO1)
705	STFD	a2,   1 * SIZE(AO1)
706	STFD	a3,   2 * SIZE(AO1)
707	STFD	a4,   3 * SIZE(AO1)
708
709	FMADD	a5,  alpha1, y05, a5
710	FMADD	a6,  alpha1, y06, a6
711	FMADD	a7,  alpha1, y07, a7
712	FMADD	a8,  alpha1, y08, a8
713
714	STFD	a5,   4 * SIZE(AO1)
715	STFD	a6,   5 * SIZE(AO1)
716	STFD	a7,   6 * SIZE(AO1)
717	STFD	a8,   7 * SIZE(AO1)
718
719	FMADD	a9,  alpha2, y01, a9
720	FMADD	a10, alpha2, y02, a10
721	FMADD	a11, alpha2, y03, a11
722	FMADD	a12, alpha2, y04, a12
723
724	STFD	a9,   0 * SIZE(AO2)
725	STFD	a10,  1 * SIZE(AO2)
726	STFD	a11,  2 * SIZE(AO2)
727	STFD	a12,  3 * SIZE(AO2)
728
729	FMADD	a13, alpha2, y05, a13
730	FMADD	a14, alpha2, y06, a14
731	FMADD	a15, alpha2, y07, a15
732	FMADD	a16, alpha2, y08, a16
733
734	STFD	a13,  4 * SIZE(AO2)
735	STFD	a14,  5 * SIZE(AO2)
736	STFD	a15,  6 * SIZE(AO2)
737	STFD	a16,  7 * SIZE(AO2)
738
739	addi	AO1, AO1, 8 * SIZE
740	addi	AO2, AO2, 8 * SIZE
741	addi	X1, X1, 8 * SIZE
742	.align 4
743
744LL(16):
745	andi.	r0, M, 4
746	ble	LL(17)
747
748	LFD	a1,  0 * SIZE(AO1)
749	LFD	a2,  1 * SIZE(AO1)
750	LFD	a3,  2 * SIZE(AO1)
751	LFD	a4,  3 * SIZE(AO1)
752
753	LFD	y01, 0 * SIZE(X1)
754	LFD	y02, 1 * SIZE(X1)
755	LFD	y03, 2 * SIZE(X1)
756	LFD	y04, 3 * SIZE(X1)
757
758	LFD	a5,  0 * SIZE(AO2)
759	LFD	a6,  1 * SIZE(AO2)
760	LFD	a7,  2 * SIZE(AO2)
761	LFD	a8,  3 * SIZE(AO2)
762
763	FMADD	a1,  alpha1, y01, a1
764	FMADD	a2,  alpha1, y02, a2
765	FMADD	a3,  alpha1, y03, a3
766	FMADD	a4,  alpha1, y04, a4
767
768	STFD	a1,  0 * SIZE(AO1)
769	STFD	a2,  1 * SIZE(AO1)
770	STFD	a3,  2 * SIZE(AO1)
771	STFD	a4,  3 * SIZE(AO1)
772
773	FMADD	a5,  alpha2, y01, a5
774	FMADD	a6,  alpha2, y02, a6
775	FMADD	a7,  alpha2, y03, a7
776	FMADD	a8,  alpha2, y04, a8
777
778	STFD	a5,  0 * SIZE(AO2)
779	STFD	a6,  1 * SIZE(AO2)
780	STFD	a7,  2 * SIZE(AO2)
781	STFD	a8,  3 * SIZE(AO2)
782
783	addi	AO1, AO1, 4 * SIZE
784	addi	AO2, AO2, 4 * SIZE
785	addi	X1, X1, 4 * SIZE
786	.align 4
787
788LL(17):
789	andi.	r0, M, 2
790	ble	LL(18)
791
792	LFD	a1,  0 * SIZE(AO1)
793	LFD	a2,  1 * SIZE(AO1)
794	LFD	a3,  0 * SIZE(AO2)
795	LFD	a4,  1 * SIZE(AO2)
796
797	LFD	y01, 0 * SIZE(X1)
798	LFD	y02, 1 * SIZE(X1)
799
800	FMADD	a1, alpha1, y01, a1
801	FMADD	a2, alpha1, y02, a2
802	FMADD	a3, alpha2, y01, a3
803	FMADD	a4, alpha2, y02, a4
804
805	STFD	a1,  0 * SIZE(AO1)
806	STFD	a2,  1 * SIZE(AO1)
807	STFD	a3,  0 * SIZE(AO2)
808	STFD	a4,  1 * SIZE(AO2)
809
810	addi	AO1, AO1, 2 * SIZE
811	addi	AO2, AO2, 2 * SIZE
812
813	addi	X1, X1, 2 * SIZE
814	.align 4
815
816LL(18):
817	andi.	r0, M, 1
818	ble	LL(19)
819
820	LFD	y01, 0 * SIZE(X1)
821
822	LFD	a1,  0 * SIZE(AO1)
823	LFD	a2,  0 * SIZE(AO2)
824
825	FMADD	a1, alpha1, y01, a1
826	FMADD	a2, alpha2, y01, a2
827
828	STFD	a1,  0 * SIZE(AO1)
829	STFD	a2,  0 * SIZE(AO2)
830	.align 4
831
832LL(19):
833	addi	J, J, -1
834	cmpi	cr0, 0, J, 0
835	bgt	LL(11)
836	.align 4
837
838LL(20):
839	andi.	J, N, 1
840	ble	LL(999)
841	.align 4
842
843LL(21):
844	LFD	alpha1, 0 * SIZE(Y)
845	FMUL	alpha1, alpha, alpha1
846
847	mr	AO1, A
848	mr	X1, XX
849
850	srawi.	r0,  M, 4
851	mtspr	CTR, r0
852	ble	LL(25)
853
854	LFD	a1,   0 * SIZE(AO1)
855	LFD	a2,   1 * SIZE(AO1)
856	LFD	a3,   2 * SIZE(AO1)
857	LFD	a4,   3 * SIZE(AO1)
858
859	LFD	a5,   4 * SIZE(AO1)
860	LFD	a6,   5 * SIZE(AO1)
861	LFD	a7,   6 * SIZE(AO1)
862	LFD	a8,   7 * SIZE(AO1)
863
864	LFD	y01,  0 * SIZE(X1)
865	LFD	y02,  1 * SIZE(X1)
866	LFD	y03,  2 * SIZE(X1)
867	LFD	y04,  3 * SIZE(X1)
868
869	LFD	y05,  4 * SIZE(X1)
870	LFD	y06,  5 * SIZE(X1)
871	LFD	y07,  6 * SIZE(X1)
872	LFD	y08,  7 * SIZE(X1)
873
874	bdz	LL(23)
875	.align 4
876
877LL(22):
878	FMADD	a1,  alpha1, y01, a1
879	FMADD	a2,  alpha1, y02, a2
880	FMADD	a3,  alpha1, y03, a3
881	FMADD	a4,  alpha1, y04, a4
882
883	FMADD	a5,  alpha1, y05, a5
884	FMADD	a6,  alpha1, y06, a6
885	FMADD	a7,  alpha1, y07, a7
886	FMADD	a8,  alpha1, y08, a8
887
888	STFD	a1,   0 * SIZE(AO1)
889	STFD	a2,   1 * SIZE(AO1)
890	STFD	a3,   2 * SIZE(AO1)
891	STFD	a4,   3 * SIZE(AO1)
892
893	LFD	a1,   8 * SIZE(AO1)
894	LFD	a2,   9 * SIZE(AO1)
895	LFD	a3,  10 * SIZE(AO1)
896	LFD	a4,  11 * SIZE(AO1)
897
898	STFD	a5,   4 * SIZE(AO1)
899	STFD	a6,   5 * SIZE(AO1)
900	STFD	a7,   6 * SIZE(AO1)
901	STFD	a8,   7 * SIZE(AO1)
902
903	LFD	a5,  12 * SIZE(AO1)
904	LFD	a6,  13 * SIZE(AO1)
905	LFD	a7,  14 * SIZE(AO1)
906	LFD	a8,  15 * SIZE(AO1)
907
908	LFD	y01,  8 * SIZE(X1)
909	LFD	y02,  9 * SIZE(X1)
910	LFD	y03, 10 * SIZE(X1)
911	LFD	y04, 11 * SIZE(X1)
912
913	LFD	y05, 12 * SIZE(X1)
914	LFD	y06, 13 * SIZE(X1)
915	LFD	y07, 14 * SIZE(X1)
916	LFD	y08, 15 * SIZE(X1)
917
918	FMADD	a1,  alpha1, y01, a1
919	FMADD	a2,  alpha1, y02, a2
920	FMADD	a3,  alpha1, y03, a3
921	FMADD	a4,  alpha1, y04, a4
922
923	FMADD	a5,  alpha1, y05, a5
924	FMADD	a6,  alpha1, y06, a6
925	FMADD	a7,  alpha1, y07, a7
926	FMADD	a8,  alpha1, y08, a8
927
928	STFD	a1,   8 * SIZE(AO1)
929	STFD	a2,   9 * SIZE(AO1)
930	STFD	a3,  10 * SIZE(AO1)
931	STFD	a4,  11 * SIZE(AO1)
932
933	LFD	a1,  16 * SIZE(AO1)
934	LFD	a2,  17 * SIZE(AO1)
935	LFD	a3,  18 * SIZE(AO1)
936	LFD	a4,  19 * SIZE(AO1)
937
938	STFD	a5,  12 * SIZE(AO1)
939	STFD	a6,  13 * SIZE(AO1)
940	STFD	a7,  14 * SIZE(AO1)
941	STFD	a8,  15 * SIZE(AO1)
942
943	LFD	a5,  20 * SIZE(AO1)
944	LFD	a6,  21 * SIZE(AO1)
945	LFD	a7,  22 * SIZE(AO1)
946	LFD	a8,  23 * SIZE(AO1)
947
948	LFD	y01, 16 * SIZE(X1)
949	LFD	y02, 17 * SIZE(X1)
950	LFD	y03, 18 * SIZE(X1)
951	LFD	y04, 19 * SIZE(X1)
952
953	LFD	y05, 20 * SIZE(X1)
954	LFD	y06, 21 * SIZE(X1)
955	LFD	y07, 22 * SIZE(X1)
956	LFD	y08, 23 * SIZE(X1)
957
958	addi	AO1, AO1, 16 * SIZE
959	addi	X1, X1,   16 * SIZE
960
961	DCBT(AO1, PREA)
962	DCBT(Y1, PREY)
963
964	bdnz+	LL(22)
965	.align 4
966
967LL(23):
968	FMADD	a1,  alpha1, y01, a1
969	FMADD	a2,  alpha1, y02, a2
970	FMADD	a3,  alpha1, y03, a3
971	FMADD	a4,  alpha1, y04, a4
972
973	FMADD	a5,  alpha1, y05, a5
974	FMADD	a6,  alpha1, y06, a6
975	FMADD	a7,  alpha1, y07, a7
976	FMADD	a8,  alpha1, y08, a8
977
978	STFD	a1,   0 * SIZE(AO1)
979	STFD	a2,   1 * SIZE(AO1)
980	STFD	a3,   2 * SIZE(AO1)
981	STFD	a4,   3 * SIZE(AO1)
982
983	LFD	a1,   8 * SIZE(AO1)
984	LFD	a2,   9 * SIZE(AO1)
985	LFD	a3,  10 * SIZE(AO1)
986	LFD	a4,  11 * SIZE(AO1)
987
988	STFD	a5,   4 * SIZE(AO1)
989	STFD	a6,   5 * SIZE(AO1)
990	STFD	a7,   6 * SIZE(AO1)
991	STFD	a8,   7 * SIZE(AO1)
992
993	LFD	a5,  12 * SIZE(AO1)
994	LFD	a6,  13 * SIZE(AO1)
995	LFD	a7,  14 * SIZE(AO1)
996	LFD	a8,  15 * SIZE(AO1)
997
998	LFD	y01,  8 * SIZE(X1)
999	LFD	y02,  9 * SIZE(X1)
1000	LFD	y03, 10 * SIZE(X1)
1001	LFD	y04, 11 * SIZE(X1)
1002
1003	LFD	y05, 12 * SIZE(X1)
1004	LFD	y06, 13 * SIZE(X1)
1005	LFD	y07, 14 * SIZE(X1)
1006	LFD	y08, 15 * SIZE(X1)
1007
1008	FMADD	a1,  alpha1, y01, a1
1009	FMADD	a2,  alpha1, y02, a2
1010	FMADD	a3,  alpha1, y03, a3
1011	FMADD	a4,  alpha1, y04, a4
1012
1013	FMADD	a5,  alpha1, y05, a5
1014	FMADD	a6,  alpha1, y06, a6
1015	FMADD	a7,  alpha1, y07, a7
1016	FMADD	a8,  alpha1, y08, a8
1017
1018	STFD	a1,   8 * SIZE(AO1)
1019	STFD	a2,   9 * SIZE(AO1)
1020	STFD	a3,  10 * SIZE(AO1)
1021	STFD	a4,  11 * SIZE(AO1)
1022
1023	LFD	a1,  16 * SIZE(AO1)
1024	LFD	a2,  17 * SIZE(AO1)
1025	LFD	a3,  18 * SIZE(AO1)
1026	LFD	a4,  19 * SIZE(AO1)
1027
1028	STFD	a5,  12 * SIZE(AO1)
1029	STFD	a6,  13 * SIZE(AO1)
1030	STFD	a7,  14 * SIZE(AO1)
1031	STFD	a8,  15 * SIZE(AO1)
1032
1033	LFD	a5,  20 * SIZE(AO1)
1034	LFD	a6,  21 * SIZE(AO1)
1035	LFD	a7,  22 * SIZE(AO1)
1036	LFD	a8,  23 * SIZE(AO1)
1037
1038	addi	AO1, AO1, 16 * SIZE
1039	addi	X1, X1,   16 * SIZE
1040	.align 4
1041
1042LL(25):
1043	andi.	r0, M, 15
1044	ble	LL(999)
1045
1046	andi.	r0, M, 8
1047	ble	LL(26)
1048
1049	LFD	y01,  0 * SIZE(X1)
1050	LFD	y02,  1 * SIZE(X1)
1051	LFD	y03,  2 * SIZE(X1)
1052	LFD	y04,  3 * SIZE(X1)
1053	LFD	y05,  4 * SIZE(X1)
1054	LFD	y06,  5 * SIZE(X1)
1055	LFD	y07,  6 * SIZE(X1)
1056	LFD	y08,  7 * SIZE(X1)
1057
1058	LFD	a1,   0 * SIZE(AO1)
1059	LFD	a2,   1 * SIZE(AO1)
1060	LFD	a3,   2 * SIZE(AO1)
1061	LFD	a4,   3 * SIZE(AO1)
1062	LFD	a5,   4 * SIZE(AO1)
1063	LFD	a6,   5 * SIZE(AO1)
1064	LFD	a7,   6 * SIZE(AO1)
1065	LFD	a8,   7 * SIZE(AO1)
1066
1067	FMADD	a1,  alpha1, y01, a1
1068	FMADD	a2,  alpha1, y02, a2
1069	FMADD	a3,  alpha1, y03, a3
1070	FMADD	a4,  alpha1, y04, a4
1071
1072	STFD	a1,   0 * SIZE(AO1)
1073	STFD	a2,   1 * SIZE(AO1)
1074	STFD	a3,   2 * SIZE(AO1)
1075	STFD	a4,   3 * SIZE(AO1)
1076
1077	FMADD	a5,  alpha1, y05, a5
1078	FMADD	a6,  alpha1, y06, a6
1079	FMADD	a7,  alpha1, y07, a7
1080	FMADD	a8,  alpha1, y08, a8
1081
1082	STFD	a5,   4 * SIZE(AO1)
1083	STFD	a6,   5 * SIZE(AO1)
1084	STFD	a7,   6 * SIZE(AO1)
1085	STFD	a8,   7 * SIZE(AO1)
1086
1087	addi	AO1, AO1, 8 * SIZE
1088	addi	X1, X1, 8 * SIZE
1089	.align 4
1090
1091LL(26):
1092	andi.	r0, M, 4
1093	ble	LL(27)
1094
1095	LFD	a1,  0 * SIZE(AO1)
1096	LFD	a2,  1 * SIZE(AO1)
1097	LFD	a3,  2 * SIZE(AO1)
1098	LFD	a4,  3 * SIZE(AO1)
1099
1100	LFD	y01, 0 * SIZE(X1)
1101	LFD	y02, 1 * SIZE(X1)
1102	LFD	y03, 2 * SIZE(X1)
1103	LFD	y04, 3 * SIZE(X1)
1104
1105	FMADD	a1,  alpha1, y01, a1
1106	FMADD	a2,  alpha1, y02, a2
1107	FMADD	a3,  alpha1, y03, a3
1108	FMADD	a4,  alpha1, y04, a4
1109
1110	STFD	a1,  0 * SIZE(AO1)
1111	STFD	a2,  1 * SIZE(AO1)
1112	STFD	a3,  2 * SIZE(AO1)
1113	STFD	a4,  3 * SIZE(AO1)
1114
1115	addi	AO1, AO1, 4 * SIZE
1116	addi	X1, X1, 4 * SIZE
1117	.align 4
1118
1119LL(27):
1120	andi.	r0, M, 2
1121	ble	LL(28)
1122
1123	LFD	a1,  0 * SIZE(AO1)
1124	LFD	a2,  1 * SIZE(AO1)
1125
1126	LFD	y01, 0 * SIZE(X1)
1127	LFD	y02, 1 * SIZE(X1)
1128
1129	FMADD	a1, alpha1, y01, a1
1130	FMADD	a2, alpha1, y02, a2
1131
1132	STFD	a1,  0 * SIZE(AO1)
1133	STFD	a2,  1 * SIZE(AO1)
1134
1135	addi	AO1, AO1, 2 * SIZE
1136	addi	X1, X1, 2 * SIZE
1137	.align 4
1138
1139LL(28):
1140	andi.	r0, M, 1
1141	ble	LL(999)
1142
1143	LFD	y01, 0 * SIZE(X1)
1144	LFD	a1,  0 * SIZE(AO1)
1145
1146	FMADD	a1, alpha1, y01, a1
1147
1148	STFD	a1,  0 * SIZE(AO1)
1149	.align 4
1150
1151LL(999):
1152	li	r3, 0
1153
1154	lfd	f14,     0(SP)
1155	lfd	f15,     8(SP)
1156	lfd	f16,    16(SP)
1157	lfd	f17,    24(SP)
1158	lfd	f18,    32(SP)
1159	lfd	f19,    40(SP)
1160	lfd	f20,    48(SP)
1161	lfd	f21,    56(SP)
1162	lfd	f22,    64(SP)
1163	lfd	f23,    72(SP)
1164	lfd	f24,    80(SP)
1165	lfd	f25,    88(SP)
1166	lfd	f26,    96(SP)
1167	lfd	f27,   104(SP)
1168	lfd	f28,   112(SP)
1169	lfd	f29,   120(SP)
1170	lfd	f30,   128(SP)
1171	lfd	f31,   136(SP)
1172
1173#ifdef __64BIT__
1174	ld	r14,   144(SP)
1175	ld	r15,   152(SP)
1176	ld	r16,   160(SP)
1177	ld	r17,   168(SP)
1178	ld	r18,   176(SP)
1179	ld	r19,   184(SP)
1180	ld	r20,   192(SP)
1181	ld	r21,   200(SP)
1182	ld	r22,   208(SP)
1183	ld	r23,   216(SP)
1184	ld	r24,   224(SP)
1185	ld	r25,   232(SP)
1186	ld	r26,   240(SP)
1187	ld	r27,   248(SP)
1188#else
1189	lwz	r14,   144(SP)
1190	lwz	r15,   148(SP)
1191	lwz	r16,   152(SP)
1192	lwz	r17,   156(SP)
1193	lwz	r18,   160(SP)
1194	lwz	r19,   164(SP)
1195	lwz	r20,   168(SP)
1196	lwz	r21,   172(SP)
1197	lwz	r22,   176(SP)
1198	lwz	r23,   180(SP)
1199	lwz	r24,   184(SP)
1200	lwz	r25,   188(SP)
1201	lwz	r26,   192(SP)
1202	lwz	r27,   196(SP)
1203#endif
1204
1205	addi	SP, SP, STACKSIZE
1206	blr
1207
1208	EPILOGUE
1209#endif
1210