1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Integer argument registers.  M and N are both tested against zero on  */
/* entry; M bounds the unrolled row loops (M >> 3, then the M & 4/2/1     */
/* tails) and N bounds the column loops (N >> 2, then N & 2 and N & 1).   */
/* A is the matrix pointer and LDA the distance between the columns that  */
/* A1..A4 address.  X/INCX and Y/INCY are vector pointers and strides;    */
/* INCY is fetched from the stack (8(SP)) rather than from a register.    */
/* LDA, INCX and INCY are shifted left by BASE_SHIFT before use.          */
42#define M	r3
43#define	N	r4
44#define A	r6
45#define LDA	r7
46#define X	r8
47#define	INCX	r9
48#define	Y	r10
49#define	INCY	r5
50
/* Loop counters.  J counts the remaining passes of the column loop.     */
/* I is not referenced in the portion of the routine reviewed here.      */
51#define I	r11
52#define	J	r12
53
/* INCY2 is set to INCY + INCY.  A1..A4 point at the (up to four)         */
/* columns handled in one pass: A1 = A, A2 = A1 + LDA, and so on.         */
54#define INCY2	r24
55#define A1	r25
56#define A2	r26
57#define A3	r27
58#define A4	r28
59
/* YL is the pointer used for loads from y, YS the pointer used for      */
/* stores to y.  INC2 holds the constant 2 * SIZE, the step used by the  */
/* paired loads from the A columns.                                      */
60#define YL	r29
61#define	YS	r30
62#define INC2	r31
63
/* yl* receive values loaded from y; ys* hold the updated values that    */
/* are stored back.  f1 is skipped here because it is aliased to alpha.  */
/* yl5/ys5 are only used by the paths taken when Y is not aligned to     */
/* 2 * SIZE.                                                             */
64#define yl1 f0
65#define yl2 f2
66#define yl3 f3
67#define yl4 f4
68#define ys1 f5
69#define ys2 f6
70#define ys3 f7
71#define ys4 f8
72#define	yl5 f27
73#define ys5 f28
74
/* alpha1/alpha2 each receive two consecutive x entries (LFDUX followed  */
/* by LFSDUX into the same register) and are then multiplied by alpha.   */
75#define alpha1 f9
76#define alpha2 f10
77
/* a1..a16 receive the values loaded from the A columns.  The strided-Y  */
/* two-column loop additionally reuses a9..a12 as temporaries for y      */
/* elements.  f14 and above are saved to the stack by the prologue code. */
78#define a1     f11
79#define a2     f12
80#define a3     f13
81#define a4     f14
82#define a5     f15
83#define a6     f16
84#define a7     f17
85#define a8     f18
86
87#define a9     f19
88#define a10    f20
89#define a11    f21
90#define a12    f22
91#define a13    f23
92#define a14    f24
93#define a15    f25
94#define a16    f26
95
/* Scalar multiplier.  The routine applies "fsmfp alpha, alpha" once on  */
/* entry -- presumably to replicate the value into both halves of the    */
/* paired register; confirm against the FPU documentation.               */
96#define alpha  f1
97
98	PROLOGUE
99	PROFCODE
100
101	li	r0, -16
102	lwz	INCY,      8(SP)
103
104	stfpdux	f14, SP, r0
105	stfpdux	f15, SP, r0
106	stfpdux	f16, SP, r0
107	stfpdux	f17, SP, r0
108	stfpdux	f18, SP, r0
109	stfpdux	f19, SP, r0
110	stfpdux	f20, SP, r0
111	stfpdux	f21, SP, r0
112	stfpdux	f22, SP, r0
113	stfpdux	f23, SP, r0
114	stfpdux	f24, SP, r0
115	stfpdux	f25, SP, r0
116	stfpdux	f26, SP, r0
117	stfpdux	f27, SP, r0
118	stfpdux	f28, SP, r0
119	stfpdux	f29, SP, r0
120	stfpdux	f30, SP, r0
121	stfpdux	f31, SP, r0
122
123	stwu	r31,  -4(SP)
124	stwu	r30,  -4(SP)
125	stwu	r29,  -4(SP)
126	stwu	r28,  -4(SP)
127
128	stwu	r27,  -4(SP)
129	stwu	r26,  -4(SP)
130	stwu	r25,  -4(SP)
131	stwu	r24,  -4(SP)
132
133	stwu	r23,  -4(SP)
134	stwu	r22,  -4(SP)
135	stwu	r21,  -4(SP)
136	stwu	r20,  -4(SP)
137
138	stwu	r19,  -4(SP)
139	stwu	r18,  -4(SP)
140	stwu	r17,  -4(SP)
141	stwu	r16,  -4(SP)
142
143	slwi	LDA,  LDA,  BASE_SHIFT
144	slwi	INCX, INCX, BASE_SHIFT
145	slwi	INCY, INCY, BASE_SHIFT
146
147	fsmfp	alpha, alpha
148
149	cmpwi	cr0, M, 0
150	ble-	.L999
151	cmpwi	cr0, N, 0
152	ble-	.L999
153
154	add	INCY2, INCY, INCY
155	li	INC2, 2 * SIZE
156	sub	X, X, INCX
157
158	andi.	r0, A,  2 * SIZE - 1
159#	bne	.L100
160
161# All cases for aligned A, even LDA
162
163	cmpwi	cr0, INCY,  SIZE
164	bne	.L70
165
166	andi.	r0, Y,  2 * SIZE - 1
167	bne	.L40
168
169# A : aligned  LDA : even  Y : Unit Aligned
170
171	sub	A, A, INC2
172	sub	Y, Y, INCY2
173
174	srawi.	J, N, 2
175	ble	.L20
176	.align 4
177
178.L11:
179	LFDUX	alpha1, X, INCX
180	mr	A1, A
181	add	A2, A,  LDA
182	add	A3, A2, LDA
183	LFSDUX	alpha1, X, INCX
184	LFDUX	alpha2, X, INCX
185	add	A4, A3, LDA
186	add	A,  A4, LDA
187	mr	YL, Y
188	LFSDUX	alpha2, X, INCX
189	fpmul	alpha1, alpha, alpha1
190	mr	YS, Y
191	srawi.	r0,  M, 3
192	mtspr	CTR, r0
193	fpmul	alpha2, alpha, alpha2
194	ble	.L15
195
196	LFPDUX	yl1, YL, INCY2
197	LFPDUX	yl2, YL, INCY2
198	LFPDUX	yl3, YL, INCY2
199	LFPDUX	yl4, YL, INCY2
200
201	LFPDUX	a1,  A1, INC2
202	LFPDUX	a5,  A1, INC2
203	LFPDUX	a9,  A1, INC2
204	LFPDUX	a13, A1, INC2
205
206	LFPDUX	a2,  A2, INC2
207	LFPDUX	a6,  A2, INC2
208	LFPDUX	a10, A2, INC2
209	LFPDUX	a14, A2, INC2
210
211	LFPDUX	a3,  A3, INC2
212	LFPDUX	a7,  A3, INC2
213	LFPDUX	a11, A3, INC2
214	LFPDUX	a15, A3, INC2
215
216	LFPDUX	a4,  A4, INC2
217	fxcpmadd  ys1, alpha1, a1,  yl1
218	LFPDUX	a8,  A4, INC2
219	fxcpmadd  ys2, alpha1, a5,  yl2
220	LFPDUX	a12, A4, INC2
221	fxcpmadd  ys3, alpha1, a9,  yl3
222	LFPDUX	a16, A4, INC2
223	fxcpmadd  ys4, alpha1, a13, yl4
224	bdz	.L13
225	.align 4
226
227.L12:
228	LFPDUX	yl1, YL, INCY2
229
230	fxcsmadd  ys1, alpha1, a2,  ys1
231	LFPDUX	a1,  A1, INC2
232	fxcsmadd  ys2, alpha1, a6,  ys2
233	LFPDUX	a5,  A1, INC2
234	fxcsmadd  ys3, alpha1, a10, ys3
235	LFPDUX	a9,  A1, INC2
236	fxcsmadd  ys4, alpha1, a14, ys4
237	LFPDUX	a13, A1, INC2
238
239	LFPDUX	yl2, YL, INCY2
240
241	fxcpmadd  ys1, alpha2, a3,  ys1
242	LFPDUX	a2,  A2, INC2
243	fxcpmadd  ys2, alpha2, a7,  ys2
244	LFPDUX	a6,  A2, INC2
245	fxcpmadd  ys3, alpha2, a11, ys3
246	LFPDUX	a10, A2, INC2
247	fxcpmadd  ys4, alpha2, a15, ys4
248	LFPDUX	a14, A2, INC2
249
250	LFPDUX	yl3, YL, INCY2
251
252	fxcsmadd  ys1, alpha2, a4,  ys1
253	LFPDUX	a3,  A3, INC2
254	fxcsmadd  ys2, alpha2, a8,  ys2
255	LFPDUX	a7,  A3, INC2
256	fxcsmadd  ys3, alpha2, a12, ys3
257	LFPDUX	a11, A3, INC2
258	fxcsmadd  ys4, alpha2, a16, ys4
259	LFPDUX	a15, A3, INC2
260
261	LFPDUX	yl4, YL, INCY2
262
263	STFPDUX	ys1, YS, INCY2
264	STFPDUX	ys2, YS, INCY2
265	STFPDUX	ys3, YS, INCY2
266	STFPDUX	ys4, YS, INCY2
267
268	LFPDUX	a4,  A4, INC2
269	fxcpmadd  ys1, alpha1, a1,  yl1
270	LFPDUX	a8,  A4, INC2
271	fxcpmadd  ys2, alpha1, a5,  yl2
272	LFPDUX	a12, A4, INC2
273	fxcpmadd  ys3, alpha1, a9,  yl3
274	LFPDUX	a16, A4, INC2
275	fxcpmadd  ys4, alpha1, a13, yl4
276	bdnz	.L12
277	.align 4
278
279.L13:
280	fxcsmadd  ys1, alpha1, a2,  ys1
281	fxcsmadd  ys2, alpha1, a6,  ys2
282	fxcsmadd  ys3, alpha1, a10, ys3
283	fxcsmadd  ys4, alpha1, a14, ys4
284
285	fxcpmadd  ys1, alpha2, a3,  ys1
286	fxcpmadd  ys2, alpha2, a7,  ys2
287	fxcpmadd  ys3, alpha2, a11, ys3
288	fxcpmadd  ys4, alpha2, a15, ys4
289
290	fxcsmadd  ys1, alpha2, a4,  ys1
291	fxcsmadd  ys2, alpha2, a8,  ys2
292	fxcsmadd  ys3, alpha2, a12, ys3
293	fxcsmadd  ys4, alpha2, a16, ys4
294
295	STFPDUX	ys1, YS, INCY2
296	STFPDUX	ys2, YS, INCY2
297	STFPDUX	ys3, YS, INCY2
298	STFPDUX	ys4, YS, INCY2
299	.align 4
300
301.L15:
302	andi.	r0, M, 7
303	ble	.L19
304
305	andi.	r0, M, 4
306	ble	.L17
307
308	LFPDUX	yl1, YL, INCY2
309	LFPDUX	a1,  A1, INC2
310	LFPDUX	yl2, YL, INCY2
311	LFPDUX	a5,  A1, INC2
312
313	LFPDUX	a2,  A2, INC2
314	LFPDUX	a6,  A2, INC2
315	LFPDUX	a3,  A3, INC2
316	LFPDUX	a7,  A3, INC2
317
318	LFPDUX	a4,  A4, INC2
319	LFPDUX	a8,  A4, INC2
320
321	fxcpmadd  ys1, alpha1, a1, yl1
322	fxcpmadd  ys2, alpha1, a5, yl2
323	fxcsmadd  ys1, alpha1, a2, ys1
324	fxcsmadd  ys2, alpha1, a6, ys2
325
326	fxcpmadd  ys1, alpha2, a3, ys1
327	fxcpmadd  ys2, alpha2, a7, ys2
328	fxcsmadd  ys1, alpha2, a4, ys1
329	fxcsmadd  ys2, alpha2, a8, ys2
330
331	STFPDUX	ys1, YS, INCY2
332	STFPDUX	ys2, YS, INCY2
333	.align 4
334
335.L17:
336	andi.	r0, M, 2
337	ble	.L18
338
339	LFPDUX	yl1, YL, INCY2
340
341	LFPDUX	a1,  A1, INC2
342	LFPDUX	a2,  A2, INC2
343	LFPDUX	a3,  A3, INC2
344	LFPDUX	a4,  A4, INC2
345
346	fxcpmadd  ys1, alpha1, a1, yl1
347	fxcsmadd  ys1, alpha1, a2, ys1
348	fxcpmadd  ys1, alpha2, a3, ys1
349	fxcsmadd  ys1, alpha2, a4, ys1
350
351	STFPDUX	ys1, YS, INCY2
352	.align 4
353
354.L18:
355	andi.	r0, M, 1
356	ble	.L19
357
358	LFDUX	yl1, YL, INCY2
359
360	LFDUX	a1,  A1, INC2
361	LFDUX	a2,  A2, INC2
362	LFDUX	a3,  A3, INC2
363	LFDUX	a4,  A4, INC2
364
365	fxcpmadd  ys1, alpha1, a1, yl1
366	fxcsmadd  ys1, alpha1, a2, ys1
367	fxcpmadd  ys1, alpha2, a3, ys1
368	fxcsmadd  ys1, alpha2, a4, ys1
369
370	STFDUX	ys1, YS, INCY2
371	.align 4
372
373.L19:
374	addi	J, J, -1
375	cmpi	cr0, 0, J, 0
376	bgt	.L11
377	.align 4
378
379.L20:
380	andi.	J, N, 2
381	ble	.L30
382
383	LFDUX	alpha1, X, INCX
384
385	mr	A1, A
386	add	A2, A,  LDA
387	add	A,  A2, LDA
388	LFSDUX	alpha1, X, INCX
389
390	mr	YL, Y
391	mr	YS, Y
392	fpmul	alpha1, alpha, alpha1
393
394	srawi.	r0,  M, 3
395	mtspr	CTR, r0
396	ble	.L25
397
398	LFPDUX	yl1, YL, INCY2
399	LFPDUX	a1,  A1, INC2
400	LFPDUX	yl2, YL, INCY2
401	LFPDUX	a5,  A1, INC2
402
403	LFPDUX	yl3, YL, INCY2
404	LFPDUX	a9,  A1, INC2
405	LFPDUX	yl4, YL, INCY2
406	LFPDUX	a13, A1, INC2
407
408	LFPDUX	a2,  A2, INC2
409	LFPDUX	a6,  A2, INC2
410	LFPDUX	a10, A2, INC2
411	LFPDUX	a14, A2, INC2
412	bdz	.L23
413	.align 4
414
415.L22:
416	fxcpmadd  ys1, alpha1, a1,  yl1
417	LFPDUX	a1,  A1, INC2
418	LFPDUX	yl1, YL, INCY2
419	fxcpmadd  ys2, alpha1, a5,  yl2
420	LFPDUX	a5,  A1, INC2
421	LFPDUX	yl2, YL, INCY2
422	fxcpmadd  ys3, alpha1, a9,  yl3
423	LFPDUX	a9,  A1, INC2
424	LFPDUX	yl3, YL, INCY2
425	fxcpmadd  ys4, alpha1, a13, yl4
426	LFPDUX	a13, A1, INC2
427	LFPDUX	yl4, YL, INCY2
428
429	fxcsmadd  ys1, alpha1, a2,  ys1
430	LFPDUX	a2,  A2, INC2
431	fxcsmadd  ys2, alpha1, a6,  ys2
432	LFPDUX	a6,  A2, INC2
433	fxcsmadd  ys3, alpha1, a10, ys3
434	LFPDUX	a10, A2, INC2
435	fxcsmadd  ys4, alpha1, a14, ys4
436	LFPDUX	a14, A2, INC2
437
438	STFPDUX	ys1, YS, INCY2
439	STFPDUX	ys2, YS, INCY2
440	STFPDUX	ys3, YS, INCY2
441	STFPDUX	ys4, YS, INCY2
442	bdnz	.L22
443	.align 4
444
445.L23:
446	fxcpmadd  ys1, alpha1, a1,  yl1
447	fxcpmadd  ys2, alpha1, a5,  yl2
448	fxcpmadd  ys3, alpha1, a9,  yl3
449	fxcpmadd  ys4, alpha1, a13, yl4
450
451	fxcsmadd  ys1, alpha1, a2,  ys1
452	fxcsmadd  ys2, alpha1, a6,  ys2
453	fxcsmadd  ys3, alpha1, a10, ys3
454	fxcsmadd  ys4, alpha1, a14, ys4
455
456	STFPDUX	ys1, YS, INCY2
457	STFPDUX	ys2, YS, INCY2
458	STFPDUX	ys3, YS, INCY2
459	STFPDUX	ys4, YS, INCY2
460	.align 4
461
462.L25:
463	andi.	r0, M, 7
464	ble	.L30
465
466	andi.	r0, M, 4
467	ble	.L27
468
469	LFPDUX	yl1, YL, INCY2
470	LFPDUX	a1,  A1, INC2
471	LFPDUX	a2,  A2, INC2
472
473	LFPDUX	yl2, YL, INCY2
474	LFPDUX	a5,  A1, INC2
475	LFPDUX	a6,  A2, INC2
476
477	fxcpmadd  ys1, alpha1, a1, yl1
478	fxcsmadd  ys1, alpha1, a2, ys1
479	fxcpmadd  ys2, alpha1, a5, yl2
480	fxcsmadd  ys2, alpha1, a6, ys2
481
482	STFPDUX	ys1, YS, INCY2
483	STFPDUX	ys2, YS, INCY2
484	.align 4
485
486.L27:
487	andi.	r0, M, 2
488	ble	.L28
489
490	LFPDUX	yl1, YL, INCY2
491	LFPDUX	a1,  A1, INC2
492	LFPDUX	a2,  A2, INC2
493
494	fxcpmadd  ys1, alpha1, a1, yl1
495	fxcsmadd  ys1, alpha1, a2, ys1
496
497	STFPDUX	ys1, YS, INCY2
498	.align 4
499
500.L28:
501	andi.	r0, M, 1
502	ble	.L30
503
504	LFDUX	yl1, YL, INCY2
505	LFDUX	a1,  A1, INC2
506	LFDUX	a2,  A2, INC2
507
508	fxcpmadd  ys1, alpha1, a1, yl1
509	fxcsmadd  ys1, alpha1, a2, ys1
510
511	STFDUX	ys1, YS, INCY2
512	.align 4
513
514.L30:
515	andi.	J, N, 1
516	ble	.L999
517
518	LFDUX	alpha1, X, INCX
519
520	mr	A1, A
521	mr	YL, Y
522	mr	YS, Y
523	fmul	alpha1, alpha, alpha1
524
525	srawi.	r0,  M, 3
526	mtspr	CTR, r0
527	ble	.L35
528
529	LFPDUX	yl1, YL, INCY2
530	LFPDUX	a1,  A1, INC2
531	LFPDUX	yl2, YL, INCY2
532	LFPDUX	a5,  A1, INC2
533
534	LFPDUX	yl3, YL, INCY2
535	LFPDUX	a9,  A1, INC2
536	LFPDUX	yl4, YL, INCY2
537	LFPDUX	a13, A1, INC2
538	bdz	.L33
539	.align 4
540
541.L32:
542	fxcpmadd  ys1, alpha1, a1,  yl1
543	LFPDUX	yl1, YL, INCY2
544	LFPDUX	a1,  A1, INC2
545	fxcpmadd  ys2, alpha1, a5,  yl2
546	LFPDUX	yl2, YL, INCY2
547	LFPDUX	a5,  A1, INC2
548	fxcpmadd  ys3, alpha1, a9,  yl3
549	LFPDUX	yl3, YL, INCY2
550	LFPDUX	a9,  A1, INC2
551	fxcpmadd  ys4, alpha1, a13, yl4
552	LFPDUX	yl4, YL, INCY2
553	LFPDUX	a13, A1, INC2
554
555	STFPDUX	ys1, YS, INCY2
556	STFPDUX	ys2, YS, INCY2
557	STFPDUX	ys3, YS, INCY2
558	STFPDUX	ys4, YS, INCY2
559	bdnz	.L32
560	.align 4
561
562.L33:
563	fxcpmadd  ys1, alpha1, a1,  yl1
564	fxcpmadd  ys2, alpha1, a5,  yl2
565	fxcpmadd  ys3, alpha1, a9,  yl3
566	fxcpmadd  ys4, alpha1, a13, yl4
567
568	STFPDUX	ys1, YS, INCY2
569	STFPDUX	ys2, YS, INCY2
570	STFPDUX	ys3, YS, INCY2
571	STFPDUX	ys4, YS, INCY2
572	.align 4
573
574.L35:
575	andi.	r0, M, 7
576	ble	.L999
577
578	andi.	r0, M, 4
579	ble	.L37
580
581	LFPDUX	yl1, YL, INCY2
582	LFPDUX	a1,  A1, INC2
583
584	LFPDUX	yl2, YL, INCY2
585	LFPDUX	a5,  A1, INC2
586
587	fxcpmadd  ys1, alpha1, a1, yl1
588	fxcpmadd  ys2, alpha1, a5, yl2
589
590	STFPDUX	ys1, YS, INCY2
591	STFPDUX	ys2, YS, INCY2
592	.align 4
593
594.L37:
595	andi.	r0, M, 2
596	ble	.L38
597
598	LFPDUX	yl1, YL, INCY2
599	LFPDUX	a1,  A1, INC2
600
601	fxcpmadd  ys1, alpha1, a1, yl1
602
603	STFPDUX	ys1, YS, INCY2
604	.align 4
605
606.L38:
607	andi.	r0, M, 1
608	ble	.L999
609
610	LFDUX	yl1, YL, INCY2
611	LFDUX	a1,  A1, INC2
612
613	fxcpmadd  ys1, alpha1, a1, yl1
614
615	STFDUX	ys1, YS, INCY2
616	b	.L999
617	.align 4
618
619.L40:
620# A : aligned  LDA : even  Y : Unaligned
621
622	sub	A, A, INC2
623	sub	Y, Y, INCY
624
625	srawi.	J, N, 2
626	ble	.L50
627	.align 4
628
629.L41:
630	LFDUX	alpha1, X, INCX
631	LFSDUX	alpha1, X, INCX
632	LFDUX	alpha2, X, INCX
633	LFSDUX	alpha2, X, INCX
634
635	fpmul	alpha1, alpha, alpha1
636	fpmul	alpha2, alpha, alpha2
637
638	mr	A1, A
639	add	A2, A,  LDA
640	add	A3, A2, LDA
641	add	A4, A3, LDA
642	add	A,  A4, LDA
643
644	mr	YL, Y
645	sub	YS, Y, INCY2
646
647	LFSDX	ys1, YS, INCY2
648	LFDX	yl1, YL, INCY
649
650	srawi.	r0,  M, 3
651	mtspr	CTR, r0
652	ble	.L45
653
654	LFPDUX	a1,  A1, INC2
655	LFPDUX	a5,  A1, INC2
656	LFPDUX	a9,  A1, INC2
657	LFPDUX	a13, A1, INC2
658
659	LFXDUX	yl2, YL, INCY2
660	LFXDUX	yl3, YL, INCY2
661	LFXDUX	yl4, YL, INCY2
662	LFXDUX	yl5, YL, INCY2
663
664	LFPDUX	a2,  A2, INC2
665	LFPDUX	a6,  A2, INC2
666	LFPDUX	a10, A2, INC2
667	LFPDUX	a14, A2, INC2
668
669	LFPDUX	a3,  A3, INC2
670	LFPDUX	a7,  A3, INC2
671	LFPDUX	a11, A3, INC2
672	LFPDUX	a15, A3, INC2
673
674	LFPDUX	a4,  A4, INC2
675	fsmr	yl1, yl2
676	LFPDUX	a8,  A4, INC2
677	fsmr	yl2, yl3
678	LFPDUX	a12, A4, INC2
679	fsmr	yl3, yl4
680	LFPDUX	a16, A4, INC2
681	fsmr	yl4, yl5
682	bdz	.L43
683	.align 4
684
685.L42:
686	fxcpmadd  ys2, alpha1, a1,  yl1
687	LFPDUX	a1,  A1, INC2
688	fxcpmadd  ys3, alpha1, a5,  yl2
689	LFPDUX	a5,  A1, INC2
690	fxcpmadd  ys4, alpha1, a9,  yl3
691	LFPDUX	a9,  A1, INC2
692	fxcpmadd  ys5, alpha1, a13, yl4
693	LFPDUX	a13, A1, INC2
694
695	fxcsmadd  ys2, alpha1, a2,  ys2
696	LFPDUX	a2,  A2, INC2
697	fxcsmadd  ys3, alpha1, a6,  ys3
698	LFPDUX	a6,  A2, INC2
699	fxcsmadd  ys4, alpha1, a10, ys4
700	LFPDUX	a10, A2, INC2
701	fxcsmadd  ys5, alpha1, a14, ys5
702	LFPDUX	a14, A2, INC2
703
704	fxcpmadd  ys2, alpha2, a3,  ys2
705	LFPDUX	a3,  A3, INC2
706	fxcpmadd  ys3, alpha2, a7,  ys3
707	LFPDUX	a7,  A3, INC2
708	fxcpmadd  ys4, alpha2, a11, ys4
709	LFPDUX	a11, A3, INC2
710	fxcpmadd  ys5, alpha2, a15, ys5
711	LFPDUX	a15, A3, INC2
712
713	fxcsmadd  ys2, alpha2, a4,  ys2
714	LFPDUX	a4,  A4, INC2
715	fxcsmadd  ys3, alpha2, a8,  ys3
716	LFPDUX	a8,  A4, INC2
717	fxcsmadd  ys4, alpha2, a12, ys4
718	LFPDUX	a12, A4, INC2
719	fxcsmadd  ys5, alpha2, a16, ys5
720	LFPDUX	a16, A4, INC2
721
722	fmr	yl1, yl5
723	LFXDUX	yl2, YL, INCY2
724	fmr	ys1, ys2
725	LFXDUX	yl3, YL, INCY2
726	fmr	ys2, ys3
727	LFXDUX	yl4, YL, INCY2
728	fmr	ys3, ys4
729	LFXDUX	yl5, YL, INCY2
730	fmr	ys4, ys5
731
732	STFXDUX	ys1, YS, INCY2
733	fsmr	ys1, ys5
734	STFXDUX	ys2, YS, INCY2
735	fsmr	yl1, yl2
736	STFXDUX	ys3, YS, INCY2
737	fsmr	yl2, yl3
738	STFXDUX	ys4, YS, INCY2
739	fsmr	yl3, yl4
740
741	fsmr	yl4, yl5
742	bdnz	.L42
743	.align 4
744
745.L43:
746	fxcpmadd  ys2, alpha1, a1,  yl1
747	fxcpmadd  ys3, alpha1, a5,  yl2
748	fxcpmadd  ys4, alpha1, a9,  yl3
749	fxcpmadd  ys5, alpha1, a13, yl4
750
751	fxcsmadd  ys2, alpha1, a2,  ys2
752	fxcsmadd  ys3, alpha1, a6,  ys3
753	fxcsmadd  ys4, alpha1, a10, ys4
754	fxcsmadd  ys5, alpha1, a14, ys5
755
756	fxcpmadd  ys2, alpha2, a3,  ys2
757	fxcpmadd  ys3, alpha2, a7,  ys3
758	fxcpmadd  ys4, alpha2, a11, ys4
759	fxcpmadd  ys5, alpha2, a15, ys5
760
761	fxcsmadd  ys2, alpha2, a4,  ys2
762	fxcsmadd  ys3, alpha2, a8,  ys3
763	fxcsmadd  ys4, alpha2, a12, ys4
764	fxcsmadd  ys5, alpha2, a16, ys5
765
766	fmr	ys1, ys2
767	fmr	ys2, ys3
768	fmr	ys3, ys4
769	fmr	ys4, ys5
770	fmr	yl1, yl5
771
772	STFXDUX	ys1, YS, INCY2
773	fsmr	ys1, ys5
774	STFXDUX	ys2, YS, INCY2
775	STFXDUX	ys3, YS, INCY2
776	STFXDUX	ys4, YS, INCY2
777	.align 4
778
779.L45:
780	andi.	r0, M, 7
781	ble	.L48
782
783	andi.	r0, M, 4
784	ble	.L46
785
786	LFXDUX	yl2, YL, INCY2
787	LFXDUX	yl3, YL, INCY2
788
789	LFPDUX	a1,  A1, INC2
790	LFPDUX	a5,  A1, INC2
791
792	LFPDUX	a2,  A2, INC2
793	LFPDUX	a6,  A2, INC2
794	LFPDUX	a3,  A3, INC2
795	LFPDUX	a7,  A3, INC2
796
797	LFPDUX	a4,  A4, INC2
798	fsmr	yl1, yl2
799	LFPDUX	a8,  A4, INC2
800	fsmr	yl2, yl3
801
802	fxcpmadd  ys2, alpha1, a1, yl1
803	fxcpmadd  ys3, alpha1, a5, yl2
804	fxcsmadd  ys2, alpha1, a2, ys2
805	fxcsmadd  ys3, alpha1, a6, ys3
806
807	fxcpmadd  ys2, alpha2, a3, ys2
808	fxcpmadd  ys3, alpha2, a7, ys3
809	fxcsmadd  ys2, alpha2, a4, ys2
810	fxcsmadd  ys3, alpha2, a8, ys3
811
812	fmr	yl1, yl3
813	fmr	ys1, ys2
814	fmr	ys2, ys3
815
816	STFXDUX	ys1, YS, INCY2
817	fsmr	ys1, ys3
818	STFXDUX	ys2, YS, INCY2
819	.align 4
820
821.L46:
822	andi.	r0, M, 2
823	ble	.L47
824
825	LFXDUX	yl2, YL, INCY2
826
827	LFPDUX	a1,  A1, INC2
828	LFPDUX	a2,  A2, INC2
829	LFPDUX	a3,  A3, INC2
830	LFPDUX	a4,  A4, INC2
831
832	fsmr	yl1, yl2
833	fxcpmadd  ys2, alpha1, a1, yl1
834	fxcsmadd  ys2, alpha1, a2, ys2
835	fxcpmadd  ys2, alpha2, a3, ys2
836	fxcsmadd  ys2, alpha2, a4, ys2
837	fmr	yl1, yl2
838
839	fmr	ys1, ys2
840	STFXDUX	ys1, YS, INCY2
841	fsmr	ys1, ys2
842	.align 4
843
844.L47:
845	andi.	r0, M, 1
846	ble	.L48
847
848	LFDUX	a1,  A1, INC2
849	LFDUX	a2,  A2, INC2
850	LFDUX	a3,  A3, INC2
851	LFDUX	a4,  A4, INC2
852
853	fxcpmadd  ys2, alpha1, a1, yl1
854	fxcsmadd  ys2, alpha1, a2, ys2
855	fxcpmadd  ys2, alpha2, a3, ys2
856	fxcsmadd  ys2, alpha2, a4, ys2
857
858	STFSDX	ys1, YS, INCY2
859	add	YS, YS, INCY
860	STFDX	ys2, YS, INCY2
861	b	.L49
862	.align 4
863
864.L48:
865	STFSDUX	ys1, YS, INCY2
866	.align 4
867
868.L49:
869	addi	J, J, -1
870	cmpi	cr0, 0, J, 0
871	bgt	.L41
872	.align 4
873
874.L50:
875	andi.	J, N, 2
876	ble	.L60
877
878	LFDUX	alpha1, X, INCX
879
880	mr	A1, A
881	add	A2, A,  LDA
882	add	A,  A2, LDA
883	LFSDUX	alpha1, X, INCX
884
885	mr	YL, Y
886	sub	YS, Y, INCY2
887	fpmul	alpha1, alpha, alpha1
888
889	LFSDX	ys1, YS, INCY2
890	LFDX	yl1, YL, INCY
891
892	srawi.	r0,  M, 3
893	mtspr	CTR, r0
894	ble	.L55
895
896	LFPDUX	a1,  A1, INC2
897	LFPDUX	a5,  A1, INC2
898	LFPDUX	a9,  A1, INC2
899	LFPDUX	a13, A1, INC2
900
901	LFXDUX	yl2, YL, INCY2
902	LFXDUX	yl3, YL, INCY2
903 	LFXDUX	yl4, YL, INCY2
904	LFXDUX	yl5, YL, INCY2
905
906	LFPDUX	a2,  A2, INC2
907	fsmr	yl1, yl2
908	LFPDUX	a6,  A2, INC2
909	fsmr	yl2, yl3
910	LFPDUX	a10, A2, INC2
911	fsmr	yl3, yl4
912	LFPDUX	a14, A2, INC2
913	fsmr	yl4, yl5
914	bdz	.L53
915	.align 4
916
917.L52:
918	fxcpmadd  ys2, alpha1, a1,  yl1
919	LFPDUX	a1,  A1, INC2
920	fxcpmadd  ys3, alpha1, a5,  yl2
921	LFPDUX	a5,  A1, INC2
922	fxcpmadd  ys4, alpha1, a9,  yl3
923	LFPDUX	a9,  A1, INC2
924	fxcpmadd  ys5, alpha1, a13, yl4
925	LFPDUX	a13, A1, INC2
926
927	fxcsmadd  ys2, alpha1, a2,  ys2
928	LFPDUX	a2,  A2, INC2
929	fxcsmadd  ys3, alpha1, a6,  ys3
930	LFPDUX	a6,  A2, INC2
931	fxcsmadd  ys4, alpha1, a10, ys4
932	LFPDUX	a10, A2, INC2
933	fxcsmadd  ys5, alpha1, a14, ys5
934	LFPDUX	a14, A2, INC2
935
936	fmr	yl1, yl5
937	LFXDUX	yl2, YL, INCY2
938	fmr	ys1, ys2
939	LFXDUX	yl3, YL, INCY2
940	fmr	ys2, ys3
941 	LFXDUX	yl4, YL, INCY2
942	fmr	ys3, ys4
943	LFXDUX	yl5, YL, INCY2
944	fmr	ys4, ys5
945
946	STFXDUX	ys1, YS, INCY2
947	fsmr	ys1, ys5
948	STFXDUX	ys2, YS, INCY2
949	fsmr	yl1, yl2
950	STFXDUX	ys3, YS, INCY2
951	fsmr	yl2, yl3
952	STFXDUX	ys4, YS, INCY2
953	fsmr	yl3, yl4
954
955	fsmr	yl4, yl5
956	bdnz	.L52
957	.align 4
958
959.L53:
960	fxcpmadd  ys2, alpha1, a1,  yl1
961	fxcpmadd  ys3, alpha1, a5,  yl2
962	fxcpmadd  ys4, alpha1, a9,  yl3
963	fxcpmadd  ys5, alpha1, a13, yl4
964
965	fxcsmadd  ys2, alpha1, a2,  ys2
966	fxcsmadd  ys3, alpha1, a6,  ys3
967	fxcsmadd  ys4, alpha1, a10, ys4
968	fxcsmadd  ys5, alpha1, a14, ys5
969
970	fmr	yl1, yl5
971	fmr	ys1, ys2
972	fmr	ys2, ys3
973	fmr	ys3, ys4
974	fmr	ys4, ys5
975
976	STFXDUX	ys1, YS, INCY2
977	fsmr	ys1, ys5
978	STFXDUX	ys2, YS, INCY2
979	STFXDUX	ys3, YS, INCY2
980	STFXDUX	ys4, YS, INCY2
981	.align 4
982
983.L55:
984	andi.	r0, M, 7
985	ble	.L59
986
987	andi.	r0, M, 4
988	ble	.L57
989
990	LFXDUX	yl2, YL, INCY2
991	LFXDUX	yl3, YL, INCY2
992
993	LFPDUX	a1,  A1, INC2
994	LFPDUX	a2,  A2, INC2
995
996	LFPDUX	a5,  A1, INC2
997	LFPDUX	a6,  A2, INC2
998
999	fsmr	yl1, yl2
1000	fsmr	yl2, yl3
1001
1002	fxcpmadd  ys2, alpha1, a1, yl1
1003	fxcsmadd  ys2, alpha1, a2, ys2
1004	fxcpmadd  ys3, alpha1, a5, yl2
1005	fxcsmadd  ys3, alpha1, a6, ys3
1006
1007	fmr	yl1, yl3
1008	fmr	ys1, ys2
1009	fmr	ys2, ys3
1010
1011	STFXDUX	ys1, YS, INCY2
1012	STFXDUX	ys2, YS, INCY2
1013	fsmr	  ys1, ys3
1014	.align 4
1015
1016.L57:
1017	andi.	r0, M, 2
1018	ble	.L58
1019
1020	LFXDUX	yl2, YL, INCY2
1021	LFPDUX	a1,  A1, INC2
1022	LFPDUX	a2,  A2, INC2
1023
1024	fsmr	yl1, yl2
1025	fxcpmadd  ys2, alpha1, a1, yl1
1026	fxcsmadd  ys2, alpha1, a2, ys2
1027	fmr	yl1, yl2
1028
1029	fmr	ys1, ys2
1030	STFXDUX	ys1, YS, INCY2
1031	fsmr	ys1, ys2
1032	.align 4
1033
1034.L58:
1035	andi.	r0, M, 1
1036	ble	.L59
1037
1038	LFDUX	a1,  A1, INC2
1039	LFDUX	a2,  A2, INC2
1040
1041	fxmr	alpha2, alpha1
1042	fmadd	ys1, alpha1, a1, yl1
1043	fmadd	ys1, alpha2, a2, ys1
1044
1045	STFXDUX	ys1, YS, INCY2
1046	b	.L60
1047	.align 4
1048
1049.L59:
1050	STFSDUX	ys1, YS, INCY2
1051	.align 4
1052
1053.L60:
1054	andi.	J, N, 1
1055	ble	.L999
1056
1057	LFDUX	alpha1, X, INCX
1058	mr	A1, A
1059
1060	mr	YL, Y
1061	sub	YS, Y, INCY2
1062
1063	fmul	alpha1, alpha, alpha1
1064
1065	LFSDX	ys1, YS, INCY2
1066	LFDX	yl1, YL, INCY
1067
1068	srawi.	r0,  M, 3
1069	mtspr	CTR, r0
1070	ble	.L65
1071
1072	LFXDUX	yl2, YL, INCY2
1073	LFXDUX	yl3, YL, INCY2
1074	LFXDUX	yl4, YL, INCY2
1075	LFXDUX	yl5, YL, INCY2
1076
1077	LFPDUX	a1,  A1, INC2
1078	LFPDUX	a5,  A1, INC2
1079	LFPDUX	a9,  A1, INC2
1080	LFPDUX	a13, A1, INC2
1081
1082	fsmr	yl1, yl2
1083	fsmr	yl2, yl3
1084	fsmr	yl3, yl4
1085	fsmr	yl4, yl5
1086	bdz	.L63
1087	.align 4
1088
1089.L62:
1090	fxcpmadd  ys2, alpha1, a1,  yl1
1091	LFPDUX	a1,  A1, INC2
1092	fxcpmadd  ys3, alpha1, a5,  yl2
1093	LFXDUX	yl2, YL, INCY2
1094	fxcpmadd  ys4, alpha1, a9,  yl3
1095	LFXDUX	yl3, YL, INCY2
1096	fxcpmadd  ys5, alpha1, a13, yl4
1097	LFXDUX	yl4, YL, INCY2
1098
1099	fmr	yl1, yl5
1100	LFXDUX	yl5, YL, INCY2
1101	fmr	ys1, ys2
1102	LFPDUX	a5,  A1, INC2
1103	fmr	ys2, ys3
1104	LFPDUX	a9,  A1, INC2
1105	fmr	ys3, ys4
1106	LFPDUX	a13, A1, INC2
1107	fmr	ys4, ys5
1108
1109	STFXDUX	ys1, YS, INCY2
1110	fsmr	ys1, ys5
1111	STFXDUX	ys2, YS, INCY2
1112	fsmr	yl1, yl2
1113	STFXDUX	ys3, YS, INCY2
1114	fsmr	yl2, yl3
1115	STFXDUX	ys4, YS, INCY2
1116	fsmr	yl3, yl4
1117
1118	fsmr	yl4, yl5
1119	bdnz	.L62
1120	.align 4
1121
1122.L63:
1123	fxcpmadd  ys2, alpha1, a1,  yl1
1124	fxcpmadd  ys3, alpha1, a5,  yl2
1125	fxcpmadd  ys4, alpha1, a9,  yl3
1126	fxcpmadd  ys5, alpha1, a13, yl4
1127
1128	fmr	yl1, yl5
1129	fmr	ys1, ys2
1130	fmr	ys2, ys3
1131	fmr	ys3, ys4
1132	fmr	ys4, ys5
1133
1134	STFXDUX	ys1, YS, INCY2
1135	fsmr	ys1, ys5
1136	STFXDUX	ys2, YS, INCY2
1137	STFXDUX	ys3, YS, INCY2
1138	STFXDUX	ys4, YS, INCY2
1139	.align 4
1140
1141.L65:
1142	andi.	r0, M, 7
1143	ble	.L69
1144
1145	andi.	r0, M, 4
1146	ble	.L67
1147
1148	LFXDUX	yl2, YL, INCY2
1149	LFXDUX	yl3, YL, INCY2
1150
1151	LFPDUX	a1,  A1, INC2
1152	LFPDUX	a5,  A1, INC2
1153
1154	fsmr	yl1, yl2
1155	fsmr	yl2, yl3
1156
1157	fxcpmadd  ys2, alpha1, a1, yl1
1158	fxcpmadd  ys3, alpha1, a5, yl2
1159
1160	fmr	yl1, yl3
1161	fmr	ys1, ys2
1162	fmr	ys2, ys3
1163
1164	STFXDUX	ys1, YS, INCY2
1165	fsmr	  ys1, ys3
1166	STFXDUX	ys2, YS, INCY2
1167	.align 4
1168
1169.L67:
1170	andi.	r0, M, 2
1171	ble	.L68
1172
1173	LFPDUX	a1,  A1, INC2
1174	LFXDUX	yl2, YL, INCY2
1175
1176	fsmr	yl1, yl2
1177	fxcpmadd  ys2, alpha1, a1, yl1
1178	fmr	yl1, yl2
1179	fmr	ys1, ys2
1180	STFXDUX	ys1, YS, INCY2
1181	fsmr	ys1, ys2
1182	.align 4
1183
1184.L68:
1185	andi.	r0, M, 1
1186	ble	.L69
1187
1188	LFDUX	a1,  A1, INC2
1189	fmadd  ys1, alpha1, a1, yl1
1190	STFXDUX	ys1, YS, INCY2
1191	b	.L999
1192	.align 4
1193
1194.L69:
1195	STFSDUX	ys1, YS, INCY2
1196	b	.L999
1197	.align 4
1198
1199.L70:
1200	sub	A, A, INC2
1201	sub	Y, Y, INCY
1202	srawi.	J, N, 2
1203	ble	.L80
1204	.align 4
1205
1206.L71:
1207	LFDUX	alpha1, X, INCX
1208	mr	A1, A
1209	add	A2, A,  LDA
1210	add	A3, A2, LDA
1211	LFSDUX	alpha1, X, INCX
1212	LFDUX	alpha2, X, INCX
1213	add	A4, A3, LDA
1214	add	A,  A4, LDA
1215	mr	YL, Y
1216	LFSDUX	alpha2, X, INCX
1217	fpmul	alpha1, alpha, alpha1
1218	mr	YS, Y
1219	srawi.	r0,  M, 3
1220	mtspr	CTR, r0
1221	fpmul	alpha2, alpha, alpha2
1222	ble	.L75
1223
1224	LFDUX	yl1, YL, INCY
1225	LFPDUX	a1,  A1, INC2
1226	LFPDUX	a5,  A1, INC2
1227	LFPDUX	a9,  A1, INC2
1228	LFPDUX	a13, A1, INC2
1229	LFSDUX	yl1, YL, INCY
1230
1231	LFDUX	yl2, YL, INCY
1232	LFPDUX	a2,  A2, INC2
1233	LFPDUX	a6,  A2, INC2
1234	LFPDUX	a10, A2, INC2
1235	LFPDUX	a14, A2, INC2
1236	LFSDUX	yl2, YL, INCY
1237
1238	LFDUX	yl3, YL, INCY
1239	LFPDUX	a3,  A3, INC2
1240	LFPDUX	a7,  A3, INC2
1241	LFPDUX	a11, A3, INC2
1242	LFPDUX	a15, A3, INC2
1243	LFSDUX	yl3, YL, INCY
1244
1245	LFDUX	yl4, YL, INCY
1246	LFPDUX	a4,  A4, INC2
1247	LFPDUX	a8,  A4, INC2
1248	LFPDUX	a12, A4, INC2
1249	LFPDUX	a16, A4, INC2
1250	LFSDUX	yl4, YL, INCY
1251	bdz	.L73
1252	.align 4
1253
1254.L72:
1255	fxcpmadd  ys1, alpha1, a1,  yl1
1256	LFPDUX	a1,  A1, INC2
1257	LFDUX	yl1, YL, INCY
1258	fxcpmadd  ys2, alpha1, a5,  yl2
1259	LFPDUX	a5,  A1, INC2
1260	fxcpmadd  ys3, alpha1, a9,  yl3
1261	LFPDUX	a9,  A1, INC2
1262	fxcpmadd  ys4, alpha1, a13, yl4
1263	LFPDUX	a13, A1, INC2
1264	LFSDUX	yl1, YL, INCY
1265
1266	fxcsmadd  ys1, alpha1, a2,  ys1
1267	LFPDUX	a2,  A2, INC2
1268	LFDUX	yl2, YL, INCY
1269	fxcsmadd  ys2, alpha1, a6,  ys2
1270	LFPDUX	a6,  A2, INC2
1271	fxcsmadd  ys3, alpha1, a10, ys3
1272	LFPDUX	a10, A2, INC2
1273	fxcsmadd  ys4, alpha1, a14, ys4
1274	LFPDUX	a14, A2, INC2
1275	LFSDUX	yl2, YL, INCY
1276
1277	fxcpmadd  ys1, alpha2, a3,  ys1
1278	LFPDUX	a3,  A3, INC2
1279	LFDUX	yl3, YL, INCY
1280	fxcpmadd  ys2, alpha2, a7,  ys2
1281	LFPDUX	a7,  A3, INC2
1282	fxcpmadd  ys3, alpha2, a11, ys3
1283	LFPDUX	a11, A3, INC2
1284	fxcpmadd  ys4, alpha2, a15, ys4
1285	LFPDUX	a15, A3, INC2
1286	LFSDUX	yl3, YL, INCY
1287
1288	fxcsmadd  ys1, alpha2, a4,  ys1
1289	LFPDUX	a4,  A4, INC2
1290	LFDUX	yl4, YL, INCY
1291	fxcsmadd  ys2, alpha2, a8,  ys2
1292	LFPDUX	a8,  A4, INC2
1293	fxcsmadd  ys3, alpha2, a12, ys3
1294	LFPDUX	a12, A4, INC2
1295	fxcsmadd  ys4, alpha2, a16, ys4
1296	LFPDUX	a16, A4, INC2
1297	LFSDUX	yl4, YL, INCY
1298
1299	STFDUX	ys1, YS, INCY
1300	STFSDUX	ys1, YS, INCY
1301	STFDUX	ys2, YS, INCY
1302	STFSDUX	ys2, YS, INCY
1303	STFDUX	ys3, YS, INCY
1304	STFSDUX	ys3, YS, INCY
1305	STFDUX	ys4, YS, INCY
1306	STFSDUX	ys4, YS, INCY
1307	bdnz	.L72
1308	.align 4
1309
1310.L73:
1311	fxcpmadd  ys1, alpha1, a1,  yl1
1312	fxcpmadd  ys2, alpha1, a5,  yl2
1313	fxcpmadd  ys3, alpha1, a9,  yl3
1314	fxcpmadd  ys4, alpha1, a13, yl4
1315
1316	fxcsmadd  ys1, alpha1, a2,  ys1
1317	fxcsmadd  ys2, alpha1, a6,  ys2
1318	fxcsmadd  ys3, alpha1, a10, ys3
1319	fxcsmadd  ys4, alpha1, a14, ys4
1320
1321	fxcpmadd  ys1, alpha2, a3,  ys1
1322	fxcpmadd  ys2, alpha2, a7,  ys2
1323	fxcpmadd  ys3, alpha2, a11, ys3
1324	fxcpmadd  ys4, alpha2, a15, ys4
1325
1326	fxcsmadd  ys1, alpha2, a4,  ys1
1327	fxcsmadd  ys2, alpha2, a8,  ys2
1328	fxcsmadd  ys3, alpha2, a12, ys3
1329	fxcsmadd  ys4, alpha2, a16, ys4
1330
1331	STFDUX	ys1, YS, INCY
1332	STFSDUX	ys1, YS, INCY
1333	STFDUX	ys2, YS, INCY
1334	STFSDUX	ys2, YS, INCY
1335	STFDUX	ys3, YS, INCY
1336	STFSDUX	ys3, YS, INCY
1337	STFDUX	ys4, YS, INCY
1338	STFSDUX	ys4, YS, INCY
1339	.align 4
1340
1341.L75:
1342	andi.	r0, M, 7
1343	ble	.L79
1344
1345	andi.	r0, M, 4
1346	ble	.L77
1347
1348	LFDUX	yl1, YL, INCY
1349	LFPDUX	a1,  A1, INC2
1350	LFPDUX	a5,  A1, INC2
1351	LFSDUX	yl1, YL, INCY
1352	LFPDUX	a2,  A2, INC2
1353	LFPDUX	a6,  A2, INC2
1354
1355	LFDUX	yl2, YL, INCY
1356	LFPDUX	a3,  A3, INC2
1357	LFPDUX	a7,  A3, INC2
1358	LFSDUX	yl2, YL, INCY
1359	LFPDUX	a4,  A4, INC2
1360	LFPDUX	a8,  A4, INC2
1361
1362	fxcpmadd  ys1, alpha1, a1, yl1
1363	fxcpmadd  ys2, alpha1, a5, yl2
1364	fxcsmadd  ys1, alpha1, a2, ys1
1365	fxcsmadd  ys2, alpha1, a6, ys2
1366
1367	fxcpmadd  ys1, alpha2, a3, ys1
1368	fxcpmadd  ys2, alpha2, a7, ys2
1369	fxcsmadd  ys1, alpha2, a4, ys1
1370	fxcsmadd  ys2, alpha2, a8, ys2
1371
1372	STFDUX	ys1, YS, INCY
1373	STFSDUX	ys1, YS, INCY
1374	STFDUX	ys2, YS, INCY
1375	STFSDUX	ys2, YS, INCY
1376	.align 4
1377
1378.L77:
1379	andi.	r0, M, 2
1380	ble	.L78
1381
1382	LFDUX	yl1, YL, INCY
1383	LFPDUX	a1,  A1, INC2
1384	LFPDUX	a2,  A2, INC2
1385	LFSDUX	yl1, YL, INCY
1386	LFPDUX	a3,  A3, INC2
1387	LFPDUX	a4,  A4, INC2
1388
1389	fxcpmadd  ys1, alpha1, a1, yl1
1390	fxcsmadd  ys1, alpha1, a2, ys1
1391	fxcpmadd  ys1, alpha2, a3, ys1
1392	fxcsmadd  ys1, alpha2, a4, ys1
1393
1394	STFDUX	ys1, YS, INCY
1395	STFSDUX	ys1, YS, INCY
1396	.align 4
1397
1398.L78:
1399	andi.	r0, M, 1
1400	ble	.L79
1401
1402	LFDUX	yl1, YL, INCY
1403
1404	LFDUX	a1,  A1, INC2
1405	LFDUX	a2,  A2, INC2
1406	LFDUX	a3,  A3, INC2
1407	LFDUX	a4,  A4, INC2
1408
1409	fxcpmadd  ys1, alpha1, a1, yl1
1410	fxcsmadd  ys1, alpha1, a2, ys1
1411	fxcpmadd  ys1, alpha2, a3, ys1
1412	fxcsmadd  ys1, alpha2, a4, ys1
1413
1414	STFDUX	ys1, YS, INCY
1415	.align 4
1416
1417.L79:
1418	addi	J, J, -1
1419	cmpi	cr0, 0, J, 0
1420	bgt	.L71
1421	.align 4
1422
1423.L80:
1424	andi.	J, N, 2
1425	ble	.L90
1426
1427	LFDUX	alpha1, X, INCX
1428
1429	mr	A1, A
1430	add	A2, A,  LDA
1431	add	A,  A2, LDA
1432	LFSDUX	alpha1, X, INCX
1433
1434	mr	YL, Y
1435	mr	YS, Y
1436	fpmul	alpha1, alpha, alpha1
1437
1438	srawi.	r0,  M, 3
1439	mtspr	CTR, r0
1440	ble	.L85
1441
1442	LFDUX	yl1, YL, INCY
1443	LFDUX	a9,  YL, INCY
1444	LFDUX	yl2, YL, INCY
1445	LFDUX	a10, YL, INCY
1446
1447	LFPDUX	a1,  A1, INC2
1448	LFPDUX	a5,  A1, INC2
1449	LFPDUX	a3,  A1, INC2
1450	LFPDUX	a7,  A1, INC2
1451
1452	LFDUX	yl3, YL, INCY
1453	LFDUX	a11, YL, INCY
1454	LFDUX	yl4, YL, INCY
1455	LFDUX	a12, YL, INCY
1456
1457	LFPDUX	a2,  A2, INC2
1458	LFPDUX	a6,  A2, INC2
1459	LFPDUX	a4,  A2, INC2
1460	LFPDUX	a8,  A2, INC2
1461
1462	bdz	.L83
1463	.align 4
1464
/*
 * .L82: steady-state body of the two-column loop.  Each trip finishes the
 * eight results whose operands were loaded on the previous trip (or in the
 * preload at .L80) and issues the loads for the next trip.  Instruction
 * order is hand-interleaved; each register is reloaded only after the
 * multiply-add that consumes its old contents.
 */
1465.L82:
	/* merge each separately loaded pair (presumably a9's primary half goes
	   into yl1's secondary half, and so on -- TODO confirm fsmfp semantics) */
1466	fsmfp	yl1, a9
1467	fsmfp	yl2, a10
1468	fsmfp	yl3, a11
1469	fsmfp	yl4, a12
1470
	/* first-column contribution: ysN = ylN + alpha1 * (A1 data) */
1471	fxcpmadd  ys1, alpha1, a1,  yl1
1472	LFDUX	yl1, YL, INCY
1473	LFDUX	a9,  YL, INCY
1474	LFPDUX	a1,  A1, INC2
1475	fxcpmadd  ys2, alpha1, a5,  yl2
1476	LFDUX	yl2, YL, INCY
1477	LFDUX	a10, YL, INCY
1478	LFPDUX	a5,  A1, INC2
1479	fxcpmadd  ys3, alpha1, a3,  yl3
1480	LFDUX	yl3, YL, INCY
1481	LFDUX	a11, YL, INCY
1482	LFPDUX	a3,  A1, INC2
1483	fxcpmadd  ys4, alpha1, a7,  yl4
1484	LFDUX	yl4, YL, INCY
1485	LFDUX	a12, YL, INCY
1486	LFPDUX	a7,  A1, INC2
1487
	/* second-column contribution accumulated onto the same ysN */
1488	fxcsmadd  ys1, alpha1, a2,  ys1
1489	LFPDUX	a2,  A2, INC2
1490	fxcsmadd  ys2, alpha1, a6,  ys2
1491	LFPDUX	a6,  A2, INC2
1492	fxcsmadd  ys3, alpha1, a4,  ys3
1493	LFPDUX	a4,  A2, INC2
1494	fxcsmadd  ys4, alpha1, a8,  ys4
1495	LFPDUX	a8,  A2, INC2
1496
	/* eight stores through YS: one STFDUX/STFSDUX pair per result register */
1497	STFDUX	ys1, YS, INCY
1498	STFSDUX	ys1, YS, INCY
1499	STFDUX	ys2, YS, INCY
1500	STFSDUX	ys2, YS, INCY
1501
1502	STFDUX	ys3, YS, INCY
1503	STFSDUX	ys3, YS, INCY
1504	STFDUX	ys4, YS, INCY
1505	STFSDUX	ys4, YS, INCY
	/* loop while CTR, after decrement, is non-zero */
1506	bdnz	.L82
1507	.align 4
1508
/*
 * .L83: drain block for the two-column loop.  Performs the same merges,
 * multiply-adds and eight stores as .L82 on the operands that are already
 * in registers, but issues no further loads.
 */
1509.L83:
1510	fsmfp	yl1, a9
1511	fsmfp	yl2, a10
1512	fsmfp	yl3, a11
1513	fsmfp	yl4, a12
1514
	/* first-column contribution */
1515	fxcpmadd  ys1, alpha1, a1,  yl1
1516	fxcpmadd  ys2, alpha1, a5,  yl2
1517	fxcpmadd  ys3, alpha1, a3,  yl3
1518	fxcpmadd  ys4, alpha1, a7,  yl4
1519
	/* second-column contribution */
1520	fxcsmadd  ys1, alpha1, a2,  ys1
1521	fxcsmadd  ys2, alpha1, a6,  ys2
1522	fxcsmadd  ys3, alpha1, a4,  ys3
1523	fxcsmadd  ys4, alpha1, a8,  ys4
1524
1525	STFDUX	ys1, YS, INCY
1526	STFSDUX	ys1, YS, INCY
1527	STFDUX	ys2, YS, INCY
1528	STFSDUX	ys2, YS, INCY
1529	STFDUX	ys3, YS, INCY
1530	STFSDUX	ys3, YS, INCY
1531	STFDUX	ys4, YS, INCY
1532	STFSDUX	ys4, YS, INCY
1533	.align 4
1534
/*
 * .L85: remainder handling for the two-column pass.  Leaves for .L90 when
 * M & 7 is zero; otherwise, if bit 2 of M is set, processes four elements
 * here before falling through to .L87.
 */
1535.L85:
1536	andi.	r0, M, 7
1537	ble	.L90
1538
1539	andi.	r0, M, 4
1540	ble	.L87
1541
	/* each yl register is filled by an LFDUX followed by an LFSDUX, so no
	   fsmfp merge is needed here (unlike the unrolled loop) */
1542	LFDUX	yl1, YL, INCY
1543	LFPDUX	a1,  A1, INC2
1544	LFPDUX	a2,  A2, INC2
1545	LFSDUX	yl1, YL, INCY
1546	LFDUX	yl2, YL, INCY
1547	LFPDUX	a5,  A1, INC2
1548	LFPDUX	a6,  A2, INC2
1549	LFSDUX	yl2, YL, INCY
1550
	/* first-column terms (fxcpmadd), then second-column terms (fxcsmadd) */
1551	fxcpmadd  ys1, alpha1, a1, yl1
1552	fxcpmadd  ys2, alpha1, a5, yl2
1553	fxcsmadd  ys1, alpha1, a2, ys1
1554	fxcsmadd  ys2, alpha1, a6, ys2
1555
	/* four stores through YS */
1556	STFDUX	ys1, YS, INCY
1557	STFSDUX	ys1, YS, INCY
1558	STFDUX	ys2, YS, INCY
1559	STFSDUX	ys2, YS, INCY
1560	.align 4
1561
/*
 * .L87: two-column pass, two-element remainder.  Skipped when bit 1 of M
 * is clear.  One LFPDUX per column, one yl register filled by
 * LFDUX + LFSDUX, two multiply-adds, two stores.
 */
1562.L87:
1563	andi.	r0, M, 2
1564	ble	.L88
1565
1566	LFDUX	yl1, YL, INCY
1567	LFPDUX	a1,  A1, INC2
1568	LFPDUX	a2,  A2, INC2
1569	LFSDUX	yl1, YL, INCY
1570
1571	fxcpmadd  ys1, alpha1, a1, yl1
1572	fxcsmadd  ys1, alpha1, a2, ys1
1573
1574	STFDUX	ys1, YS, INCY
1575	STFSDUX	ys1, YS, INCY
1576	.align 4
1577
/*
 * .L88: two-column pass, final single element.  Skipped when bit 0 of M is
 * clear.  Uses scalar LFDUX loads from both columns and a single STFDUX
 * for the one result written through YS.
 */
1578.L88:
1579	andi.	r0, M, 1
1580	ble	.L90
1581
1582	LFDUX	yl1, YL, INCY
1583	LFDUX	a1,  A1, INC2
1584	LFDUX	a2,  A2, INC2
1585
1586	fxcpmadd  ys1, alpha1, a1, yl1
1587	fxcsmadd  ys1, alpha1, a2, ys1
1588
1589	STFDUX	ys1, YS, INCY
1590	.align 4
1591
/*
 * .L90: single-column pass, executed only when bit 0 of N is set
 * (otherwise control goes to the exit code at .L999).  Loads one X value,
 * scales it by alpha, and preloads operands for the first trip of the
 * 8-elements-per-trip loop at .L92.
 */
1592.L90:
1593	andi.	J, N, 1
1594	ble	.L999
1595
1596	LFDUX	alpha1, X, INCX
1597
	/* only A1 is used; A is not advanced in this pass */
1598	mr	A1, A
1599	mr	YL, Y
1600	mr	YS, Y
	/* scalar fmul here, where the two-column pass uses fpmul */
1601	fmul	alpha1, alpha, alpha1
1602
	/* CTR = M >> 3 trips; skip the unrolled loop when that is zero */
1603	srawi.	r0,  M, 3
1604	mtspr	CTR, r0
1605	ble	.L95
1606
	/*
	 * Eight values through YL: every other one goes to a yl register via
	 * LFDUX, the one after it to a2/a4/a6/a8 via LFSDUX.  The two halves
	 * are combined by the fmr instructions at .L92/.L93.
	 */
1607	LFDUX	yl1, YL, INCY
1608	LFSDUX	a2,  YL, INCY
1609	LFDUX	yl2, YL, INCY
1610	LFSDUX	a4,  YL, INCY
1611	LFDUX	yl3, YL, INCY
1612	LFSDUX	a6,  YL, INCY
1613	LFDUX	yl4, YL, INCY
1614	LFSDUX	a8,  YL, INCY
1615
	/* four LFPDUX loads from the column */
1616	LFPDUX	a1,  A1, INC2
1617	LFPDUX	a5,  A1, INC2
1618	LFPDUX	a9,  A1, INC2
1619	LFPDUX	a13, A1, INC2
	/* bdz decrements CTR: with a single trip, go straight to the drain block */
1620	bdz	.L93
1621	.align 4
1622
/*
 * .L92: steady-state body of the single-column loop.  Each trip finishes
 * eight results from operands loaded earlier and loads the operands for
 * the next trip.
 */
1623.L92:
	/* copy each yl value into the register that already holds its LFSDUX
	   partner (presumably fmr writes only the primary half, leaving the
	   secondary half loaded earlier intact -- TODO confirm) */
1624	fmr	a2, yl1
1625	fmr	a4, yl2
1626	fmr	a6, yl3
1627	fmr	a8, yl4
1628
	/* ysN = (merged y values) + alpha1 * (column data); the registers just
	   consumed are reloaded right after each multiply-add */
1629	fxcpmadd  ys1, alpha1, a1,  a2
1630	LFDUX	yl1, YL, INCY
1631	LFSDUX	a2,  YL, INCY
1632	fxcpmadd  ys2, alpha1, a5,  a4
1633	LFDUX	yl2, YL, INCY
1634	LFSDUX	a4,  YL, INCY
1635	fxcpmadd  ys3, alpha1, a9,  a6
1636	LFDUX	yl3, YL, INCY
1637	LFSDUX	a6,  YL, INCY
1638	fxcpmadd  ys4, alpha1, a13, a8
1639	LFDUX	yl4, YL, INCY
1640	LFSDUX	a8,  YL, INCY
1641
	/* next trip's column data */
1642	LFPDUX	a1,  A1, INC2
1643	LFPDUX	a5,  A1, INC2
1644	LFPDUX	a9,  A1, INC2
1645	LFPDUX	a13, A1, INC2
1646
	/* eight stores through YS */
1647	STFDUX	ys1, YS, INCY
1648	STFSDUX	ys1, YS, INCY
1649	STFDUX	ys2, YS, INCY
1650	STFSDUX	ys2, YS, INCY
1651	STFDUX	ys3, YS, INCY
1652	STFSDUX	ys3, YS, INCY
1653	STFDUX	ys4, YS, INCY
1654	STFSDUX	ys4, YS, INCY
	/* loop while CTR, after decrement, is non-zero */
1655	bdnz	.L92
1656	.align 4
1657
/*
 * .L93: drain block for the single-column loop.  Same merges,
 * multiply-adds and eight stores as .L92, with no further loads.
 */
1658.L93:
1659	fmr	a2, yl1
1660	fmr	a4, yl2
1661	fmr	a6, yl3
1662	fmr	a8, yl4
1663
1664	fxcpmadd  ys1, alpha1, a1,  a2
1665	fxcpmadd  ys2, alpha1, a5,  a4
1666	fxcpmadd  ys3, alpha1, a9,  a6
1667	fxcpmadd  ys4, alpha1, a13, a8
1668
1669	STFDUX	ys1, YS, INCY
1670	STFSDUX	ys1, YS, INCY
1671	STFDUX	ys2, YS, INCY
1672	STFSDUX	ys2, YS, INCY
1673	STFDUX	ys3, YS, INCY
1674	STFSDUX	ys3, YS, INCY
1675	STFDUX	ys4, YS, INCY
1676	STFSDUX	ys4, YS, INCY
1677	.align 4
1678
/*
 * .L95: remainder handling for the single-column pass.  Leaves for .L999
 * when M & 7 is zero; otherwise, if bit 2 of M is set, processes four
 * elements here before falling through to .L97.
 */
1679.L95:
1680	andi.	r0, M, 7
1681	ble	.L999
1682
1683	andi.	r0, M, 4
1684	ble	.L97
1685
	/* two LFPDUX loads from the column, four scalar loads through YL */
1686	LFPDUX	a1,  A1, INC2
1687	LFDUX	yl1, YL, INCY
1688	LFDUX	yl2, YL, INCY
1689	LFPDUX	a2,  A1, INC2
1690	LFDUX	yl3, YL, INCY
1691	LFDUX	yl4, YL, INCY
1692
	/*
	 * Operand order differs from the loop above: the column register comes
	 * first and alpha1 second, so fxcpmadd / fxcsmadd presumably select
	 * the primary / secondary half of the column pair as the scalar
	 * (TODO confirm).  Each result gets its own ys register.
	 */
1693	fxcpmadd  ys1, a1, alpha1, yl1
1694	fxcsmadd  ys2, a1, alpha1, yl2
1695	fxcpmadd  ys3, a2, alpha1, yl3
1696	fxcsmadd  ys4, a2, alpha1, yl4
1697
	/* four STFDUX stores, one per result register; no STFSDUX is used here */
1698	STFDUX	ys1, YS, INCY
1699	STFDUX	ys2, YS, INCY
1700	STFDUX	ys3, YS, INCY
1701	STFDUX	ys4, YS, INCY
1702	.align 4
1703
/*
 * .L97: single-column pass, two-element remainder.  Skipped when bit 1 of
 * M is clear.  One LFPDUX from the column, two scalar loads through YL,
 * two multiply-adds into separate registers, two STFDUX stores.
 */
1704.L97:
1705	andi.	r0, M, 2
1706	ble	.L98
1707
1708	LFPDUX	a1,  A1, INC2
1709	LFDUX	yl1, YL, INCY
1710	LFDUX	yl2, YL, INCY
1711
	/* column register first, alpha1 second -- same operand order as the
	   four-element remainder at .L95 */
1712	fxcpmadd  ys1, a1, alpha1, yl1
1713	fxcsmadd  ys2, a1, alpha1, yl2
1714
1715	STFDUX	ys1, YS, INCY
1716	STFDUX	ys2, YS, INCY
1717	.align 4
1718
/*
 * .L98: single-column pass, final single element.  Skipped when bit 0 of M
 * is clear.  One scalar load through YL and one from A1, one multiply-add,
 * one STFDUX store.
 */
1719.L98:
1720	andi.	r0, M, 1
1721	ble	.L999
1722
1723	LFDUX	yl1, YL, INCY
1724	LFDUX	a1,  A1, INC2
1725
1726	fxcpmadd  ys1, alpha1, a1, yl1
1727
1728	STFDUX	ys1, YS, INCY
	/* .L999 follows directly; the branch only skips any .align padding */
1729	b	.L999
1730	.align 4
1731
1732
/*
 * .L999: common exit.  Reloads r16..r31 and f14..f31 from the stack
 * (presumably saved by the prologue, which is outside this chunk) and
 * returns.  Net effect on SP over this block: -4 + 16*4 - 12 + 18*16 + 16
 * = +352 bytes.
 */
1733.L999:
	/* lwzu pre-increments by 4, so bias SP down by 4 to start at 0(SP) */
1734	addi	SP, SP, -4
1735
	/* sixteen words, r16 at the lowest address */
1736	lwzu	r16,   4(SP)
1737	lwzu	r17,   4(SP)
1738	lwzu	r18,   4(SP)
1739	lwzu	r19,   4(SP)
1740
1741	lwzu	r20,   4(SP)
1742	lwzu	r21,   4(SP)
1743	lwzu	r22,   4(SP)
1744	lwzu	r23,   4(SP)
1745
1746	lwzu	r24,   4(SP)
1747	lwzu	r25,   4(SP)
1748	lwzu	r26,   4(SP)
1749	lwzu	r27,   4(SP)
1750
1751	lwzu	r28,   4(SP)
1752	lwzu	r29,   4(SP)
1753	lwzu	r30,   4(SP)
1754	lwzu	r31,   4(SP)
1755
	/* SP is now 60 bytes above its value at .L999; step back to 48 so the
	   first lfpdux (pre-increment by r0 = 16) reads at offset 64, just
	   past the sixteen words restored above */
1756	subi	SP, SP, 12
1757	li	r0, 16
1758
	/* eighteen 16-byte slots, f31 at the lowest address down to f14 */
1759	lfpdux	f31, SP, r0
1760	lfpdux	f30, SP, r0
1761	lfpdux	f29, SP, r0
1762	lfpdux	f28, SP, r0
1763	lfpdux	f27, SP, r0
1764	lfpdux	f26, SP, r0
1765	lfpdux	f25, SP, r0
1766	lfpdux	f24, SP, r0
1767	lfpdux	f23, SP, r0
1768	lfpdux	f22, SP, r0
1769	lfpdux	f21, SP, r0
1770	lfpdux	f20, SP, r0
1771	lfpdux	f19, SP, r0
1772	lfpdux	f18, SP, r0
1773	lfpdux	f17, SP, r0
1774	lfpdux	f16, SP, r0
1775	lfpdux	f15, SP, r0
1776	lfpdux	f14, SP, r0
	/* step past the last 16-byte slot, then return to the caller */
1777	addi	SP, SP, 16
1778	blr
1779
	/* EPILOGUE is a macro defined outside this chunk */
1780	EPILOGUE
1781