/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
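
/* Single-precision complex GEMM kernel (C += alpha * A * B) using
   AltiVec/VMX.  M is processed in blocks of 8, 4, 2 and 1 complex
   elements, N in blocks of 2 and 1 columns; the M = 1 remainder is
   handled with the scalar FPU.                                      */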

#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALIGN_SIZE	0xffff
#define SWAP		  0
#define NEG		 16
#define ALPHA_R		 32
#define ALPHA_I		 48
#define FZERO		 64
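
/* SWAP .. FZERO above are byte offsets into the aligned scratch area
   set up below SP: a permute mask that swaps real/imag words, a sign
   mask, the splatted alpha values and a zero word.                   */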

#define	M	r3
#define	N	r4
#define	K	r5

#ifdef linux
#ifndef __64BIT__
#define A	r6
#define	B	r7
#define	C	r8
#define	LDC	r9
#else
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A	r10
#define	B	r6
#define	C	r7
#define	LDC	r8
#else
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r6
#endif
#endif

#define STACK	r11

#define	I	r21
#define J	r22
#define AO	r23
#define	BO	r24
#define	CO1	r25
#define CO2	r26

#define PREA	r29
#define PREB	r29
#define PREC	r30
#define VREG	r31

#define LOAD_A	lvx
#define LOAD_B	lvx

#define OFFSET_0	  0
#define OFFSET_1	r14
#define OFFSET_2	r15
#define OFFSET_3	r16
#define OFFSET_4	r17
#define OFFSET_5	r18
#define OFFSET_6	r19
#define OFFSET_7	r20
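
/* OFFSET_1 .. OFFSET_7 hold byte offsets (multiples of 4 * SIZE) used
   as index registers for lvx/stvx addressing.                         */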

#define	c01	v0
#define	c02	v1
#define	c03	v2
#define	c04	v3
#define	c05	v4
#define	c06	v5
#define	c07	v6
#define	c08	v7
#define	c09	v8
#define	c10	v9
#define	c11	v10
#define	c12	v11
#define	c13	v12
#define	c14	v13
#define	c15	v14
#define	c16	v15

#define	a1	v16
#define	a2	v17
#define	a3	v18
#define	a4	v19
#define	a5	v20
#define	a6	v21
#define	a7	v22
#define	a8	v23

#define	b1	v24
#define	b2	v25
#define	bp1	v26
#define	bp2	v27

#define C1	v16
#define C2	v17
#define C3	v18
#define C4	v19
#define C5	v20

#define c00	v24

#define VZERO		 v25
#define PERMRSHIFT1	 v26
#define PERMRSHIFT2	 v27

#define swap		 v28
#define neg		 v29
#define alpha_r		 v30
#define alpha_i		 v31

#ifndef NEEDPARAM

#ifndef DOUBLE
#include "../cparam.h"
#else
#include "../zparam.h"
#endif

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	mr	STACK, SP

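/* Save the non-volatile vector registers v20-v31 (at SP + 0 .. 11 * 16)
   and the non-volatile GPRs r14-r31 (at SP + 192 and up).               */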
	li	r0,  0 * 16
	stvx	v20, SP, r0
	li	r0,  1 * 16
	stvx	v21, SP, r0
	li	r0,  2 * 16
	stvx	v22, SP, r0
	li	r0,  3 * 16
	stvx	v23, SP, r0
	li	r0,  4 * 16
	stvx	v24, SP, r0
	li	r0,  5 * 16
	stvx	v25, SP, r0
	li	r0,  6 * 16
	stvx	v26, SP, r0
	li	r0,  7 * 16
	stvx	v27, SP, r0
	li	r0,  8 * 16
	stvx	v28, SP, r0
	li	r0,  9 * 16
	stvx	v29, SP, r0
	li	r0, 10 * 16
	stvx	v30, SP, r0
	li	r0, 11 * 16
	stvx	v31, SP, r0

#ifdef __64BIT__
	std	r31,  192(SP)
	std	r30,  200(SP)
	std	r29,  208(SP)
	std	r28,  216(SP)
	std	r27,  224(SP)
	std	r26,  232(SP)
	std	r25,  240(SP)
	std	r24,  248(SP)
	std	r23,  256(SP)
	std	r22,  264(SP)
	std	r21,  272(SP)
	std	r20,  280(SP)
	std	r19,  288(SP)
	std	r18,  296(SP)
	std	r17,  304(SP)
	std	r16,  312(SP)
	std	r15,  320(SP)
	std	r14,  328(SP)
#else
	stw	r31,  192(SP)
	stw	r30,  196(SP)
	stw	r29,  200(SP)
	stw	r28,  204(SP)
	stw	r27,  208(SP)
	stw	r26,  212(SP)
	stw	r25,  216(SP)
	stw	r24,  220(SP)
	stw	r23,  224(SP)
	stw	r22,  228(SP)
	stw	r21,  232(SP)
	stw	r20,  236(SP)
	stw	r19,  240(SP)
	stw	r18,  244(SP)
	stw	r17,  248(SP)
	stw	r16,  252(SP)
	stw	r15,  256(SP)
	stw	r14,  260(SP)
#endif


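/* The remaining arguments did not fit in registers; load them from the
   caller's stack frame (which ones depends on the ABI and on DOUBLE).  */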
#ifdef linux
#ifdef __64BIT__
	ld	LDC,    112 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	LDC,    112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	B,       56 + STACKSIZE(SP)
	lwz	C,       60 + STACKSIZE(SP)
	lwz	LDC,     64 + STACKSIZE(SP)
#else
	lwz	LDC,     56 + STACKSIZE(SP)
#endif
#endif
#endif

#ifndef PREFETCHTEST
#ifdef PPC970
	li	PREC,   16 * SIZE
#endif
#else

#ifdef linux
#ifndef __64BIT__
	lwz	PREB,   16 + STACKSIZE(SP)
	lwz	PREC,   20 + STACKSIZE(SP)
#else
	ld	PREB,  136 + STACKSIZE(SP)
	ld	PREC,  144 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	PREB,  136 + STACKSIZE(SP)
	ld	PREC,  144 + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	PREB,   72 + STACKSIZE(SP)
	lwz	PREC,   76 + STACKSIZE(SP)
#else
	lwz	PREB,   68 + STACKSIZE(SP)
	lwz	PREC,   72 + STACKSIZE(SP)
#endif
#endif
#endif

#endif

#ifndef PREFETCHTEST
#ifdef CELL
	li	PREB,   (3 * 32 * SIZE)
#else
	li	PREB,   (5 * 32 * SIZE)
#endif
#endif

	li	r0, -1
	mfspr	VREG, VRsave

	mtspr	VRsave, r0

	addi	SP, SP, -128
	li	r0, -8192

	and	SP, SP, r0

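/* Splat alpha_r / alpha_i into the scratch area; the sign pattern stored
   depends on the conjugation variant.                                    */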
	fneg	f3, f1
	fneg	f4, f2

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(NC) || defined(TC) || defined(NR) || defined(TR)
	stfs	f1,  ALPHA_R +  0(SP)
	stfs	f1,  ALPHA_R +  4(SP)
	stfs	f1,  ALPHA_R +  8(SP)
	stfs	f1,  ALPHA_R + 12(SP)

	stfs	f4,  ALPHA_I +  0(SP)
	stfs	f2,  ALPHA_I +  4(SP)
	stfs	f4,  ALPHA_I +  8(SP)
	stfs	f2,  ALPHA_I + 12(SP)
#else
	stfs	f1,  ALPHA_R +  0(SP)
	stfs	f3,  ALPHA_R +  4(SP)
	stfs	f1,  ALPHA_R +  8(SP)
	stfs	f3,  ALPHA_R + 12(SP)

	stfs	f2,  ALPHA_I +  0(SP)
	stfs	f2,  ALPHA_I +  4(SP)
	stfs	f2,  ALPHA_I +  8(SP)
	stfs	f2,  ALPHA_I + 12(SP)
#endif

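/* Byte-permute mask that swaps the real and imaginary word inside each
   complex pair.                                                         */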
	li	I,    Address_L(0x04050607)
	addis	I, I, Address_H(0x04050607)
	stw	I, SWAP +  0(SP)
	li	I,    Address_L(0x00010203)
	addis	I, I, Address_H(0x00010203)
	stw	I, SWAP +  4(SP)
	li	I,    Address_L(0x0c0d0e0f)
	addis	I, I, Address_H(0x0c0d0e0f)
	stw	I, SWAP +  8(SP)
	li	I,    Address_L(0x08090a0b)
	addis	I, I, Address_H(0x08090a0b)
	stw	I, SWAP + 12(SP)

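/* Sign mask: 0x80000000 in either the even or the odd words, selected by
   the conjugation variant; XORing a vector with it negates those lanes.  */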
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
	lis	I, 0x8000
	stw	I, NEG +  0(SP)
	stw	I, NEG +  8(SP)
	li	I, 0
	stw	I, NEG +  4(SP)
	stw	I, NEG + 12(SP)
#else
	li	I, 0
	stw	I, NEG +  0(SP)
	stw	I, NEG +  8(SP)
	lis	I, 0x8000
	stw	I, NEG +  4(SP)
	stw	I, NEG + 12(SP)
#endif

	li	r0, 0
	stw	r0, FZERO(SP)

	slwi	LDC, LDC, ZBASE_SHIFT

	li	OFFSET_1,  4 * SIZE
	li	OFFSET_2,  8 * SIZE
	li	OFFSET_3, 12 * SIZE
	li	OFFSET_4, 16 * SIZE
	li	OFFSET_5, 20 * SIZE
	li	OFFSET_6, 24 * SIZE
	li	OFFSET_7, 28 * SIZE

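/* Quick return for empty problems, then loop over N two columns at a time. */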
	cmpwi	cr0, M, 0
	ble	LL(999)
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, K, 0
	ble	LL(999)

	srawi.	J, N,  1
	ble	LL(50)
	.align 4

LL(01):
	mr	CO1, C
	add	CO2, C,  LDC
	add	C,   CO2, LDC

	mr	AO, A
	srawi.	I, M,  3
	ble	LL(20)
	.align 4

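/* 8 x 2 block: a1..a4 hold eight complex elements of A (two per vector
   register); c01..c16 accumulate their products with the four splatted
   words of each B vector.                                               */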
LL(11):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	vxor	c03, c03, c03
	LOAD_A	a2, OFFSET_1, AO
	vxor	c04, c04, c04
	LOAD_A	a3, OFFSET_2, AO

	vxor	c05, c05, c05
	vxor	c06, c06, c06
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	vxor	c09, c09, c09
	dcbtst	CO1, PREC
	vxor	c10, c10, c10
	dcbtst	CO2, PREC
	vxor	c11, c11, c11
	vxor	c12, c12, c12
	vxor	c13, c13, c13
	mr	BO, B
	vxor	c14, c14, c14
	srawi.	r0,  K,  2
	vxor	c15, c15, c15
	mtspr	CTR, r0
	vxor	c16, c16, c16
	vspltw	bp1, b1, 0
	ble	LL(13)
	.align 4

#define NOP1   mr	r3, r3
#define NOP2   mr	r4, r4

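/* Main loop, unrolled four times over K.  NOP1/NOP2 are register moves
   used only as instruction-scheduling fillers; the multiply-adds are
   interleaved with the next loads and with dcbt prefetches.            */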
LL(12):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	dcbt	AO, PREA
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	dcbt	BO, PREB

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  8 * SIZE
	vmaddfp	c12, a4, bp1, c12
	NOP1

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	vspltw	bp2, b2, 1

	vmaddfp	c01, a5, bp1, c01
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	NOP1
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	addi	AO, AO, 32 * SIZE
	vmaddfp	c07, a7, bp2, c07
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c08, a8, bp2, c08
	NOP1

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	NOP2
	vmaddfp	c11, a7, bp1, c11
	NOP1
	vmaddfp	c12, a8, bp1, c12
	dcbt	AO, PREA

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c16, a8, bp2, c16
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	LOAD_B	b2, OFFSET_1, BO

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	NOP2
	vmaddfp	c11, a3, bp1, c11
	NOP1
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	vspltw	bp2, b2, 1

	vmaddfp	c01, a5, bp1, c01
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	addi	AO, AO, 32 * SIZE
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	NOP2
	vmaddfp	c07, a7, bp2, c07
	NOP1
	vmaddfp	c08, a8, bp2, c08
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c11, a7, bp1, c11
	NOP2
	vmaddfp	c12, a8, bp1, c12
	vspltw	bp1, b1, 0

	vmaddfp	c13, a5, bp2, c13
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	NOP1
	vmaddfp	c16, a8, bp2, c16
	bdnz+	LL(12)
	.align 4

LL(13):
	andi.	r0,  K,  2
	nop
	nop
	ble+	LL(15)
	.align 4

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	LOAD_B	b2, OFFSET_1, BO

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c11, a3, bp1, c11
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c16, a4, bp2, c16
	addi	AO, AO, 32 * SIZE

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a6, bp1, c02
	NOP2
	vmaddfp	c03, a7, bp1, c03
	NOP1
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	NOP2
	vmaddfp	c07, a7, bp2, c07
	NOP1
	vmaddfp	c08, a8, bp2, c08
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c11, a7, bp1, c11
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c12, a8, bp1, c12
	NOP2

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	vmaddfp	c16, a8, bp2, c16
	.align 4


LL(15):
	andi.	r0,  K,  1
	vxor	VZERO, VZERO, VZERO
	ble+	LL(18)
	.align 4

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO, 16 * SIZE
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  4 * SIZE
	vmaddfp	c12, a4, bp1, c12
	nop

	vmaddfp	c13, a1, bp2, c13
	vmaddfp	c14, a2, bp2, c14
	vmaddfp	c15, a3, bp2, c15
	vmaddfp	c16, a4, bp2, c16
	.align 4

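/* Wind-down for the 8 x 2 block: swap and sign-flip c05..c08 and c13..c16,
   add them to c01..c04 and c09..c12, scale by alpha, then merge into C.
   lvsr/vperm realign the stores so C need not be 16-byte aligned.         */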
LL(18):
	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vxor	VZERO, VZERO, VZERO

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c07, c07, c07, swap
	vperm	c08, c08, c08, swap

	vperm	c13, c13, c13, swap
	vperm	c14, c14, c14, swap
	vperm	c15, c15, c15, swap
	vperm	c16, c16, c16, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c07, c07, neg
	vxor	c08, c08, neg

	vxor	c13, c13, neg
	vxor	c14, c14, neg
	vxor	c15, c15, neg
	vxor	c16, c16, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c03, c03, c07
	vaddfp	c04, c04, c08

	vaddfp	c09, c09, c13
	vaddfp	c10, c10, c14
	vaddfp	c11, c11, c15
	vaddfp	c12, c12, c16

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c07, c03, c03, swap
	vperm	c08, c04, c04, swap

	vperm	c13, c09, c09, swap
	vperm	c14, c10, c10, swap
	vperm	c15, c11, c11, swap
	vperm	c16, c12, c12, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c03, alpha_r, c03, VZERO
	vmaddfp	c04, alpha_r, c04, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02
	vmaddfp	c03, alpha_i, c07, c03
	vmaddfp	c04, alpha_i, c08, c04

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c10, alpha_r, c10, VZERO
	vmaddfp	c11, alpha_r, c11, VZERO
	vmaddfp	c12, alpha_r, c12, VZERO

	vmaddfp	c09, alpha_i, c13, c09
	vmaddfp	c10, alpha_i, c14, c10
	vmaddfp	c11, alpha_i, c15, c11
	vmaddfp	c12, alpha_i, c16, c12

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3
	vaddfp	c03, c03, C4
	vaddfp	c04, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   c10,   PERMRSHIFT2
	vperm	c10, c10,   c11,   PERMRSHIFT2
	vperm	c11, c11,   c12,   PERMRSHIFT2
	vperm	c12, c12,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2
	vaddfp	c10, c10, C3
	vaddfp	c11, c11, C4
	vaddfp	c12, c12, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2
	stvx	c10, OFFSET_2, CO2
	stvx	c11, OFFSET_3, CO2
	stvx	c12, OFFSET_4, CO2

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

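/* M & 4: same scheme for a 4 x 2 block.                                  */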
LL(20):
	andi.	I, M,  4
	ble	LL(30)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B
	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO,  8 * SIZE

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10

	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02

	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06

	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10

	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4

LL(25):
	andi.	r0,  K,  1
	ble+	LL(28)
	.align 4

LL(26):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO,  4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4

LL(28):
	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c13, c13, c13, swap
	vperm	c14, c14, c14, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c13, c13, neg
	vxor	c14, c14, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c09, c09, c13
	vaddfp	c10, c10, c14

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c13, c09, c09, swap
	vperm	c14, c10, c10, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c10, alpha_r, c10, VZERO
	vmaddfp	c09, alpha_i, c13, c09
	vmaddfp	c10, alpha_i, c14, c10

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   c10,   PERMRSHIFT2
	vperm	c10, c10,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2
	vaddfp	c10, c10, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2
	stvx	c10, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4

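/* M & 2: 2 x 2 block.                                                    */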
LL(30):
	andi.	I, M,  2
	ble	LL(40)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14

	vspltw	bp1, b1, 0
	mr	BO, B

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO,  8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4

LL(35):
	andi.	r0,  K,  1
	ble+	LL(38)
	.align 4

LL(36):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(38):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c13, c13, c13, swap

	vxor	c05, c05, neg
	vxor	c13, c13, neg

	vaddfp	c01, c01, c05
	vaddfp	c09, c09, c13

	vperm	c05, c01, c01, swap
	vperm	c13, c09, c09, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c01, alpha_i, c05, c01

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c09, alpha_i, c13, c09

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2

	addi	CO1, CO1,  4 * SIZE
	addi	CO2, CO2,  4 * SIZE
	.align 4

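/* M & 1: last complex row of the two columns, done with the scalar FPU.  */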
LL(40):
	andi.	I, M,  1
	ble	LL(49)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(BO)
	LFD	f11,  1 * SIZE(BO)
	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4

LL(42):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(42)
	.align 4

LL(45):
	andi.	r0,  K,  1
	ble	LL(48)
	.align 4

LL(46):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(48):
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	fsub	f0, f0, f3
	fadd	f1, f1, f2
	fsub	f4, f4, f7
	fadd	f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	fadd	f0, f0, f3
	fsub	f1, f1, f2
	fadd	f4, f4, f7
	fsub	f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fadd	f0, f0, f3
	fsub	f1, f2, f1
	fadd	f4, f4, f7
	fsub	f5, f6, f5
#else /* RR, RC, CR, CC */
	fsub	f0, f0, f3
	fadd	f1, f1, f2
	fsub	f4, f4, f7
	fadd	f5, f5, f6
#endif

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	lfs	f12,  ALPHA_R + 0(SP)
	lfs	f13,  ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
	fmadd	f8,  f12, f0, f8
	fnmsub	f9,  f12, f1, f9
	fmadd	f10, f12, f4, f10
	fnmsub	f11, f12, f5, f11

	fmadd	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
	fmadd	f10, f13, f5, f10
	fmadd	f11, f13, f4, f11
#else
	fmadd	f8,  f12, f0, f8
	fmadd	f9,  f12, f1, f9
	fmadd	f10, f12, f4, f10
	fmadd	f11, f12, f5, f11

	fnmsub	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
	fnmsub	f10, f13, f5, f10
	fmadd	f11, f13, f4, f11
#endif

	STFD	f8,  0 * SIZE(CO1)
	STFD	f9,  1 * SIZE(CO1)
	STFD	f10, 0 * SIZE(CO2)
	STFD	f11, 1 * SIZE(CO2)

LL(49):
	mr	B, BO

	addic.	J, J, -1
	bgt	LL(01)
	.align 4

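/* N & 1: last single column of C, with the same blocking over M.         */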
LL(50):
	andi.	J, N,  1
	ble	LL(999)

	mr	CO1, C
	mr	AO, A

	srawi.	I, M,  3
	ble	LL(70)
	.align 4

LL(61):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B
	dcbtst	CO1, PREC
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(65)
	.align 4

LL(62):
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(62)
	.align 4

LL(65):
	andi.	r0,  K,  1
	ble+	LL(68)
	.align 4

LL(66):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4

LL(68):
	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c07, c07, c07, swap
	vperm	c08, c08, c08, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c07, c07, neg
	vxor	c08, c08, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c03, c03, c07
	vaddfp	c04, c04, c08

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c07, c03, c03, swap
	vperm	c08, c04, c04, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c03, alpha_r, c03, VZERO
	vmaddfp	c04, alpha_r, c04, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02
	vmaddfp	c03, alpha_i, c07, c03
	vmaddfp	c04, alpha_i, c08, c04

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3
	vaddfp	c03, c03, C4
	vaddfp	c04, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(61)
	.align 4

LL(70):
	andi.	I, M,  4
	ble	LL(80)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

LL(72):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

LL(75):
	andi.	r0,  K,  1
	ble+	LL(78)
	.align 4

LL(76):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4

LL(78):
	vaddfp	c01, c01, c03
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	addi	CO1, CO1,  8 * SIZE
	.align 4

LL(80):
	andi.	I, M,  2
	ble	LL(90)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06

	mr	BO, B

	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

LL(82):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b1, 3

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c06, a2, bp2, c06

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(82)
	.align 4

LL(85):
	andi.	r0,  K,  1
	ble+	LL(88)
	.align 4

LL(86):
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

LL(88):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap

	vxor	c05, c05, neg

	vaddfp	c01, c01, c05

	vperm	c05, c01, c01, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c01, alpha_i, c05, c01

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	addi	CO1, CO1,  4 * SIZE
	.align 4

LL(90):
	andi.	I, M,  1
	ble	LL(999)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(BO)
	LFD	f11,  1 * SIZE(BO)
	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

LL(92):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)

	fmadd	f0,  f8, f12, f0
	fmadd	f2,  f8, f13, f2
	fmadd	f1,  f9, f12, f1
	fmadd	f3,  f9, f13, f3

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(92)
	.align 4

LL(95):
	andi.	r0,  K,  1
	ble	LL(98)
	.align 4

LL(96):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	.align 4

LL(98):
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	fsub	f0, f0, f3
	fadd	f1, f1, f2
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	fadd	f0, f0, f3
	fsub	f1, f1, f2
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fadd	f0, f0, f3
	fsub	f1, f2, f1
#else /* RR, RC, CR, CC */
	fsub	f0, f0, f3
	fadd	f1, f1, f2
#endif

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)

	lfs	f12,  ALPHA_R + 0(SP)
	lfs	f13,  ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
	fmadd	f8,  f12, f0, f8
	fnmsub	f9,  f12, f1, f9

	fmadd	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
#else
	fmadd	f8,  f12, f0, f8
	fmadd	f9,  f12, f1, f9

	fnmsub	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
#endif

	STFD	f8,  0 * SIZE(CO1)
	STFD	f9,  1 * SIZE(CO1)
	.align 4

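/* Epilogue: restore the saved vector and general purpose registers,
   restore VRsave, and return.                                        */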
LL(999):
	mr	SP, STACK

	li	r0,  0 * 16
	lvx	v20, SP, r0
	li	r0,  1 * 16
	lvx	v21, SP, r0
	li	r0,  2 * 16
	lvx	v22, SP, r0
	li	r0,  3 * 16
	lvx	v23, SP, r0
	li	r0,  4 * 16
	lvx	v24, SP, r0
	li	r0,  5 * 16
	lvx	v25, SP, r0
	li	r0,  6 * 16
	lvx	v26, SP, r0
	li	r0,  7 * 16
	lvx	v27, SP, r0
	li	r0,  8 * 16
	lvx	v28, SP, r0
	li	r0,  9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG

#ifdef __64BIT__
	ld	r31,  192(SP)
	ld	r30,  200(SP)
	ld	r29,  208(SP)
	ld	r28,  216(SP)
	ld	r27,  224(SP)
	ld	r26,  232(SP)
	ld	r25,  240(SP)
	ld	r24,  248(SP)
	ld	r23,  256(SP)
	ld	r22,  264(SP)
	ld	r21,  272(SP)
	ld	r20,  280(SP)
	ld	r19,  288(SP)
	ld	r18,  296(SP)
	ld	r17,  304(SP)
	ld	r16,  312(SP)
	ld	r15,  320(SP)
	ld	r14,  328(SP)
#else
	lwz	r31,  192(SP)
	lwz	r30,  196(SP)
	lwz	r29,  200(SP)
	lwz	r28,  204(SP)
	lwz	r27,  208(SP)
	lwz	r26,  212(SP)
	lwz	r25,  216(SP)
	lwz	r24,  220(SP)
	lwz	r23,  224(SP)
	lwz	r22,  228(SP)
	lwz	r21,  232(SP)
	lwz	r20,  236(SP)
	lwz	r19,  240(SP)
	lwz	r18,  244(SP)
	lwz	r17,  248(SP)
	lwz	r16,  252(SP)
	lwz	r15,  256(SP)
	lwz	r14,  260(SP)
#endif

	addi	SP, SP, STACKSIZE

	blr

	EPILOGUE
#endif