/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

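/* This appears to be the single-precision complex GEMM micro-kernel for
   AltiVec/VMX-capable PowerPC cores (PPC970, Cell): it accumulates
   C += alpha * op(A) * op(B), with the conjugation variant selected by the
   NN/NT/.../CC preprocessor symbols passed in by the build system.         */
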
#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

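/* Byte offsets into the aligned scratch area carved out below the stack
   pointer in the prologue: SWAP holds a vperm pattern that exchanges the
   real/imaginary words, NEG a sign mask for selected lanes, ALPHA_R/ALPHA_I
   the expanded alpha, and FZERO a single zero word used to clear FPRs.     */
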
#define ALIGN_SIZE	0xffff
#define SWAP		  0
#define NEG		 16
#define ALPHA_R		 32
#define ALPHA_I		 48
#define FZERO		 64

#define	M	r3
#define	N	r4
#define	K	r5

#ifdef linux
#ifndef __64BIT__
#define A	r6
#define	B	r7
#define	C	r8
#define	LDC	r9
#else
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A	r10
#define	B	r6
#define	C	r7
#define	LDC	r8
#else
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r6
#endif
#endif

#define STACK	r11

#define	I	r21
#define J	r22
#define AO	r23
#define	BO	r24
#define	CO1	r25
#define CO2	r26

#define PREA	r29
#define PREB	r29
#define PREC	r30
#define VREG	r31

#define LOAD_A	lvx
#define LOAD_B	lvx

#define OFFSET_0	  0
#define OFFSET_1	r14
#define OFFSET_2	r15
#define OFFSET_3	r16
#define OFFSET_4	r17
#define OFFSET_5	r18
#define OFFSET_6	r19
#define OFFSET_7	r20

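/* Vector register roles in the compute loops: c01-c16 are accumulators,
   a1-a8 hold the streamed A panel, b1/b2 the current B quadwords, and
   bp1/bp2 the B element splatted across all four lanes.  C1-C5, c00, VZERO
   and PERMRSHIFT1/2 reuse the A/B stream registers in the store-back code,
   once those are no longer needed.                                         */
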
#define	c01	v0
#define	c02	v1
#define	c03	v2
#define	c04	v3
#define	c05	v4
#define	c06	v5
#define	c07	v6
#define	c08	v7
#define	c09	v8
#define	c10	v9
#define	c11	v10
#define	c12	v11
#define	c13	v12
#define	c14	v13
#define	c15	v14
#define	c16	v15

#define	a1	v16
#define	a2	v17
#define	a3	v18
#define	a4	v19
#define	a5	v20
#define	a6	v21
#define	a7	v22
#define	a8	v23

#define	b1	v24
#define	b2	v25
#define	bp1	v26
#define	bp2	v27

#define C1	v16
#define C2	v17
#define C3	v18
#define C4	v19
#define C5	v20

#define c00	v24

#define VZERO		 v25
#define PERMRSHIFT1	 v26
#define PERMRSHIFT2	 v27

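/* Constants for the complex arithmetic in the store-back code: swap
   exchanges the real/imaginary lane of each complex pair, neg carries the
   sign mask, and alpha_r/alpha_i hold the expanded scaling factor.  They
   live in v28-v31 and are reloaded from the scratch area when needed.      */
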
#define swap		 v28
#define neg		 v29
#define alpha_r		 v30
#define alpha_i		 v31

#ifndef NEEDPARAM

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	mr	STACK, SP

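/* v20-v31 are non-volatile under the PowerPC/AltiVec ABIs, so they are
   spilled to the first twelve quadwords of the frame here and restored in
   LL(999).                                                                 */
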
	li	r0,  0 * 16
	stvx	v20, SP, r0
	li	r0,  1 * 16
	stvx	v21, SP, r0
	li	r0,  2 * 16
	stvx	v22, SP, r0
	li	r0,  3 * 16
	stvx	v23, SP, r0
	li	r0,  4 * 16
	stvx	v24, SP, r0
	li	r0,  5 * 16
	stvx	v25, SP, r0
	li	r0,  6 * 16
	stvx	v26, SP, r0
	li	r0,  7 * 16
	stvx	v27, SP, r0
	li	r0,  8 * 16
	stvx	v28, SP, r0
	li	r0,  9 * 16
	stvx	v29, SP, r0
	li	r0, 10 * 16
	stvx	v30, SP, r0
	li	r0, 11 * 16
	stvx	v31, SP, r0

#ifdef __64BIT__
	std	r31,  192(SP)
	std	r30,  200(SP)
	std	r29,  208(SP)
	std	r28,  216(SP)
	std	r27,  224(SP)
	std	r26,  232(SP)
	std	r25,  240(SP)
	std	r24,  248(SP)
	std	r23,  256(SP)
	std	r22,  264(SP)
	std	r21,  272(SP)
	std	r20,  280(SP)
	std	r19,  288(SP)
	std	r18,  296(SP)
	std	r17,  304(SP)
	std	r16,  312(SP)
	std	r15,  320(SP)
	std	r14,  328(SP)
#else
	stw	r31,  192(SP)
	stw	r30,  196(SP)
	stw	r29,  200(SP)
	stw	r28,  204(SP)
	stw	r27,  208(SP)
	stw	r26,  212(SP)
	stw	r25,  216(SP)
	stw	r24,  220(SP)
	stw	r23,  224(SP)
	stw	r22,  228(SP)
	stw	r21,  232(SP)
	stw	r20,  236(SP)
	stw	r19,  240(SP)
	stw	r18,  244(SP)
	stw	r17,  248(SP)
	stw	r16,  252(SP)
	stw	r15,  256(SP)
	stw	r14,  260(SP)
#endif


#ifdef linux
#ifdef __64BIT__
	ld	LDC,    112 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	LDC,    112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	B,       56 + STACKSIZE(SP)
	lwz	C,       60 + STACKSIZE(SP)
	lwz	LDC,     64 + STACKSIZE(SP)
#else
	lwz	LDC,     56 + STACKSIZE(SP)
#endif
#endif
#endif

#ifndef PREFETCHTEST
#ifdef PPC970
	li	PREC,   16 * SIZE
#endif
#else

#ifdef linux
#ifndef __64BIT__
	lwz	PREB,   16 + STACKSIZE(SP)
	lwz	PREC,   20 + STACKSIZE(SP)
#else
	ld	PREB,  136 + STACKSIZE(SP)
	ld	PREC,  144 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	PREB,  136 + STACKSIZE(SP)
	ld	PREC,  144 + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	PREB,   72 + STACKSIZE(SP)
	lwz	PREC,   76 + STACKSIZE(SP)
#else
	lwz	PREB,   68 + STACKSIZE(SP)
	lwz	PREC,   72 + STACKSIZE(SP)
#endif
#endif
#endif

#endif

#ifndef PREFETCHTEST
#ifdef CELL
	li	PREB,   (3 * 32 * SIZE)
#else
	li	PREB,   (5 * 32 * SIZE)
#endif
#endif

	li	r0, -1
	mfspr	VREG, VRsave

	mtspr	VRsave, r0

	addi	SP, SP, -128
	li	r0, -8192

	and	SP, SP, r0

	fneg	f3, f1
	fneg	f4, f2

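/* Expand alpha into the scratch area: ALPHA_R gets the real part in every
   lane, while ALPHA_I gets the imaginary part with alternating signs (the
   sign pattern depends on the conjugation case), so the vmaddfp pairs in
   the store-back code form the complex product alpha * c directly.         */
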
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(NC) || defined(TC) || defined(NR) || defined(TR)
	stfs	f1,  ALPHA_R +  0(SP)
	stfs	f1,  ALPHA_R +  4(SP)
	stfs	f1,  ALPHA_R +  8(SP)
	stfs	f1,  ALPHA_R + 12(SP)

	stfs	f4,  ALPHA_I +  0(SP)
	stfs	f2,  ALPHA_I +  4(SP)
	stfs	f4,  ALPHA_I +  8(SP)
	stfs	f2,  ALPHA_I + 12(SP)
#else
	stfs	f1,  ALPHA_R +  0(SP)
	stfs	f3,  ALPHA_R +  4(SP)
	stfs	f1,  ALPHA_R +  8(SP)
	stfs	f3,  ALPHA_R + 12(SP)

	stfs	f2,  ALPHA_I +  0(SP)
	stfs	f2,  ALPHA_I +  4(SP)
	stfs	f2,  ALPHA_I +  8(SP)
	stfs	f2,  ALPHA_I + 12(SP)
#endif

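/* Build the swap permute pattern (word order 1,0,3,2) and the 0x80000000
   sign mask; which lanes carry the sign bit depends on the conjugation
   case.  Both vectors are consumed by the vector store-back blocks below.  */
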
	li	I,    Address_L(0x04050607)
	addis	I, I, Address_H(0x04050607)
	stw	I, SWAP +  0(SP)
	li	I,    Address_L(0x00010203)
	addis	I, I, Address_H(0x00010203)
	stw	I, SWAP +  4(SP)
	li	I,    Address_L(0x0c0d0e0f)
	addis	I, I, Address_H(0x0c0d0e0f)
	stw	I, SWAP +  8(SP)
	li	I,    Address_L(0x08090a0b)
	addis	I, I, Address_H(0x08090a0b)
	stw	I, SWAP + 12(SP)

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
	lis	I, 0x8000
	stw	I, NEG +  0(SP)
	stw	I, NEG +  8(SP)
	li	I, 0
	stw	I, NEG +  4(SP)
	stw	I, NEG + 12(SP)
#else
	li	I, 0
	stw	I, NEG +  0(SP)
	stw	I, NEG +  8(SP)
	lis	I, 0x8000
	stw	I, NEG +  4(SP)
	stw	I, NEG + 12(SP)
#endif

	li	r0, 0
	stw	r0, FZERO(SP)

	slwi	LDC, LDC, ZBASE_SHIFT

	li	OFFSET_1,  4 * SIZE
	li	OFFSET_2,  8 * SIZE
	li	OFFSET_3, 12 * SIZE
	li	OFFSET_4, 16 * SIZE
	li	OFFSET_5, 20 * SIZE
	li	OFFSET_6, 24 * SIZE
	li	OFFSET_7, 28 * SIZE

	cmpwi	cr0, M, 0
	ble	LL(999)
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, K, 0
	ble	LL(999)

	srawi.	J, N,  1
	ble	LL(50)
	.align 4

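/* Outer loop over pairs of C columns (J = N >> 1); CO1/CO2 track the two
   columns currently being updated.                                         */
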
LL(01):
	mr	CO1, C
	add	CO2, C,  LDC
	add	C,   CO2, LDC

	mr	AO, A
	srawi.	I, M,  3
	ble	LL(20)
	.align 4

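/* 8x2 block: eight complex elements of each of the two C columns.  c01-c08
   accumulate the partial products for CO1 and c09-c16 those for CO2; bp1
   and bp2 step through the four scalars of each B quadword via vspltw.     */
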
LL(11):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_B	b2, OFFSET_1, B
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	LOAD_A	a5, OFFSET_4, AO
	vxor	c08, c08, c08

	vxor	c09, c09, c09
	dcbtst	CO1, PREC
	vxor	c10, c10, c10
	dcbtst	CO2, PREC
	vxor	c11, c11, c11
	vxor	c12, c12, c12
	vxor	c13, c13, c13
	mr	BO, B
	vxor	c14, c14, c14
	srawi.	r0,  K,  1
	vxor	c15, c15, c15
	mtspr	CTR, r0
	vxor	c16, c16, c16
	vspltw	bp1, b1, 0
	ble	LL(15)
	.align 4

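/* Main K loop, unrolled by two iterations: each pass consumes 32 * SIZE of
   A and 8 * SIZE of B and issues DCBT prefetches for the streams.          */
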
LL(12):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	DCBT(BO, PREB)
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	LOAD_A	a6, OFFSET_5, AO

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
#ifdef CELL
	DCBT(AO, PREA)
#else
	nop
#endif
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	LOAD_A	a7, OFFSET_6, AO

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b1, OFFSET_2, BO
	vmaddfp	c11, a3, bp1, c11
	nop
	vmaddfp	c12, a4, bp1, c12
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	addi	AO, AO, 32 * SIZE
	vmaddfp	c15, a3, bp2, c15
	nop
	vmaddfp	c16, a4, bp2, c16
	LOAD_A	a1, OFFSET_0, AO

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a6, bp1, c02
	nop
	vmaddfp	c03, a7, bp1, c03
	nop
	vmaddfp	c04, a8, bp1, c04
	LOAD_A	a2, OFFSET_1, AO

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	nop
	vmaddfp	c07, a7, bp2, c07
	nop
	vmaddfp	c08, a8, bp2, c08
	LOAD_A	a3, OFFSET_2, AO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_B	b2, OFFSET_3, BO
	vmaddfp	c11, a7, bp1, c11
	nop
	vmaddfp	c12, a8, bp1, c12
	LOAD_A	a4, OFFSET_3, AO

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	addi	BO, BO,  8 * SIZE
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c16, a8, bp2, c16
	bdnz+	LL(12)
	.align 4

LL(15):
	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	andi.	r0,  K,  1
	ble+	LL(18)
	.align 4

LL(16):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO, 16 * SIZE
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  4 * SIZE
	vmaddfp	c12, a4, bp1, c12
	nop

	vmaddfp	c13, a1, bp2, c13
	vmaddfp	c14, a2, bp2, c14
	vmaddfp	c15, a3, bp2, c15
	vmaddfp	c16, a4, bp2, c16
	.align 4

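/* Store-back for the 8x2 block: fold the imaginary-part partials into the
   real-part accumulators (swap, sign mask, add), scale by alpha, then merge
   into the possibly unaligned C columns with lvsr/vperm before storing.    */
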
LL(18):
	vxor	VZERO, VZERO, VZERO

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c07, c07, c07, swap
	vperm	c08, c08, c08, swap

	vperm	c13, c13, c13, swap
	vperm	c14, c14, c14, swap
	vperm	c15, c15, c15, swap
	vperm	c16, c16, c16, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c07, c07, neg
	vxor	c08, c08, neg

	vxor	c13, c13, neg
	vxor	c14, c14, neg
	vxor	c15, c15, neg
	vxor	c16, c16, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c03, c03, c07
	vaddfp	c04, c04, c08

	vaddfp	c09, c09, c13
	vaddfp	c10, c10, c14
	vaddfp	c11, c11, c15
	vaddfp	c12, c12, c16

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c07, c03, c03, swap
	vperm	c08, c04, c04, swap

	vperm	c13, c09, c09, swap
	vperm	c14, c10, c10, swap
	vperm	c15, c11, c11, swap
	vperm	c16, c12, c12, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c03, alpha_r, c03, VZERO
	vmaddfp	c04, alpha_r, c04, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02
	vmaddfp	c03, alpha_i, c07, c03
	vmaddfp	c04, alpha_i, c08, c04

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c10, alpha_r, c10, VZERO
	vmaddfp	c11, alpha_r, c11, VZERO
	vmaddfp	c12, alpha_r, c12, VZERO

	vmaddfp	c09, alpha_i, c13, c09
	vmaddfp	c10, alpha_i, c14, c10
	vmaddfp	c11, alpha_i, c15, c11
	vmaddfp	c12, alpha_i, c16, c12

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3
	vaddfp	c03, c03, C4
	vaddfp	c04, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   c10,   PERMRSHIFT2
	vperm	c10, c10,   c11,   PERMRSHIFT2
	vperm	c11, c11,   c12,   PERMRSHIFT2
	vperm	c12, c12,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2
	vaddfp	c10, c10, C3
	vaddfp	c11, c11, C4
	vaddfp	c12, c12, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2
	stvx	c10, OFFSET_2, CO2
	stvx	c11, OFFSET_3, CO2
	stvx	c12, OFFSET_4, CO2

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

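/* Remainder path for M & 4: four complex elements per column, same scheme
   as the 8x2 block with half the accumulators.                             */
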
LL(20):
	andi.	I, M,  4
	ble	LL(30)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B
	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO,  8 * SIZE

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10

	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02

	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06

	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10

	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4

LL(25):
	andi.	r0,  K,  1
	ble+	LL(28)
	.align 4

LL(26):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO,  4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4

LL(28):
	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c13, c13, c13, swap
	vperm	c14, c14, c14, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c13, c13, neg
	vxor	c14, c14, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c09, c09, c13
	vaddfp	c10, c10, c14

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c13, c09, c09, swap
	vperm	c14, c10, c10, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c10, alpha_r, c10, VZERO
	vmaddfp	c09, alpha_i, c13, c09
	vmaddfp	c10, alpha_i, c14, c10

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   c10,   PERMRSHIFT2
	vperm	c10, c10,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2
	vaddfp	c10, c10, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2
	stvx	c10, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4

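/* Remainder path for M & 2: two complex elements per column.               */
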
LL(30):
	andi.	I, M,  2
	ble	LL(40)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14

	vspltw	bp1, b1, 0
	mr	BO, B

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO,  8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4

LL(35):
	andi.	r0,  K,  1
	ble+	LL(38)
	.align 4

LL(36):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(38):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c13, c13, c13, swap

	vxor	c05, c05, neg
	vxor	c13, c13, neg

	vaddfp	c01, c01, c05
	vaddfp	c09, c09, c13

	vperm	c05, c01, c01, swap
	vperm	c13, c09, c09, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c01, alpha_i, c05, c01

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c09, alpha_i, c13, c09

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2

	addi	CO1, CO1,  4 * SIZE
	addi	CO2, CO2,  4 * SIZE
	.align 4

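/* Remainder path for M & 1: a single complex element per column, handled
   with scalar FPU code since it does not fill a vector register.           */
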
LL(40):
	andi.	I, M,  1
	ble	LL(49)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(BO)
	LFD	f11,  1 * SIZE(BO)
	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4

LL(42):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(42)
	.align 4

LL(45):
	andi.	r0,  K,  1
	ble	LL(48)
	.align 4

LL(46):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

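/* Scalar combine of the partial products; the add/subtract pattern selects
   the conjugation variant, mirroring what the vector paths do with the
   swap/neg constants.                                                      */
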
LL(48):
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	fsub	f0, f0, f3
	fadd	f1, f1, f2
	fsub	f4, f4, f7
	fadd	f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	fadd	f0, f0, f3
	fsub	f1, f1, f2
	fadd	f4, f4, f7
	fsub	f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fadd	f0, f0, f3
	fsub	f1, f2, f1
	fadd	f4, f4, f7
	fsub	f5, f6, f5
#else /* RR, RC, CR, CC */
	fsub	f0, f0, f3
	fadd	f1, f1, f2
	fsub	f4, f4, f7
	fadd	f5, f5, f6
#endif

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	lfs	f12,  ALPHA_R + 0(SP)
	lfs	f13,  ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
	fmadd	f8,  f12, f0, f8
	fnmsub	f9,  f12, f1, f9
	fmadd	f10, f12, f4, f10
	fnmsub	f11, f12, f5, f11

	fmadd	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
	fmadd	f10, f13, f5, f10
	fmadd	f11, f13, f4, f11
#else
	fmadd	f8,  f12, f0, f8
	fmadd	f9,  f12, f1, f9
	fmadd	f10, f12, f4, f10
	fmadd	f11, f12, f5, f11

	fnmsub	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
	fnmsub	f10, f13, f5, f10
	fmadd	f11, f13, f4, f11
#endif

	STFD	f8,  0 * SIZE(CO1)
	STFD	f9,  1 * SIZE(CO1)
	STFD	f10, 0 * SIZE(CO2)
	STFD	f11, 1 * SIZE(CO2)

LL(49):
	mr	B, BO

	addic.	J, J, -1
	bgt	LL(01)
	.align 4

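/* Tail loop for the last C column when N is odd: the same blocking over M
   as above, but with a single column of accumulators.                      */
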
LL(50):
	andi.	J, N,  1
	ble	LL(999)

	mr	CO1, C
	mr	AO, A

	srawi.	I, M,  3
	ble	LL(70)
	.align 4

LL(61):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B
	dcbtst	CO1, PREC
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(65)
	.align 4

LL(62):
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(62)
	.align 4

LL(65):
	andi.	r0,  K,  1
	ble+	LL(68)
	.align 4

LL(66):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4

LL(68):
	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c07, c07, c07, swap
	vperm	c08, c08, c08, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c07, c07, neg
	vxor	c08, c08, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c03, c03, c07
	vaddfp	c04, c04, c08

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c07, c03, c03, swap
	vperm	c08, c04, c04, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c03, alpha_r, c03, VZERO
	vmaddfp	c04, alpha_r, c04, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02
	vmaddfp	c03, alpha_i, c07, c03
	vmaddfp	c04, alpha_i, c08, c04

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3
	vaddfp	c03, c03, C4
	vaddfp	c04, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(61)
	.align 4

LL(70):
	andi.	I, M,  4
	ble	LL(80)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

LL(72):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

LL(75):
	andi.	r0,  K,  1
	ble+	LL(78)
	.align 4

LL(76):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4

LL(78):
	vaddfp	c01, c01, c03
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	addi	CO1, CO1,  8 * SIZE
	.align 4

LL(80):
	andi.	I, M,  2
	ble	LL(90)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06

	mr	BO, B

	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

LL(82):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b1, 3

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c06, a2, bp2, c06

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(82)
	.align 4

LL(85):
	andi.	r0,  K,  1
	ble+	LL(88)
	.align 4

LL(86):
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

LL(88):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap

	vxor	c05, c05, neg

	vaddfp	c01, c01, c05

	vperm	c05, c01, c01, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c01, alpha_i, c05, c01

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	addi	CO1, CO1,  4 * SIZE
	.align 4

LL(90):
	andi.	I, M,  1
	ble	LL(999)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(BO)
	LFD	f11,  1 * SIZE(BO)
	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

LL(92):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)

	fmadd	f0,  f8, f12, f0
	fmadd	f2,  f8, f13, f2
	fmadd	f1,  f9, f12, f1
	fmadd	f3,  f9, f13, f3

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(92)
	.align 4

LL(95):
	andi.	r0,  K,  1
	ble	LL(98)
	.align 4

LL(96):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	.align 4

LL(98):
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	fsub	f0, f0, f3
	fadd	f1, f1, f2
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	fadd	f0, f0, f3
	fsub	f1, f1, f2
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fadd	f0, f0, f3
	fsub	f1, f2, f1
#else /* RR, RC, CR, CC */
	fsub	f0, f0, f3
	fadd	f1, f1, f2
#endif

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)

	lfs	f12,  ALPHA_R + 0(SP)
	lfs	f13,  ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
	fmadd	f8,  f12, f0, f8
	fnmsub	f9,  f12, f1, f9

	fmadd	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
#else
	fmadd	f8,  f12, f0, f8
	fmadd	f9,  f12, f1, f9

	fnmsub	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
#endif

	STFD	f8,  0 * SIZE(CO1)
	STFD	f9,  1 * SIZE(CO1)
	.align 4

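/* Common exit: restore v20-v31, VRsave and the non-volatile GPRs, then
   release the stack frame.                                                 */
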
LL(999):
	mr	SP, STACK

	li	r0,  0 * 16
	lvx	v20, SP, r0
	li	r0,  1 * 16
	lvx	v21, SP, r0
	li	r0,  2 * 16
	lvx	v22, SP, r0
	li	r0,  3 * 16
	lvx	v23, SP, r0
	li	r0,  4 * 16
	lvx	v24, SP, r0
	li	r0,  5 * 16
	lvx	v25, SP, r0
	li	r0,  6 * 16
	lvx	v26, SP, r0
	li	r0,  7 * 16
	lvx	v27, SP, r0
	li	r0,  8 * 16
	lvx	v28, SP, r0
	li	r0,  9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG

#ifdef __64BIT__
	ld	r31,  192(SP)
	ld	r30,  200(SP)
	ld	r29,  208(SP)
	ld	r28,  216(SP)
	ld	r27,  224(SP)
	ld	r26,  232(SP)
	ld	r25,  240(SP)
	ld	r24,  248(SP)
	ld	r23,  256(SP)
	ld	r22,  264(SP)
	ld	r21,  272(SP)
	ld	r20,  280(SP)
	ld	r19,  288(SP)
	ld	r18,  296(SP)
	ld	r17,  304(SP)
	ld	r16,  312(SP)
	ld	r15,  320(SP)
	ld	r14,  328(SP)
#else
	lwz	r31,  192(SP)
	lwz	r30,  196(SP)
	lwz	r29,  200(SP)
	lwz	r28,  204(SP)
	lwz	r27,  208(SP)
	lwz	r26,  212(SP)
	lwz	r25,  216(SP)
	lwz	r24,  220(SP)
	lwz	r23,  224(SP)
	lwz	r22,  228(SP)
	lwz	r21,  232(SP)
	lwz	r20,  236(SP)
	lwz	r19,  240(SP)
	lwz	r18,  244(SP)
	lwz	r17,  248(SP)
	lwz	r16,  252(SP)
	lwz	r15,  256(SP)
	lwz	r14,  260(SP)
#endif

	addi	SP, SP, STACKSIZE

	blr

	EPILOGUE
#endif