1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* LOAD: pointer-sized integer load for the target ABI (word vs doubleword). */
#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

/* Stack frame size: 12 vector saves (12*16 = 192 bytes) plus the
   r14-r31 GPR save area (18 regs * 8 or * 4 bytes).  */
#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

/* Byte offsets (from the 128-byte-realigned SP) of local scratch:
   ALPHA = 16 bytes holding alpha splatted into 4 floats,
   FZERO = one zero word used to clear scalar FP registers.  */
#define ALPHA		  0
#define FZERO		 16

/* Problem-size arguments. */
#define	M	r3
#define	N	r4
#define	K	r5

/* A/B/C/LDC argument registers differ per OS ABI and (on 32-bit
   AIX/Darwin) on whether alpha is passed as a double.  */
#ifdef linux
#ifndef __64BIT__
#define A	r6
#define	B	r7
#define	C	r8
#define	LDC	r9
#else
#define A	r7
#define	B	r8
#define	C	r9
#define	LDC	r10
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r7
#else
#define A	r7
#define	B	r8
#define	C	r9
#define	LDC	r10
#endif
#endif

/* Copy of the decremented SP taken right after the frame is built. */
#define STACK	r11

/* Loop counters and the four C column pointers of the current panel. */
#define	I	r21
#define J	r22
#define AO	r23
#define	BO	r24
#define	CO1	r25
#define CO2	r26
#define	CO3	r27
#define	CO4	r28

/* NOTE(review): PREA and PREB alias the same register (r29), so the A
   and B streams necessarily share one prefetch distance.  */
#define PREA	r29
#define PREB	r29
#define PREC	r30
#define VREG	r31

#define LOAD_A	lvx
#define LOAD_B	lvx

/* Index registers for lvx/stvx: OFFSET_n = n * 4 floats = 16n bytes.
   OFFSET_0 is the literal 0 (rA=0 in lvx/stvx means "no base").  */
#define OFFSET_0	  0
#define OFFSET_1	r14
#define OFFSET_2	r15
#define OFFSET_3	r16
#define OFFSET_4	r17
#define OFFSET_5	r18
#define OFFSET_6	r19
#define OFFSET_7	r20

/* 16 accumulator vectors: c01-c04 = column 1 (16 floats of a 16x4
   C tile), c05-c08 = column 2, c09-c12 = column 3, c13-c16 = column 4. */
#define	c01	v0
#define	c02	v1
#define	c03	v2
#define	c04	v3
#define	c05	v4
#define	c06	v5
#define	c07	v6
#define	c08	v7
#define	c09	v8
#define	c10	v9
#define	c11	v10
#define	c12	v11
#define	c13	v12
#define	c14	v13
#define	c15	v14
#define	c16	v15

/* A-panel vectors (4 floats each); a5-a8 double-buffer a1-a4. */
#define	a1	v16
#define	a2	v17
#define	a3	v18
#define	a4	v19
#define	a5	v20
#define	a6	v21
#define	a7	v22
#define	a8	v23

/* B-panel vectors and the splatted B element currently in use. */
#define	b1	v24
#define	b2	v25
#define	bp1	v26
#define	bp2	v27

/* Aliases reused in the store phase (the a*/b* values are dead there):
   C1-C9 hold loaded C vectors.  */
#define C1	v16
#define C2	v17
#define C3	v18
#define C4	v19
#define C5	v20
#define C6	v21
#define C7	v22
#define C8	v23
#define C9	v24

#define c00	v25

/* lvsr-generated permute masks, one per C column pointer, used to
   shift the accumulators into C's (possibly unaligned) byte position. */
#define PERMRSHIFT1	 v26
#define PERMRSHIFT2	 v27
#define PERMRSHIFT3	 v28
#define PERMRSHIFT4	 v29

#define VZERO	v30
#define alpha	v31

#ifndef NEEDPARAM

#ifndef DOUBLE
#include "../sparam.h"
#else
#include "../dparam.h"
#endif
158
	PROLOGUE
	PROFCODE

	/* Build the frame and remember the un-realigned SP in STACK. */
	addi	SP, SP, -STACKSIZE
	mr	STACK, SP

	/* Save the nonvolatile vector registers v20-v31 at SP+0..191. */
	li	r0,  0 * 16
	stvx	v20, SP, r0
	li	r0,  1 * 16
	stvx	v21, SP, r0
	li	r0,  2 * 16
	stvx	v22, SP, r0
	li	r0,  3 * 16
	stvx	v23, SP, r0
	li	r0,  4 * 16
	stvx	v24, SP, r0
	li	r0,  5 * 16
	stvx	v25, SP, r0
	li	r0,  6 * 16
	stvx	v26, SP, r0
	li	r0,  7 * 16
	stvx	v27, SP, r0
	li	r0,  8 * 16
	stvx	v28, SP, r0
	li	r0,  9 * 16
	stvx	v29, SP, r0
	li	r0, 10 * 16
	stvx	v30, SP, r0
	li	r0, 11 * 16
	stvx	v31, SP, r0

	/* Save the nonvolatile GPRs r14-r31 above the vector save area. */
#ifdef __64BIT__
	std	r31,  192(SP)
	std	r30,  200(SP)
	std	r29,  208(SP)
	std	r28,  216(SP)
	std	r27,  224(SP)
	std	r26,  232(SP)
	std	r25,  240(SP)
	std	r24,  248(SP)
	std	r23,  256(SP)
	std	r22,  264(SP)
	std	r21,  272(SP)
	std	r20,  280(SP)
	std	r19,  288(SP)
	std	r18,  296(SP)
	std	r17,  304(SP)
	std	r16,  312(SP)
	std	r15,  320(SP)
	std	r14,  328(SP)
#else
	stw	r31,  192(SP)
	stw	r30,  196(SP)
	stw	r29,  200(SP)
	stw	r28,  204(SP)
	stw	r27,  208(SP)
	stw	r26,  212(SP)
	stw	r25,  216(SP)
	stw	r24,  220(SP)
	stw	r23,  224(SP)
	stw	r22,  228(SP)
	stw	r21,  232(SP)
	stw	r20,  236(SP)
	stw	r19,  240(SP)
	stw	r18,  244(SP)
	stw	r17,  248(SP)
	stw	r16,  252(SP)
	stw	r15,  256(SP)
	stw	r14,  260(SP)
#endif


	/* 32-bit AIX/Darwin with a double alpha: LDC did not fit in the
	   argument registers, fetch it from the caller's parameter area. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
	lwz	LDC,    56 + STACKSIZE(SP)
#endif
#endif

	li	r0, -1

	/* Preserve VRsave and mark every vector register as live. */
	mfspr	VREG, VRsave
	mtspr	VRsave, r0

	/* Realign SP downward to a 128-byte boundary with 128 bytes of
	   slack, so lvx/stvx on SP-relative scratch are naturally aligned. */
	addi	SP, SP, -128
	li	r0, -128
	and	SP, SP, r0

	li	OFFSET_1,  4 * SIZE
	li	OFFSET_2,  8 * SIZE
	li	OFFSET_3, 12 * SIZE
	li	OFFSET_4, 16 * SIZE
	li	OFFSET_5, 20 * SIZE
	li	OFFSET_6, 24 * SIZE
	li	OFFSET_7, 28 * SIZE

	/* Splat scalar alpha (f1) into a 4-float slot for later lvx. */
	stfs	f1,  ALPHA +  0(SP)
	stfs	f1,  ALPHA +  4(SP)
	stfs	f1,  ALPHA +  8(SP)
	stfs	f1,  ALPHA + 12(SP)

	/* One zero word, used by lfs to clear scalar FP accumulators. */
	li	r29, 0
	stw	r29, FZERO(SP)

	slwi	LDC, LDC, BASE_SHIFT	/* LDC: elements -> bytes */

	/* Prefetch distances.  NOTE(review): both CELL branches load the
	   same constant, so the #ifdef is currently a no-op.  */
	li	PREC,   (15 * SIZE)
#ifdef CELL
	li	PREB,   (5 * 32 * SIZE)
#else
	li	PREB,   (5 * 32 * SIZE)
#endif

	/* Degenerate sizes: nothing to do. */
	cmpwi	cr0, M, 0
	ble	LL(999)
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, K, 0
	ble	LL(999)

	/* J = N / 4: loop over panels of four C columns. */
	srawi.	J, N,  2
	ble	LL(60)
	.align 4
281
LL(01):
	/* Set up the four column pointers of this panel and advance C. */
	mr	CO1, C
	add	CO2, C,  LDC
	add	CO3, CO2, LDC
	add	CO4, CO3, LDC
	add	C,   CO4, LDC

	/* I = M / 16: loop over 16-row tiles (4 vectors per column). */
	mr	AO, A
	srawi.	I, M,  4
	ble	LL(20)
	.align 4

LL(11):
	/* Clear the 16 accumulators of the 16x4 tile while starting the
	   first A/B loads and touching the C destinations (dcbtst).  */
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	vxor	c03, c03, c03
	LOAD_A	a2, OFFSET_1, AO
	vxor	c04, c04, c04
	LOAD_A	a3, OFFSET_2, AO

	vxor	c05, c05, c05
	vxor	c06, c06, c06
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	vxor	c09, c09, c09
	dcbtst	CO1, PREC
	vxor	c10, c10, c10
	dcbtst	CO2, PREC
	vxor	c11, c11, c11
	dcbtst	CO3, PREC
	vxor	c12, c12, c12
	dcbtst	CO4, PREC
	vxor	c13, c13, c13
	mr	BO, B
	vxor	c14, c14, c14
	srawi.	r0,  K,  2		/* CTR = K / 4 (loop unrolled by 4) */
	vxor	c15, c15, c15
	mtspr	CTR, r0
	vxor	c16, c16, c16
	vspltw	bp1, b1, 0		/* splat B[0] for the first FMA group */
	ble	LL(13)
	.align 4

/* Scheduling filler ops (register-to-self moves) used to pad the
   hand-scheduled dual-issue pattern of the inner loop.  */
#define NOP1   mr	r3, r3
#define NOP2   mr	r4, r4
330
LL(12):
	/* Main inner loop: 4 k-iterations per pass over a 16x4 tile.
	   Each k-step is a rank-1 update: splat one B element (vspltw)
	   and FMA it against four A vectors (16 rows).  A is consumed
	   32 floats per half-pass, B 8 floats per half-pass; loads are
	   interleaved with the FMAs and NOP1/NOP2 pad the issue slots. */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	dcbt	AO, PREA
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	dcbt	BO, PREB

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  8 * SIZE
	vmaddfp	c12, a4, bp1, c12
	NOP1

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	vspltw	bp2, b2, 1

	/* k-step 2: uses the a5-a8 double buffer. */
	vmaddfp	c01, a5, bp1, c01
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	NOP1
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	addi	AO, AO, 32 * SIZE
	vmaddfp	c07, a7, bp2, c07
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c08, a8, bp2, c08
	NOP1

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	NOP2
	vmaddfp	c11, a7, bp1, c11
	NOP1
	vmaddfp	c12, a8, bp1, c12
	dcbt	AO, PREA

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a1, OFFSET_0, AO		/* refill a1-a4 from advanced AO */
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c16, a8, bp2, c16
	vspltw	bp2, b1, 1

	/* k-step 3. */
	vmaddfp	c01, a1, bp1, c01
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	LOAD_B	b2, OFFSET_1, BO

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	NOP2
	vmaddfp	c11, a3, bp1, c11
	NOP1
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	vspltw	bp2, b2, 1

	/* k-step 4. */
	vmaddfp	c01, a5, bp1, c01
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	addi	AO, AO, 32 * SIZE
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	NOP2
	vmaddfp	c07, a7, bp2, c07
	NOP1
	vmaddfp	c08, a8, bp2, c08
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_A	a1, OFFSET_0, AO	/* preload for next iteration / tail */
	vmaddfp	c11, a7, bp1, c11
	NOP2
	vmaddfp	c12, a8, bp1, c12
	vspltw	bp1, b1, 0

	vmaddfp	c13, a5, bp2, c13
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	NOP1
	vmaddfp	c16, a8, bp2, c16
	bdnz+	LL(12)
	.align 4
476
LL(13):
	/* K % 4 >= 2: run two more k-steps (same pattern as half of
	   LL(12)); a1-a3, b1 and bp1 are already live from the preload. */
	andi.	r0,  K,  2
	nop
	nop
	ble+	LL(15)
	.align 4

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	LOAD_B	b2, OFFSET_1, BO

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c11, a3, bp1, c11
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c16, a4, bp2, c16
	addi	AO, AO, 32 * SIZE

	/* second k-step of the pair */
	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a6, bp1, c02
	NOP2
	vmaddfp	c03, a7, bp1, c03
	NOP1
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	NOP2
	vmaddfp	c07, a7, bp2, c07
	NOP1
	vmaddfp	c08, a8, bp2, c08
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c11, a7, bp1, c11
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c12, a8, bp1, c12
	NOP2

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	vmaddfp	c16, a8, bp2, c16
	.align 4
554
LL(15):
	/* K % 2 == 1: one final k-step.  The alpha vector and VZERO are
	   prepared here either way, for the store phase that follows.  */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP	/* splatted alpha from the ALPHA slot */
	vxor	VZERO, VZERO, VZERO
	ble+	LL(18)
	.align 4

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO, 16 * SIZE	/* consume 16 A floats */
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  4 * SIZE	/* consume 4 B floats */
	vmaddfp	c12, a4, bp1, c12
	nop

	vmaddfp	c13, a1, bp2, c13
	vmaddfp	c14, a2, bp2, c14
	vmaddfp	c15, a3, bp2, c15
	vmaddfp	c16, a4, bp2, c16
	.align 4
594
LL(18):
	/* Store phase for the 16x4 tile: C = alpha*acc + C, with C at an
	   arbitrary alignment.  lvsr builds a right-shift permute mask per
	   column; the vperm chain shifts the 4 accumulators into C's byte
	   position, producing 5 vectors whose first/last lanes come from
	   VZERO so the out-of-range bytes are left as alpha*0 + C.
	   If LDC <= 32*SIZE, branch to the LL(19) variant (same math,
	   different load scheduling).  */
	lvx	C1, OFFSET_0, CO1
	cmpwi	cr0, LDC, 32 * SIZE
	lvx	C2, OFFSET_1, CO1
	lvsr	PERMRSHIFT1, 0, CO1
	lvx	C3, OFFSET_2, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvx	C4, OFFSET_3, CO1
	lvsr	PERMRSHIFT3, 0, CO3
	lvx	C5, OFFSET_4, CO1
	lvsr	PERMRSHIFT4, 0, CO4
	ble	LL(19)

	/* column 1 */
	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	lvx	C1, OFFSET_0, CO2	/* interleave column-2 loads */
	vmaddfp	c01, alpha, c01, C2
	lvx	C6, OFFSET_1, CO2
	vmaddfp	c02, alpha, c02, C3
	lvx	C7, OFFSET_2, CO2
	vmaddfp	c03, alpha, c03, C4
	lvx	C8, OFFSET_3, CO2
	vmaddfp	c04, alpha, c04, C5
	lvx	C9, OFFSET_4, CO2

	stvx	c00, OFFSET_0, CO1
	vperm	c00, VZERO, c05,   PERMRSHIFT2
	stvx	c01, OFFSET_1, CO1
	vperm	c05, c05,   c06,   PERMRSHIFT2
	stvx	c02, OFFSET_2, CO1
	vperm	c06, c06,   c07,   PERMRSHIFT2
	stvx	c03, OFFSET_3, CO1
	vperm	c07, c07,   c08,   PERMRSHIFT2
	stvx	c04, OFFSET_4, CO1
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	/* column 2 */
	vmaddfp	c00, alpha, c00, C1
	lvx	C1, OFFSET_0, CO3
	vmaddfp	c05, alpha, c05, C6
	lvx	C2, OFFSET_1, CO3
	vmaddfp	c06, alpha, c06, C7
	lvx	C3, OFFSET_2, CO3
	vmaddfp	c07, alpha, c07, C8
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c08, alpha, c08, C9
	lvx	C5, OFFSET_4, CO3

	stvx	c00, OFFSET_0, CO2
	vperm	c00, VZERO, c09,   PERMRSHIFT3
	stvx	c05, OFFSET_1, CO2
	vperm	c09, c09,   c10,   PERMRSHIFT3
	stvx	c06, OFFSET_2, CO2
	vperm	c10, c10,   c11,   PERMRSHIFT3
	stvx	c07, OFFSET_3, CO2
	vperm	c11, c11,   c12,   PERMRSHIFT3
	stvx	c08, OFFSET_4, CO2
	vperm	c12, c12,   VZERO, PERMRSHIFT3

	/* column 3 */
	vmaddfp	c00, alpha, c00, C1
	lvx	C9, OFFSET_4, CO4
	vmaddfp	c09, alpha, c09, C2
	lvx	C1, OFFSET_0, CO4
	vmaddfp	c10, alpha, c10, C3
	lvx	C6, OFFSET_1, CO4
	vmaddfp	c11, alpha, c11, C4
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c12, alpha, c12, C5
	lvx	C8, OFFSET_3, CO4

	stvx	c00, OFFSET_0, CO3
	vperm	c00, VZERO, c13,   PERMRSHIFT4
	stvx	c09, OFFSET_1, CO3
	vperm	c13, c13,   c14,   PERMRSHIFT4
	stvx	c10, OFFSET_2, CO3
	vperm	c14, c14,   c15,   PERMRSHIFT4
	stvx	c11, OFFSET_3, CO3
	vperm	c15, c15,   c16,   PERMRSHIFT4
	stvx	c12, OFFSET_4, CO3
	vperm	c16, c16,   VZERO, PERMRSHIFT4

	/* column 4 */
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE

	addic.	I, I, -1
	bgt+	LL(11)
	b	LL(20)
	.align 4
701
LL(19):
	/* Small-LDC store variant: identical arithmetic to the LL(18)
	   path (C = alpha*acc + C via the lvsr/vperm shift pipeline), but
	   the next column's C vectors are loaded in a different order.
	   C1-C5 for column 1 were already loaded at LL(18).  */
	lvx	C6, OFFSET_1, CO2
	lvx	C7, OFFSET_2, CO2
	lvx	C8, OFFSET_3, CO2
	lvx	C9, OFFSET_4, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	lvx	C2, OFFSET_1, CO3
	vmaddfp	c02, alpha, c02, C3
	lvx	C3, OFFSET_2, CO3
	vmaddfp	c03, alpha, c03, C4
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c04, alpha, c04, C5
	lvx	C5, OFFSET_4, CO3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   c07,   PERMRSHIFT2
	vperm	c07, c07,   c08,   PERMRSHIFT2
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C6
	lvx	C6, OFFSET_1, CO4
	vmaddfp	c06, alpha, c06, C7
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c07, alpha, c07, C8
	lvx	C8, OFFSET_3, CO4
	vmaddfp	c08, alpha, c08, C9
	lvx	C9, OFFSET_4, CO4

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	lvx	C1, OFFSET_0, CO3

	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   c10,   PERMRSHIFT3
	vperm	c10, c10,   c11,   PERMRSHIFT3
	vperm	c11, c11,   c12,   PERMRSHIFT3
	vperm	c12, c12,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3
	vmaddfp	c11, alpha, c11, C4
	vmaddfp	c12, alpha, c12, C5

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3
	stvx	c11, OFFSET_3, CO3
	stvx	c12, OFFSET_4, CO3

	lvx	C1, OFFSET_0, CO4

	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   c14,   PERMRSHIFT4
	vperm	c14, c14,   c15,   PERMRSHIFT4
	vperm	c15, c15,   c16,   PERMRSHIFT4
	vperm	c16, c16,   VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE

	addic.	I, I, -1
	bgt+	LL(11)
	.align 4
802
LL(20):
	/* M % 16 >= 8: one 8x4 tile (2 A vectors x 4 columns = 8
	   accumulators), K unrolled by 2.  */
	andi.	I, M,  8
	ble	LL(30)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B
	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):
	/* k-step 1 (a1/a2 with b1's four elements) */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO,  8 * SIZE

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10

	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	/* k-step 2 (a3/a4 with b2) */
	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02

	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06

	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10

	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4

LL(25):
	/* K odd: one more k-step; also prepare alpha/VZERO for stores. */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(28)
	.align 4

LL(26):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO,  4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4

LL(28):
	/* Unaligned C update, 8 floats per column: 2 accumulators are
	   shifted into 3 output vectors per column via lvsr/vperm.  */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3
	lvx	C3, OFFSET_2, CO3

	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   c10,   PERMRSHIFT3
	vperm	c10, c10,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3

	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4
	lvx	C3, OFFSET_2, CO4

	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   c14,   PERMRSHIFT4
	vperm	c14, c14,   VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2
	vmaddfp	c14, alpha, c14, C3

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	addi	CO3, CO3, 8 * SIZE
	addi	CO4, CO4, 8 * SIZE
	.align 4
976
LL(30):
	/* M % 8 >= 4: a 4x4 tile (one A vector per k-step).  K is
	   unrolled by 2 with paired accumulators (c01/c02, c05/c06, ...)
	   that are summed at LL(38).  */
	andi.	I, M,  4
	ble	LL(40)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14

	vspltw	bp1, b1, 0
	mr	BO, B

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	/* even k-step: a1 against b1's four splats */
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO,  8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO

	/* odd k-step: a2 against b2, into the shadow accumulators */
	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4

LL(35):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(38)
	.align 4

LL(36):
	/* final odd k-step */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(38):
	/* Fold the paired accumulators, then do the unaligned C update:
	   4 floats per column span at most 2 vectors.  */
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3

	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3

	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4

	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	addi	CO3, CO3, 4 * SIZE
	addi	CO4, CO4, 4 * SIZE
	.align 4
1113
LL(40):
	/* M % 4 >= 2: scalar FPU path, 2 rows x 4 columns, K unrolled
	   by 2.  f0-f7 accumulate (row, col) products; f8/f9 hold A,
	   f10-f13 hold one B row.  */
	andi.	I, M,  2
	ble	LL(50)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	/* Zero the 8 accumulators from the FZERO word. */
	lfs	f0,  FZERO(SP)
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4

LL(42):
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6

	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	/* second k-step */
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6

	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(42)
	.align 4

LL(45):
	andi.	r0,  K,  1
	ble	LL(48)
	.align 4

LL(46):
	/* final odd k-step */
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6

	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(48):
	/* C(0..1, 0..3) = alpha * acc + C */
	lfs	f13,  ALPHA(SP)

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	LFD	f8,  0 * SIZE(CO3)
	LFD	f9,  1 * SIZE(CO3)
	LFD	f10, 0 * SIZE(CO4)
	LFD	f11, 1 * SIZE(CO4)

	FMADD	f4,  f4, f13, f8
	FMADD	f5,  f5, f13, f9
	FMADD	f6,  f6, f13, f10
	FMADD	f7,  f7, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  0 * SIZE(CO2)
	STFD	f3,  1 * SIZE(CO2)

	STFD	f4,  0 * SIZE(CO3)
	STFD	f5,  1 * SIZE(CO3)
	STFD	f6,  0 * SIZE(CO4)
	STFD	f7,  1 * SIZE(CO4)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	addi	CO3, CO3, 2 * SIZE
	addi	CO4, CO4, 2 * SIZE
	.align 4
1251
LL(50):
	/* M % 2 == 1: scalar path, 1 row x 4 columns, K unrolled by 2
	   (f8/f9 alternate as the A element).  */
	andi.	I, M,  1
	ble	LL(59)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	lfs	f0,  FZERO(SP)
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(55)
	.align 4

LL(52):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f8, f12, f2
	FMADD	f3,  f8, f13, f3

	LFD	f8,   2 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	FMADD	f0,  f9, f10, f0
	FMADD	f1,  f9, f11, f1
	FMADD	f2,  f9, f12, f2
	FMADD	f3,  f9, f13, f3

	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(52)
	.align 4

LL(55):
	andi.	r0,  K,  1
	ble	LL(58)
	.align 4

LL(56):
	/* final odd k-step */
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f8, f12, f2
	FMADD	f3,  f8, f13, f3

	LFD	f8,   2 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(58):
	/* C(0, 0..3) = alpha * acc + C */
	lfs	f13,  ALPHA(SP)

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)
	LFD	f10, 0 * SIZE(CO3)
	LFD	f11, 0 * SIZE(CO4)

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)
	STFD	f2,  0 * SIZE(CO3)
	STFD	f3,  0 * SIZE(CO4)
	.align 4

LL(59):
	/* Every M sub-path consumed exactly K*4 B elements, so BO now
	   points at the next 4-column B panel; advance B and loop.  */
	mr	B, BO

	addic.	J, J, -1
	bgt	LL(01)
	.align 4
1353
LL(60):
	/* N % 4 >= 2: process a panel of two C columns. */
	andi.	r0, N,  2
	ble	LL(120)

	mr	CO1, C
	add	CO2, C,  LDC
	add	C,  CO2, LDC

	/* I = M / 16: 16-row tiles again, now with 2 columns. */
	mr	AO, A
	srawi.	I, M,  4
	ble	LL(80)
	.align 4
1366
LL(71):
	/* 16x2 micro-kernel, accumulation phase.
	   c01-c04 accumulate the 16 floats of column CO1, c05-c08 those of
	   CO2 (vector register aliases are #defined earlier in the file). */
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B
	dcbtst	CO1, PREC		/* prefetch both C rows for the write-back */
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0		/* broadcast B(k, col0) to all four lanes */

	srawi.	r0,  K,  1		/* main loop handles two k per trip */
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

LL(72):
	/* K unrolled by 2: a1-a4 carry the 16 A values for k,
	   a5-a8 those for k+1; b1 holds {b(k,0), b(k,1), b(k+1,0), b(k+1,1)}. */
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01	/* column 0, iteration k */
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05	/* column 1, iteration k */
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01	/* column 0, iteration k+1 */
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	LOAD_B	b1, OFFSET_1, BO	/* next pair of B columns before BO bumps */
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05	/* column 1, iteration k+1 */
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE	/* 16 floats x 2 k-steps */
	addi	BO, BO,  4 * SIZE	/* 2 columns x 2 k-steps */

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

LL(75):
	/* Prepare for write-back while testing the K-odd tail. */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP	/* alpha vector saved on the stack by the prologue */
	vxor	VZERO, VZERO, VZERO
	ble+	LL(78)
	.align 4

LL(76):
	/* K odd: one final k iteration (operands already pre-loaded). */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4
1457
LL(78):
	/* Write-back for the 16x2 block: C(:,j) += alpha * acc.
	   lvsr builds a byte-rotate mask from the low bits of the C pointer;
	   the vperm chain shifts the 5 accumulator vectors so they can be
	   merged into the (possibly unaligned) 16-float destination and
	   stored with aligned stvx. */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	/* NOTE(review): removed two dead "lvsr PERMRSHIFT3/4, 0, CO3/CO4"
	   left over from the 4-column kernel: this N&2 path only has
	   columns CO1/CO2 and never reads PERMRSHIFT3/4 (CO3/CO4 are stale
	   here; lvsr only inspects the address, so the originals were
	   harmless but misleading). */

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1	/* C += alpha * acc, column 0 */
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   c07,   PERMRSHIFT2
	vperm	c07, c07,   c08,   PERMRSHIFT2
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1	/* C += alpha * acc, column 1 */
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3
	vmaddfp	c07, alpha, c07, C4
	vmaddfp	c08, alpha, c08, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(71)			/* next 16-row block */
	.align 4
1517
LL(80):
	/* 8x2 edge kernel (M & 8). */
	andi.	I, M,  8
	ble	LL(90)

	/* c01/c02 + c05/c06 accumulate even k, c03/c04 + c07/c08 odd k;
	   the two halves are folded together at LL(88). */
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO	/* a1/a2 = 8 floats for k, a3/a4 for k+1 */
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1		/* two k per loop trip */
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

LL(82):
	vmaddfp	c01, a1, bp1, c01	/* k: column 0 */
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c05, a1, bp2, c05	/* k: column 1 */
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03	/* k+1: column 0 */
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c07, a3, bp2, c07	/* k+1: column 1 */
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE	/* 8 floats x 2 k-steps */
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(82)
	.align 4

LL(85):
	andi.	r0,  K,  1		/* K-odd tail? */
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(88)
	.align 4

LL(86):
	/* Final odd k: only the even-k accumulators are updated. */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4
1589
LL(88):
	/* Write-back for the 8x2 block: fold the odd-k partial sums into
	   the even-k accumulators, then C(:,j) += alpha * acc using the
	   lvsr/vperm unaligned-store idiom (3 vectors cover 8 floats at an
	   arbitrary alignment). */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	vaddfp	c01, c01, c03		/* merge k-odd partials */
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	/* NOTE(review): removed dead "lvsr PERMRSHIFT3/4, 0, CO3/CO4"
	   (leftovers from the 4-column kernel; never read in this path). */

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1	/* column 0 */
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1	/* column 1 */
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4
1636
LL(90):
	/* 4x2 edge kernel (M & 4): one vector per column.
	   c01/c05 accumulate even k, c02/c06 odd k (folded at LL(98)). */
	andi.	I, M,  4
	ble	LL(100)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO	/* a1 = 4 floats for k, a2 for k+1 */
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06

	mr	BO, B

	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1		/* two k per loop trip */
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

LL(92):
	vmaddfp	c01, a1, bp1, c01	/* k: column 0 */
	vspltw	bp2, b1, 1

	vmaddfp	c05, a1, bp2, c05	/* k: column 1 */
	vspltw	bp1, b1, 2

	vmaddfp	c02, a2, bp1, c02	/* k+1: column 0 */
	vspltw	bp2, b1, 3

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c06, a2, bp2, c06	/* k+1: column 1 */

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(92)
	.align 4

LL(95):
	andi.	r0,  K,  1		/* K-odd tail? */
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(98)
	.align 4

LL(96):
	/* Final odd k. */
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4
1695
LL(98):
	/* Write-back for the 4x2 block: fold odd-k partials, then
	   C(:,j) += alpha * acc via the lvsr/vperm unaligned-store idiom
	   (2 vectors cover 4 floats at an arbitrary alignment). */
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	/* NOTE(review): removed "vaddfp c09,c09,c10" and "vaddfp c13,c13,c14"
	   — copy-paste leftovers from the wider kernel. In this 4x2 path
	   c09/c10/c13/c14 are never initialized and the sums were never
	   read, so the instructions only consumed stale register contents. */

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	/* NOTE(review): removed dead "lvsr PERMRSHIFT3/4, 0, CO3/CO4"
	   (only two columns exist here; PERMRSHIFT3/4 were never read). */

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1	/* column 0 */
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1	/* column 1 */
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	.align 4
1734
LL(100):
	/* 2x2 edge kernel (M & 2), scalar FPU.
	   f0-f3 accumulate even k, f4-f7 odd k; folded at LL(108). */
	andi.	I, M,  2
	ble	LL(110)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a(k,0), a(k,1) */
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)	/* b(k,0), b(k,1), b(k+1,0), b(k+1,1) */
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	lfs	f0,  FZERO(SP)		/* zero the accumulators from the stack constant */
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1		/* two k per loop trip */
	mtspr	CTR, r0
	ble	LL(105)
	.align 4

LL(102):
	FMADD	f0,  f8, f10, f0	/* k: f0/f1 col0, f2/f3 col1 */
	FMADD	f1,  f9, f10, f1
	FMADD	f2,  f8, f11, f2
	FMADD	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)	/* a(k+1, 0..1) */
	LFD	f9,   3 * SIZE(AO)

	FMADD	f4,  f8, f12, f4	/* k+1 into the shadow accumulators */
	FMADD	f5,  f9, f12, f5
	FMADD	f6,  f8, f13, f6
	FMADD	f7,  f9, f13, f7

	LFD	f8,  4 * SIZE(AO)	/* pre-load operands for the next trip */
	LFD	f9,  5 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(102)
	.align 4

LL(105):
	andi.	r0,  K,  1		/* K-odd tail? */
	lfs	f13,  ALPHA(SP)		/* f13 is reused: now holds alpha */
	ble	LL(108)
	.align 4

LL(106):
	/* Final odd k (operands already loaded). */
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f9, f10, f1
	FMADD	f2,  f8, f11, f2
	FMADD	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

LL(108):
	/* C += alpha * acc for the 2x2 block. */
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FADD	f0, f0, f4		/* merge odd-k partials */
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7

	FMADD	f0,  f0, f13, f8	/* f13 = alpha */
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  0 * SIZE(CO2)
	STFD	f3,  1 * SIZE(CO2)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	.align 4
1837
LL(110):
	/* 1x2 edge kernel (M & 1), scalar FPU.
	   f0/f1 accumulate even k, f2/f3 odd k; folded at LL(118). */
	andi.	I, M,  1
	ble	LL(119)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a(k), a(k+1) */
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)	/* b(k,0), b(k,1), b(k+1,0), b(k+1,1) */
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1		/* two k per loop trip */
	mtspr	CTR, r0
	ble	LL(115)
	.align 4

LL(112):
	FMADD	f0,  f8, f10, f0	/* k:   col0 / col1 */
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f9, f12, f2	/* k+1: col0 / col1 (shadow accumulators) */
	FMADD	f3,  f9, f13, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(112)
	.align 4

LL(115):
	andi.	r0,  K,  1		/* K-odd tail? */
	lfs	f13,  ALPHA(SP)		/* f13 reused as alpha from here on */
	ble	LL(118)
	.align 4

LL(116):
	/* Final odd k. */
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1

	LFD	f8,   1 * SIZE(AO)

	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

LL(118):
	/* C += alpha * acc for the 1x2 block. */
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)

	FADD	f0, f0, f2		/* merge odd-k partials */
	FADD	f1, f1, f3

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)
	.align 4

LL(119):
	mr	B, BO			/* advance B past the consumed 2-column panel */
	.align 4
1917
LL(120):
	/* N & 1 path: last remaining column of C. */
	andi.	r0, N,  1
	ble	LL(999)			/* done -> epilogue */

	mr	CO1, C			/* single output column */
	mr	AO, A			/* restart the packed A panel */
	srawi.	I, M,  4		/* I = number of 16-row blocks */
	ble	LL(140)
	.align 4
1927
LL(130):
	/* 16x1 micro-kernel.  c01-c04 accumulate 16 floats of the single
	   output column; J is reused here as the k counter. */
	vxor	c01, c01, c01
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	vxor	c04, c04, c04

	mr	BO, B

	dcbtst	CO1, PREC		/* prefetch the C row for the write-back */

	mr	J, K

	/* If B is not 16B aligned, peel up to two k iterations so the main
	   loop can use aligned lvx on B.  The peel splats lanes 2 then 3 of
	   the aligned quadword containing B[0]; this appears to assume B is
	   8-byte aligned (misaligned by exactly two floats) — the packing
	   code earlier in the file presumably guarantees that. */
	andi.	r0,  B,  15
	ble+	LL(131)			/* already aligned */

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2		/* b[0] sits in lane 2 of the aligned word */
	vspltw	bp2, b1,  3		/* b[1] in lane 3 */

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01	/* first peeled k */
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)			/* K exhausted during the peel */

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01	/* second peeled k */
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)
	.align 4


LL(131):
	/* Aligned main path: pre-load A for k and k+1, B for four k. */
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO	/* b1 = {b(k), b(k+1), b(k+2), b(k+3)} */

	srawi.	r0,  J,  2		/* main loop handles four k per trip */
	mtspr	CTR, r0
	ble	LL(135)
	.align 4

LL(133):
	/* K unrolled by 4: 64 floats of A consumed per trip. */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01	/* k */
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vspltw	bp2, b1,  1
	vmaddfp	c01, a5, bp2, c01	/* k+1 */
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01	/* k+2 */
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vspltw	bp2, b1,  3
	vmaddfp	c01, a5, bp2, c01	/* k+3 */
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(133)
	.align 4

LL(135):
	/* Handle the K mod 4 remainder (3, 2 or 1 iterations). */
	andi.	r0,  J,  3
	ble+	LL(138)

	cmpwi	cr0, r0, 3
	bne	LL(136)

	/* remainder == 3 */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp2, b1,  1
	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  3 * SIZE
	b	LL(138)
	.align 4

LL(136):
	cmpwi	cr0, r0, 2
	bne	LL(137)

	/* remainder == 2 */
	vspltw	bp1, b1,  0
	vspltw	bp2, b1,  1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a1, OFFSET_4, AO	/* second k's 16 floats */
	LOAD_A	a2, OFFSET_5, AO
	LOAD_A	a3, OFFSET_6, AO
	LOAD_A	a4, OFFSET_7, AO

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  2 * SIZE
	b	LL(138)
	.align 4

LL(137):
	cmpwi	cr0, r0, 1
	bne	LL(138)

	/* remainder == 1 */
	vspltw	bp1, b1,  0

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

LL(138):
	/* Write-back: C(:,0) += alpha * acc with the lvsr/vperm
	   unaligned-store idiom (5 vectors cover 16 floats). */
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(130)			/* next 16-row block */
	.align 4
2172
LL(140):
	/* 8x1 edge kernel (M & 8); same structure as LL(130) but with two
	   accumulator vectors (c01/c02) and an identical B-alignment peel. */
	andi.	I, M,  8
	ble	LL(150)

	vxor	c01, c01, c01
	vxor	c02, c02, c02

	mr	BO, B

	mr	J, K			/* J reused as the k counter */

	andi.	r0,  B,  15		/* peel while B is not 16B aligned */
	ble+	LL(141)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2		/* b[0] in lane 2 of the aligned quadword */
	vspltw	bp2, b1,  3

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01	/* first peeled k */
	vmaddfp	c02, a2, bp1, c02
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01	/* second peeled k */
	vmaddfp	c02, a2, bp2, c02
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)
	.align 4


LL(141):
	/* Aligned main path: a1-a8 hold four k's worth of 8 floats each. */
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0,  J,  2		/* four k per trip */
	mtspr	CTR, r0
	ble	LL(145)
	.align 4

LL(143):
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01	/* k */
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1,  1
	vmaddfp	c01, a3, bp2, c01	/* k+1 */
	vmaddfp	c02, a4, bp2, c02

	vspltw	bp1, b1,  2
	vmaddfp	c01, a5, bp1, c01	/* k+2 */
	vmaddfp	c02, a6, bp1, c02

	vspltw	bp2, b1,  3
	vmaddfp	c01, a7, bp2, c01	/* k+3 */
	vmaddfp	c02, a8, bp2, c02

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(143)
	.align 4

LL(145):
	/* K mod 4 remainder (3, 2 or 1 iterations). */
	andi.	r0,  J,  3
	ble+	LL(148)

	cmpwi	cr0, r0, 3
	bne	LL(146)

	/* remainder == 3 */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1,  1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02


	addi	AO, AO, 24 * SIZE
	addi	BO, BO,  3 * SIZE
	b	LL(148)
	.align 4

LL(146):
	cmpwi	cr0, r0, 2
	bne	LL(147)

	/* remainder == 2 */
	vspltw	bp1, b1,  0
	vspltw	bp2, b1,  1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  2 * SIZE
	b	LL(148)
	.align 4

LL(147):
	cmpwi	cr0, r0, 1
	bne	LL(148)

	/* remainder == 1 */
	vspltw	bp1, b1,  0

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

LL(148):
	/* Write-back: C(:,0) += alpha * acc (3 vectors cover 8 unaligned
	   floats via lvsr/vperm). */
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	addi	CO1, CO1, 8 * SIZE
	.align 4
2350
LL(150):
	/* 4x1 edge kernel (M & 4): single accumulator vector c01,
	   with the same B-alignment peel as LL(130)/LL(140). */
	andi.	I, M,  4
	ble	LL(160)

	vxor	c01, c01, c01

	mr	BO, B

	mr	J, K			/* J reused as the k counter */

	andi.	r0,  B,  15		/* peel while B is not 16B aligned */
	ble+	LL(151)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2		/* b[0] in lane 2 of the aligned quadword */
	vspltw	bp2, b1,  3

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01	/* first peeled k */
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)

	LOAD_A	a1, OFFSET_0, AO
	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01	/* second peeled k */
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)
	.align 4


LL(151):
	/* Aligned main path: a1-a4 hold four consecutive k's 4 floats. */
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0,  J,  2		/* four k per trip */
	mtspr	CTR, r0
	ble	LL(155)
	.align 4

LL(153):
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01	/* k   */
	vspltw	bp2, b1,  1
	vmaddfp	c01, a2, bp2, c01	/* k+1 */
	vspltw	bp1, b1,  2
	vmaddfp	c01, a3, bp1, c01	/* k+2 */
	vspltw	bp2, b1,  3
	vmaddfp	c01, a4, bp2, c01	/* k+3 */

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(153)
	.align 4

LL(155):
	/* K mod 4 remainder (3, 2 or 1 iterations). */
	andi.	r0,  J,  3
	ble+	LL(158)

	cmpwi	cr0, r0, 3
	bne	LL(156)

	/* remainder == 3 */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1,  1
	vmaddfp	c01, a2, bp2, c01
	vspltw	bp1, b1,  2
	vmaddfp	c01, a3, bp1, c01

	addi	AO, AO, 12 * SIZE
	addi	BO, BO,  3 * SIZE
	b	LL(158)
	.align 4

LL(156):
	cmpwi	cr0, r0, 2
	bne	LL(157)

	/* remainder == 2 */
	vspltw	bp1, b1,  0
	vspltw	bp2, b1,  1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c01, a2, bp2, c01

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  2 * SIZE
	b	LL(158)
	.align 4

LL(157):
	cmpwi	cr0, r0, 1
	bne	LL(158)

	/* remainder == 1 */
	vspltw	bp1, b1,  0

	vmaddfp	c01, a1, bp1, c01

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

LL(158):
	/* Write-back: C(:,0) += alpha * acc (2 vectors cover 4 unaligned
	   floats via lvsr/vperm). */
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	addi	CO1, CO1, 4 * SIZE
	.align 4
2488
LL(160):
	/* 2x1 edge kernel (M & 2), scalar FPU.
	   f0/f1 accumulate even k, f2/f3 odd k; folded at LL(168). */
	andi.	I, M,  2
	ble	LL(170)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a(k, 0..1) */
	LFD	f9,   1 * SIZE(AO)
	LFD	f10,  2 * SIZE(AO)	/* a(k+1, 0..1) */
	LFD	f11,  3 * SIZE(AO)

	LFD	f12,  0 * SIZE(B)	/* b(k), b(k+1) */
	LFD	f13,  1 * SIZE(B)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1		/* two k per loop trip */
	mtspr	CTR, r0
	ble	LL(165)
	.align 4

LL(162):
	FMADD	f0,  f8,  f12, f0	/* k   */
	FMADD	f1,  f9,  f12, f1
	FMADD	f2,  f10, f13, f2	/* k+1 (shadow accumulators) */
	FMADD	f3,  f11, f13, f3

	LFD	f8,   4 * SIZE(AO)	/* pre-load next trip's operands */
	LFD	f9,   5 * SIZE(AO)
	LFD	f10,  6 * SIZE(AO)
	LFD	f11,  7 * SIZE(AO)

	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	bdnz	LL(162)
	.align 4

LL(165):
	andi.	r0,  K,  1		/* K-odd tail? */
	lfs	f13,  ALPHA(SP)		/* f13 reused as alpha from here on */
	ble	LL(168)
	.align 4

LL(166):
	/* Final odd k. */
	FMADD	f0,  f8, f12, f0
	FMADD	f1,  f9, f12, f1

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

LL(168):
	/* C += alpha * acc for the 2x1 block. */
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)

	FADD	f0, f0, f2		/* merge odd-k partials */
	FADD	f1, f1, f3

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)

	addi	CO1, CO1, 2 * SIZE
	.align 4
2561
LL(170):
	/* 1x1 edge kernel (M & 1): a plain dot product.
	   f0 accumulates even k, f1 odd k; folded at LL(178). */
	andi.	I, M,  1
	ble	LL(999)			/* done -> epilogue */

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a(k), a(k+1) */
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)	/* b(k), b(k+1) */
	LFD	f11,  1 * SIZE(B)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0

	srawi.	r0,  K,  1		/* two k per loop trip */
	mtspr	CTR, r0
	ble	LL(175)
	.align 4

LL(172):
	FMADD	f0,  f8, f10, f0	/* k   */
	FMADD	f1,  f9, f11, f1	/* k+1 */

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  2 * SIZE
	bdnz	LL(172)
	.align 4

LL(175):
	andi.	r0,  K,  1		/* K-odd tail? */
	lfs	f13,  ALPHA(SP)
	ble	LL(178)
	.align 4

LL(176):
	/* Final odd k. */
	FMADD	f0,  f8, f10, f0

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

LL(178):
	/* C += alpha * acc for the single element. */
	LFD	f8,  0 * SIZE(CO1)

	FADD	f0, f0, f1		/* merge odd-k partial */

	FMADD	f0,  f0, f13, f8

	STFD	f0,  0 * SIZE(CO1)
	.align 4
2618
LL(999):
	/* Epilogue: restore everything the prologue (earlier in the file)
	   saved, then return. */
	mr	SP, STACK		/* STACK = saved stack base from the prologue */

	/* Restore the non-volatile AltiVec registers v20-v31 from the
	   16-byte slots at the bottom of the frame. */
	li	r0,  0 * 16
	lvx	v20, SP, r0
	li	r0,  1 * 16
	lvx	v21, SP, r0
	li	r0,  2 * 16
	lvx	v22, SP, r0
	li	r0,  3 * 16
	lvx	v23, SP, r0
	li	r0,  4 * 16
	lvx	v24, SP, r0
	li	r0,  5 * 16
	lvx	v25, SP, r0
	li	r0,  6 * 16
	lvx	v26, SP, r0
	li	r0,  7 * 16
	lvx	v27, SP, r0
	li	r0,  8 * 16
	lvx	v28, SP, r0
	li	r0,  9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG		/* restore the caller's VRsave mask */

	/* Restore the non-volatile GPRs r14-r31; slot size and offsets
	   differ between the 64-bit and 32-bit ABIs. */
#ifdef __64BIT__
	ld	r31,  192(SP)
	ld	r30,  200(SP)
	ld	r29,  208(SP)
	ld	r28,  216(SP)
	ld	r27,  224(SP)
	ld	r26,  232(SP)
	ld	r25,  240(SP)
	ld	r24,  248(SP)
	ld	r23,  256(SP)
	ld	r22,  264(SP)
	ld	r21,  272(SP)
	ld	r20,  280(SP)
	ld	r19,  288(SP)
	ld	r18,  296(SP)
	ld	r17,  304(SP)
	ld	r16,  312(SP)
	ld	r15,  320(SP)
	ld	r14,  328(SP)
#else
	lwz	r31,  192(SP)
	lwz	r30,  196(SP)
	lwz	r29,  200(SP)
	lwz	r28,  204(SP)
	lwz	r27,  208(SP)
	lwz	r26,  212(SP)
	lwz	r25,  216(SP)
	lwz	r24,  220(SP)
	lwz	r23,  224(SP)
	lwz	r22,  228(SP)
	lwz	r21,  232(SP)
	lwz	r20,  236(SP)
	lwz	r19,  240(SP)
	lwz	r18,  244(SP)
	lwz	r17,  248(SP)
	lwz	r16,  252(SP)
	lwz	r15,  256(SP)
	lwz	r14,  260(SP)
#endif

	addi	SP, SP, STACKSIZE	/* pop the frame (360 / 272 bytes) */

	blr

	EPILOGUE
#endif	/* closes an #if opened before this chunk (OS/ABI selection) */
