1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Register aliases used by the routine below.
 *
 *   M   - row count: drives the unrolled loop trip counts and the tails
 *   N   - column count: consumed in panels of 8, 4, 2 and finally 1
 *   A   - source pointer, advanced by LDA for every column consumed
 *   LDA - column stride; shifted left by BASE_SHIFT on entry
 *   B   - destination pointer, written front to back
 *
 * NOTE(review): the r3..r7 assignment presumably mirrors the argument
 * order of the C prototype (m, n, a, lda, b) - confirm against the caller.
 */
25#define	M	r3
26#define	N	r4
27#define	A	r5
28#define	LDA	r6
29#define B	r7
30
/* Read cursors for columns 1..4 of the current panel. */
31#define AO1	r8
32#define AO2	r9
33#define AO3	r10
34#define AO4	r11
35
/* Count of 8-column panels still to do; also receives the N & 4/2/1 tests. */
36#define J	r12
37
/*
 * Read cursors for columns 5..8, plus the two index registers.
 * r26..r31 are pushed on entry and popped at .L999.
 */
38#define AO5	r26
39#define AO6	r27
40#define AO7	r28
41#define AO8	r29
/* INC is set to 1 * SIZE and INC2 to 2 * SIZE before the copy loops. */
42#define INC	r30
43#define INC2	r31
44
/* c01..c32 are plain names for the floating-point registers f0..f31. */
45#define c01	f0
46#define c02	f1
47#define c03	f2
48#define c04	f3
49#define c05	f4
50#define c06	f5
51#define c07	f6
52#define c08	f7
53#define c09	f8
54#define c10	f9
55#define c11	f10
56#define c12	f11
57#define c13	f12
58#define c14	f13
59#define c15	f14
60#define c16	f15
61
62#define c17	f16
63#define c18	f17
64#define c19	f18
65#define c20	f19
66#define c21	f20
67#define c22	f21
68#define c23	f22
69#define c24	f23
70#define c25	f24
71#define c26	f25
72#define c27	f26
73#define c28	f27
74#define c29	f28
75#define c30	f29
76#define c31	f30
77#define c32	f31
78
/*
 * fpsel mask registers. They occupy the same registers as c31/c32 (f30/f31).
 * Only the code from .L100 onwards loads data into c31/c32, and that code
 * contains no fpsel, so the two uses never overlap.
 */
79#define	sel_p	f30
80#define	sel_s	f31
81
82
/*
 * Routine entry: save registers, build the two fpsel masks, reject empty
 * input and choose between the aligned path (falls through to .L11/.L20)
 * and the unaligned path (.L100).
 * PROLOGUE and PROFCODE are macros expected to come from common.h.
 */
83	PROLOGUE
84	PROFCODE
85
/* Push f14..f31 below SP, one 16-byte slot per register (stride r0 = -16). */
86	li	r0, -16
87
88	stfpdux	f14, SP, r0
89	stfpdux	f15, SP, r0
90	stfpdux	f16, SP, r0
91	stfpdux	f17, SP, r0
92	stfpdux	f18, SP, r0
93	stfpdux	f19, SP, r0
94	stfpdux	f20, SP, r0
95	stfpdux	f21, SP, r0
96	stfpdux	f22, SP, r0
97	stfpdux	f23, SP, r0
98	stfpdux	f24, SP, r0
99	stfpdux	f25, SP, r0
100	stfpdux	f26, SP, r0
101	stfpdux	f27, SP, r0
102	stfpdux	f28, SP, r0
103	stfpdux	f29, SP, r0
104	stfpdux	f30, SP, r0
105	stfpdux	f31, SP, r0
106
/* Push r31 down to r26 (the registers behind INC2, INC and AO8..AO5). */
107	stwu	r31,  -4(SP)
108	stwu	r30,  -4(SP)
109	stwu	r29,  -4(SP)
110	stwu	r28,  -4(SP)
111
112	stwu	r27,  -4(SP)
113	stwu	r26,  -4(SP)
114
/*
 * r9/r10 are free here (they only become AO2/AO3 later). They receive
 * 0x3f800000 and 0xbf800000, the IEEE-754 single-precision bit patterns
 * of +1.0 and -1.0.
 */
115	lis	r9,   0x3f80
116	lis	r10,  0xbf80
117
/* Words now on the stack, lowest address first: +1.0, -1.0, -1.0, +1.0. */
118	stwu	r9,    -4(SP)
119	stwu	r10,   -4(SP)
120	stwu	r10,   -4(SP)
121	stwu	r9,    -4(SP)
122
/* NOTE(review): BASE_SHIFT is presumably log2(SIZE), turning LDA into bytes - confirm in common.h. */
123	slwi	LDA, LDA, BASE_SHIFT
124
/*
 * sel_p is read from the first word pair (+1.0, -1.0) and sel_s from the
 * second (-1.0, +1.0); they are the mask operand of every fpsel below.
 * NOTE(review): lfpsux looks like an update-form load, which would leave
 * SP 8 bytes above the lowest pushed word. The restore sequence at .L999
 * is consistent with that - confirm against the ISA documentation.
 */
125	li	r0, 0
126	lfpsux	sel_p, SP, r0
127	li	r0, 8
128	lfpsux	sel_s, SP, r0
129
/* Nothing to copy when M <= 0 or N <= 0: go straight to the restore code. */
130	cmpwi	cr0, M, 0
131	ble-	.L999
132	cmpwi	cr0, N, 0
133	ble-	.L999
134
/* Index operands shared by every load and store in the copy loops. */
135	li	INC,  1 * SIZE
136	li	INC2, 2 * SIZE
137
/*
 * Bias B down by 2 * SIZE; every store below addresses B with INC2.
 * NOTE(review): presumably the stores are update-form (write B + INC2 and
 * keep the sum in B), so the first store lands on the original B - confirm.
 */
138	subi	B, B, 2 * SIZE
139
/* Use .L100 unless both A and the byte stride LDA are multiples of 2 * SIZE. */
140	andi.	r0, A,   2 * SIZE - 1
141	bne	.L100
142	andi.	r0, LDA, 2 * SIZE - 1
143	bne	.L100
144
/* Aligned path: bias A the same way as B, then J = N >> 3 eight-column panels. */
145	subi	A, A, 2 * SIZE
146	srawi.	J,  N,  3
147	ble	.L20
148	.align 4
/*
 * Aligned path, 8-column panel. Runs J = N >> 3 times.
 * AO1..AO8 are set to eight consecutive columns (LDA bytes apart) and A is
 * moved past them, ready for the next panel.
 */
149.L11:
150	mr	AO1, A
151	add	AO2, A,   LDA
152	add	AO3, AO2, LDA
153	add	AO4, AO3, LDA
154	add	AO5, AO4, LDA
155	add	AO6, AO5, LDA
156	add	AO7, AO6, LDA
157	add	AO8, AO7, LDA
158	add	A,   AO8, LDA
159
/* CTR = M >> 2; skip the main loop when that is zero. */
160	srawi.	r0,  M,  2
161	mtspr	CTR, r0
162	ble	.L15
163	.align 4
164
/*
 * Main loop: each pass issues two INC2-strided loads per column cursor
 * (16 loads) and 16 stores to B.
 * Odd cursors use LFPDUX and even cursors LFXDUX; fpsel with sel_p / sel_s
 * then merges each odd/even register pair. sel_p results are stored with
 * STFPDUX and sel_s results with STFXDUX.
 * NOTE(review): presumably LFXDUX loads a pair with its halves swapped, so
 * the sel_p merge yields (col k, col k+1) of one row and the sel_s merge,
 * stored swapped, yields the same two columns of the next row - confirm
 * against the paired-FPU ISA documentation.
 * Loads, selects and stores are interleaved by hand; keep the order.
 */
165.L12:
166	LFPDUX	c01,   AO1, INC2
167	LFXDUX	c02,   AO2, INC2
168	LFPDUX	c03,   AO3, INC2
169	LFXDUX	c04,   AO4, INC2
170
171	LFPDUX	c05,   AO5, INC2
172	LFXDUX	c06,   AO6, INC2
173	LFPDUX	c07,   AO7, INC2
174	LFXDUX	c08,   AO8, INC2
175
176	LFPDUX	c09,   AO1, INC2
177	LFXDUX	c10,   AO2, INC2
178	LFPDUX	c11,   AO3, INC2
179	LFXDUX	c12,   AO4, INC2
180	fpsel	c17, sel_p, c01, c02
181
182	LFPDUX	c13,   AO5, INC2
183	fpsel	c18, sel_p, c03, c04
184	LFXDUX	c14,   AO6, INC2
185	fpsel	c19, sel_p, c05, c06
186	LFPDUX	c15,   AO7, INC2
187	fpsel	c20, sel_p, c07, c08
188	LFXDUX	c16,   AO8, INC2
189	fpsel	c21, sel_s, c01, c02
190
191	fpsel	c22, sel_s, c03, c04
192	STFPDUX	c17,   B, INC2
193	fpsel	c23, sel_s, c05, c06
194	STFPDUX	c18,   B, INC2
195	fpsel	c24, sel_s, c07, c08
196	STFPDUX	c19,   B, INC2
197
/* c01..c08 are free again and are reused for the second load group (c09..c16). */
198	fpsel	c01, sel_p, c09, c10
199	STFPDUX	c20,   B, INC2
200	fpsel	c02, sel_p, c11, c12
201	STFXDUX	c21,   B, INC2
202	fpsel	c03, sel_p, c13, c14
203	STFXDUX	c22,   B, INC2
204	fpsel	c04, sel_p, c15, c16
205	STFXDUX	c23,   B, INC2
206
207	fpsel	c05, sel_s, c09, c10
208	STFXDUX	c24,   B, INC2
209	fpsel	c06, sel_s, c11, c12
210	STFPDUX	c01,   B, INC2
211	fpsel	c07, sel_s, c13, c14
212	STFPDUX	c02,   B, INC2
213	fpsel	c08, sel_s, c15, c16
214	STFPDUX	c03,   B, INC2
215
216	STFPDUX	c04,   B, INC2
217	STFXDUX	c05,   B, INC2
218	STFXDUX	c06,   B, INC2
219	STFXDUX	c07,   B, INC2
220	STFXDUX	c08,   B, INC2
221	bdnz	.L12
222	.align 4
223
/*
 * Tail for the M & 3 leftover rows. The andi. result is never negative, so
 * "ble" here behaves as "branch if zero".
 */
224.L15:
225	andi.	r0,  M,  3
226	ble	.L19
227
/* M & 2: one load per column cursor, same select/store scheme, 8 stores. */
228	andi.	r0,  M,  2
229	beq	.L17
230
231	LFPDUX	c01,   AO1, INC2
232	LFXDUX	c02,   AO2, INC2
233	LFPDUX	c03,   AO3, INC2
234	LFXDUX	c04,   AO4, INC2
235
236	LFPDUX	c05,   AO5, INC2
237	fpsel	c09, sel_p, c01, c02
238	LFXDUX	c06,   AO6, INC2
239	fpsel	c10, sel_p, c03, c04
240	LFPDUX	c07,   AO7, INC2
241	fpsel	c11, sel_p, c05, c06
242	LFXDUX	c08,   AO8, INC2
243	fpsel	c12, sel_p, c07, c08
244
245	fpsel	c13, sel_s, c01, c02
246	fpsel	c14, sel_s, c03, c04
247	STFPDUX	c09,   B, INC2
248	fpsel	c15, sel_s, c05, c06
249	STFPDUX	c10,   B, INC2
250	fpsel	c16, sel_s, c07, c08
251	STFPDUX	c11,   B, INC2
252
253	STFPDUX	c12,   B, INC2
254	STFXDUX	c13,   B, INC2
255	STFXDUX	c14,   B, INC2
256	STFXDUX	c15,   B, INC2
257	STFXDUX	c16,   B, INC2
258	.align 4
259
/*
 * M & 1: each of c01..c04 is filled by an LFDUX from an odd cursor and an
 * LFSDUX from the following even cursor, then written with one STFPDUX.
 * NOTE(review): presumably LFDUX fills the primary half and LFSDUX the
 * secondary half of the register - confirm.
 */
260.L17:
261	andi.	r0,  M,  1
262	beq	.L19
263
264	LFDUX	c01,   AO1, INC2
265	LFDUX	c02,   AO3, INC2
266	LFDUX	c03,   AO5, INC2
267	LFDUX	c04,   AO7, INC2
268
269	LFSDUX	c01,   AO2, INC2
270	LFSDUX	c02,   AO4, INC2
271	LFSDUX	c03,   AO6, INC2
272	LFSDUX	c04,   AO8, INC2
273
274	STFPDUX	c01,   B, INC2
275	STFPDUX	c02,   B, INC2
276	STFPDUX	c03,   B, INC2
277	STFPDUX	c04,   B, INC2
278	.align 4
279
/* Next 8-column panel while J > 0. */
280.L19:
281	addic.	J, J, -1
282	bgt	.L11
283	.align 4
284
/*
 * Aligned path, 4-column panel. Executed once, when N & 4 is non-zero.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
285.L20:
286	andi.	J,  N,  4
287	ble	.L30
288	.align 4
/* AO1..AO4 cover four consecutive columns; A moves past them. */
289.L21:
290	mr	AO1, A
291	add	AO2, A,   LDA
292	add	AO3, AO2, LDA
293	add	AO4, AO3, LDA
294	add	A,   AO4, LDA
295
/* CTR = M >> 3; skip the main loop when that is zero. */
296	srawi.	r0,  M,  3
297	mtspr	CTR, r0
298	ble	.L25
299	.align 4
300
/*
 * Main loop: four INC2-strided loads per column cursor (16 loads) and
 * 16 stores per pass. For each load group the two sel_p merges are stored
 * with STFPDUX, followed by the two sel_s merges stored with STFXDUX.
 * NOTE(review): presumably this emits two consecutive output rows of four
 * columns per load group - confirm against the paired-FPU ISA semantics.
 * The instruction order is hand-scheduled; keep it.
 */
301.L22:
302	LFPDUX	c01,   AO1, INC2
303	LFXDUX	c02,   AO2, INC2
304	LFPDUX	c03,   AO3, INC2
305	LFXDUX	c04,   AO4, INC2
306
307	LFPDUX	c05,   AO1, INC2
308	LFXDUX	c06,   AO2, INC2
309	LFPDUX	c07,   AO3, INC2
310	LFXDUX	c08,   AO4, INC2
311
312	LFPDUX	c09,   AO1, INC2
313	LFXDUX	c10,   AO2, INC2
314	LFPDUX	c11,   AO3, INC2
315	LFXDUX	c12,   AO4, INC2
316	fpsel	c17, sel_p, c01, c02
317
318	LFPDUX	c13,   AO1, INC2
319	fpsel	c18, sel_p, c03, c04
320	LFXDUX	c14,   AO2, INC2
321	fpsel	c19, sel_s, c01, c02
322	LFPDUX	c15,   AO3, INC2
323	fpsel	c20, sel_s, c03, c04
324	LFXDUX	c16,   AO4, INC2
325	fpsel	c21, sel_p, c05, c06
326
327	fpsel	c22, sel_p, c07, c08
328	STFPDUX	c17,   B, INC2
329	fpsel	c23, sel_s, c05, c06
330	STFPDUX	c18,   B, INC2
331	fpsel	c24, sel_s, c07, c08
332	STFXDUX	c19,   B, INC2
333
/* c01..c08 are reused as merge targets for the third and fourth load groups. */
334	fpsel	c01, sel_p, c09, c10
335	STFXDUX	c20,   B, INC2
336	fpsel	c02, sel_p, c11, c12
337	STFPDUX	c21,   B, INC2
338	fpsel	c03, sel_s, c09, c10
339	STFPDUX	c22,   B, INC2
340	fpsel	c04, sel_s, c11, c12
341	STFXDUX	c23,   B, INC2
342
343	fpsel	c05, sel_p, c13, c14
344	STFXDUX	c24,   B, INC2
345	fpsel	c06, sel_p, c15, c16
346	STFPDUX	c01,   B, INC2
347	fpsel	c07, sel_s, c13, c14
348	STFPDUX	c02,   B, INC2
349	fpsel	c08, sel_s, c15, c16
350	STFXDUX	c03,   B, INC2
351
352	STFXDUX	c04,   B, INC2
353	STFPDUX	c05,   B, INC2
354	STFPDUX	c06,   B, INC2
355	STFXDUX	c07,   B, INC2
356	STFXDUX	c08,   B, INC2
357	bdnz	.L22
358	.align 4
359
/* Tail for the M & 7 leftover rows, handled as 4 + 2 + 1. */
360.L25:
361	andi.	r0,  M,  7
362	ble	.L30
363
/* M & 4: two loads per column cursor, 8 stores. */
364	andi.	r0,  M,  4
365	beq	.L26
366
367	LFPDUX	c01,   AO1, INC2
368	LFXDUX	c02,   AO2, INC2
369	LFPDUX	c03,   AO3, INC2
370	LFXDUX	c04,   AO4, INC2
371
372	LFPDUX	c05,   AO1, INC2
373	fpsel	c09, sel_p, c01, c02
374	LFXDUX	c06,   AO2, INC2
375	fpsel	c10, sel_p, c03, c04
376	LFPDUX	c07,   AO3, INC2
377	fpsel	c11, sel_s, c01, c02
378	LFXDUX	c08,   AO4, INC2
379	fpsel	c12, sel_s, c03, c04
380
381	fpsel	c13, sel_p, c05, c06
382	fpsel	c14, sel_p, c07, c08
383	STFPDUX	c09,   B, INC2
384	fpsel	c15, sel_s, c05, c06
385	STFPDUX	c10,   B, INC2
386	fpsel	c16, sel_s, c07, c08
387	STFXDUX	c11,   B, INC2
388
389	STFXDUX	c12,   B, INC2
390	STFPDUX	c13,   B, INC2
391	STFPDUX	c14,   B, INC2
392	STFXDUX	c15,   B, INC2
393	STFXDUX	c16,   B, INC2
394	.align 4
395
/* M & 2: one load per column cursor, 4 stores. */
396.L26:
397	andi.	r0,  M,  2
398	beq	.L27
399
400	LFPDUX	c01,   AO1, INC2
401	LFXDUX	c02,   AO2, INC2
402	LFPDUX	c03,   AO3, INC2
403	LFXDUX	c04,   AO4, INC2
404
405	fpsel	c05, sel_p, c01, c02
406	fpsel	c06, sel_p, c03, c04
407	fpsel	c07, sel_s, c01, c02
408	fpsel	c08, sel_s, c03, c04
409
410	STFPDUX	c05,   B, INC2
411	STFPDUX	c06,   B, INC2
412	STFXDUX	c07,   B, INC2
413	STFXDUX	c08,   B, INC2
414	.align 4
415
/*
 * M & 1: one LFDUX per column, then fsmfp combines c01/c02 and c03/c04 so
 * that two STFPDUX stores write the four values.
 * NOTE(review): presumably fsmfp copies the primary half of its second
 * operand into the secondary half of its first operand - confirm.
 */
416.L27:
417	andi.	r0,  M,  1
418	beq	.L30
419
420	LFDUX	c01,   AO1, INC2
421	LFDUX	c02,   AO2, INC2
422	LFDUX	c03,   AO3, INC2
423	LFDUX	c04,   AO4, INC2
424
425	fsmfp	c01, c02
426	fsmfp	c03, c04
427
428	STFPDUX	c01,   B, INC2
429	STFPDUX	c03,   B, INC2
430	.align 4
431
432
/*
 * Aligned path, 2-column panel. Executed once, when N & 2 is non-zero.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
433.L30:
434	andi.	J,  N,  2
435	ble	.L40
436
/* AO1/AO2 cover two consecutive columns; A moves past them. */
437	mr	AO1, A
438	add	AO2, A,   LDA
439	add	A,   AO2, LDA
440
/* CTR = M >> 3; skip the main loop when that is zero. */
441	srawi.	r0,  M,  3
442	mtspr	CTR, r0
443	ble	.L35
444	.align 4
445
/*
 * Main loop: four LFPDUX loads from AO1 (c01..c04) and four LFXDUX loads
 * from AO2 (c05..c08), then 8 stores. Each AO1/AO2 register pair is merged
 * with sel_p (stored by STFPDUX) and with sel_s (stored by STFXDUX), and
 * the two results are written back to back.
 * NOTE(review): presumably each store is one output row (col 1, col 2) -
 * confirm against the paired-FPU ISA semantics.
 */
446.L32:
447	LFPDUX	c01,   AO1, INC2
448	LFXDUX	c05,   AO2, INC2
449	LFPDUX	c02,   AO1, INC2
450	LFXDUX	c06,   AO2, INC2
451
452	LFPDUX	c03,   AO1, INC2
453	fpsel	c09, sel_p, c01, c05
454	LFXDUX	c07,   AO2, INC2
455	fpsel	c10, sel_s, c01, c05
456	LFPDUX	c04,   AO1, INC2
457	fpsel	c11, sel_p, c02, c06
458	LFXDUX	c08,   AO2, INC2
459	fpsel	c12, sel_s, c02, c06
460
461	fpsel	c13, sel_p, c03, c07
462	fpsel	c14, sel_s, c03, c07
463	STFPDUX	c09,   B, INC2
464	fpsel	c15, sel_p, c04, c08
465	STFXDUX	c10,   B, INC2
466	fpsel	c16, sel_s, c04, c08
467	STFPDUX	c11,   B, INC2
468	STFXDUX	c12,   B, INC2
469
470	STFPDUX	c13,   B, INC2
471	STFXDUX	c14,   B, INC2
472	STFPDUX	c15,   B, INC2
473	STFXDUX	c16,   B, INC2
474	bdnz	.L32
475	.align 4
476
/* Tail for the M & 7 leftover rows, handled as 4 + 2 + 1. */
477.L35:
478	andi.	r0,  M,  7
479	ble	.L40
480
/* M & 4: two loads per column cursor, 4 stores. */
481	andi.	r0,  M,  4
482	beq	.L36
483
484	LFPDUX	c01,   AO1, INC2
485	LFXDUX	c03,   AO2, INC2
486	LFPDUX	c02,   AO1, INC2
487	LFXDUX	c04,   AO2, INC2
488
489	fpsel	c05, sel_p, c01, c03
490	fpsel	c06, sel_s, c01, c03
491	fpsel	c07, sel_p, c02, c04
492	fpsel	c08, sel_s, c02, c04
493
494	STFPDUX	c05,   B, INC2
495	STFXDUX	c06,   B, INC2
496	STFPDUX	c07,   B, INC2
497	STFXDUX	c08,   B, INC2
498	.align 4
499
/* M & 2: one load per column cursor, 2 stores. */
500.L36:
501	andi.	r0,  M,  2
502	beq	.L37
503
504	LFPDUX	c01,   AO1, INC2
505	LFXDUX	c02,   AO2, INC2
506
507	fpsel	c03, sel_p, c01, c02
508	fpsel	c04, sel_s, c01, c02
509
510	STFPDUX	c03,   B, INC2
511	STFXDUX	c04,   B, INC2
512	.align 4
513
/*
 * M & 1: one LFDUX per column, combined by fsmfp into c01 and written with
 * a single STFPDUX.
 * NOTE(review): presumably fsmfp moves c02's primary half into c01's
 * secondary half - confirm.
 */
514.L37:
515	andi.	r0,  M,  1
516	beq	.L40
517
518	LFDUX	c01,   AO1, INC2
519	LFDUX	c02,   AO2, INC2
520
521	fsmfp	c01, c02
522	STFPDUX	c01,   B, INC2
523	.align 4
524
/*
 * Aligned path, last single column. Executed when N & 1 is non-zero.
 * There is nothing to interleave, so data goes from AO1 to B unchanged.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
525.L40:
526	andi.	J,  N,  1
527	ble	.L999
528
529	mr	AO1, A
530
/* CTR = M >> 3; skip the main loop when that is zero. */
531	srawi.	r0,  M,  3
532	mtspr	CTR, r0
533	ble	.L45
534	.align 4
535
/* Main loop: four INC2-strided paired loads followed by four paired stores. */
536.L42:
537	LFPDUX	c01,   AO1, INC2
538	LFPDUX	c02,   AO1, INC2
539	LFPDUX	c03,   AO1, INC2
540	LFPDUX	c04,   AO1, INC2
541
542	STFPDUX	c01,   B, INC2
543	STFPDUX	c02,   B, INC2
544	STFPDUX	c03,   B, INC2
545	STFPDUX	c04,   B, INC2
546	bdnz	.L42
547	.align 4
548
/* Tail for the M & 7 leftover rows, handled as 4 + 2 + 1. */
549.L45:
550	andi.	r0,  M,  7
551	ble	.L999
552
/* M & 4: two paired loads/stores. */
553	andi.	r0,  M,  4
554	beq	.L46
555
556	LFPDUX	c01,   AO1, INC2
557	LFPDUX	c02,   AO1, INC2
558
559	STFPDUX	c01,   B, INC2
560	STFPDUX	c02,   B, INC2
561	.align 4
562
/* M & 2: one paired load/store. */
563.L46:
564	andi.	r0,  M,  2
565	beq	.L47
566
567	LFPDUX	c01,   AO1, INC2
568	STFPDUX	c01,   B, INC2
569	.align 4
570
/*
 * M & 1: final element, moved with LFDX/STFDX at offset INC2 from the
 * cursors.
 * NOTE(review): these are presumably the non-updating scalar forms, which
 * is fine because AO1 and B are not used again - confirm.
 */
571.L47:
572	andi.	r0,  M,  1
573	beq	.L999
574
575	LFDX	c01,   AO1, INC2
576	STFDX	c01,   B,  INC2
/* Jump over the unaligned path that follows. */
577	b	.L999
578	.align 4
579
580
/*
 * Unaligned path, reached when A or the byte stride LDA is not a multiple
 * of 2 * SIZE. Loads are issued one element at a time with stride INC;
 * stores to B remain paired with stride INC2.
 * A is biased down by 1 * SIZE to match the INC-strided loads.
 * J = N >> 3 eight-column panels.
 */
581.L100:
582	subi	A, A, 1 * SIZE
583	srawi.	J,  N,  3
584	ble	.L120
585	.align 4
/* AO1..AO8 cover eight consecutive columns (LDA apart); A moves past them. */
586.L111:
587	mr	AO1, A
588	add	AO2, A,   LDA
589	add	AO3, AO2, LDA
590	add	AO4, AO3, LDA
591	add	AO5, AO4, LDA
592	add	AO6, AO5, LDA
593	add	AO7, AO6, LDA
594	add	AO8, AO7, LDA
595	add	A,   AO8, LDA
596
/* CTR = M >> 3; skip the main loop when that is zero. */
597	srawi.	r0,  M,  3
598	mtspr	CTR, r0
599	ble	.L115
600	.align 4
601
/*
 * Main loop: eight loads from every column cursor (64 loads) and 32 paired
 * stores per pass. Registers are grouped four per row: c01..c04 for the
 * first row, c05..c08 for the second, and so on up to c29..c32. Within a
 * group, register k is filled by LFDUX from cursor 2k-1 and by LFSDUX from
 * cursor 2k.
 * NOTE(review): presumably LFDUX fills the primary half and LFSDUX the
 * secondary half of a register - confirm.
 * c31/c32 are f30/f31, so the sel_p/sel_s masks are overwritten here; no
 * fpsel appears from .L100 onwards, so they are not needed again.
 */
602.L112:
603	LFDUX	c01,   AO1, INC
604	LFDUX	c05,   AO1, INC
605	LFDUX	c09,   AO1, INC
606	LFDUX	c13,   AO1, INC
607
608	LFDUX	c17,   AO1, INC
609	LFDUX	c21,   AO1, INC
610	LFDUX	c25,   AO1, INC
611	LFDUX	c29,   AO1, INC
612
613	LFSDUX	c01,   AO2, INC
614	LFSDUX	c05,   AO2, INC
615	LFSDUX	c09,   AO2, INC
616	LFSDUX	c13,   AO2, INC
617
618	LFSDUX	c17,   AO2, INC
619	LFSDUX	c21,   AO2, INC
620	LFSDUX	c25,   AO2, INC
621	LFSDUX	c29,   AO2, INC
622
623	LFDUX	c02,   AO3, INC
624	LFDUX	c06,   AO3, INC
625	LFDUX	c10,   AO3, INC
626	LFDUX	c14,   AO3, INC
627
628	LFDUX	c18,   AO3, INC
629	LFDUX	c22,   AO3, INC
630	LFDUX	c26,   AO3, INC
631	LFDUX	c30,   AO3, INC
632
633	LFSDUX	c02,   AO4, INC
634	LFSDUX	c06,   AO4, INC
635	LFSDUX	c10,   AO4, INC
636	LFSDUX	c14,   AO4, INC
637
638	LFSDUX	c18,   AO4, INC
639	LFSDUX	c22,   AO4, INC
640	LFSDUX	c26,   AO4, INC
641	LFSDUX	c30,   AO4, INC
642
643	LFDUX	c03,   AO5, INC
644	LFDUX	c07,   AO5, INC
645	LFDUX	c11,   AO5, INC
646	LFDUX	c15,   AO5, INC
647
648	LFDUX	c19,   AO5, INC
649	LFDUX	c23,   AO5, INC
650	LFDUX	c27,   AO5, INC
651	LFDUX	c31,   AO5, INC
652
653	LFSDUX	c03,   AO6, INC
654	LFSDUX	c07,   AO6, INC
655	LFSDUX	c11,   AO6, INC
656	LFSDUX	c15,   AO6, INC
657
658	LFSDUX	c19,   AO6, INC
659	LFSDUX	c23,   AO6, INC
660	LFSDUX	c27,   AO6, INC
661	LFSDUX	c31,   AO6, INC
662
663	LFDUX	c04,   AO7, INC
664	LFDUX	c08,   AO7, INC
665	LFDUX	c12,   AO7, INC
666	LFDUX	c16,   AO7, INC
667
668	LFDUX	c20,   AO7, INC
669	LFDUX	c24,   AO7, INC
670	LFDUX	c28,   AO7, INC
671	LFDUX	c32,   AO7, INC
672
673	LFSDUX	c04,   AO8, INC
674	LFSDUX	c08,   AO8, INC
675	LFSDUX	c12,   AO8, INC
676	LFSDUX	c16,   AO8, INC
677
678	LFSDUX	c20,   AO8, INC
679	LFSDUX	c24,   AO8, INC
680	LFSDUX	c28,   AO8, INC
681	LFSDUX	c32,   AO8, INC
682
/* Write the 32 registers in numeric order, i.e. row by row. */
683	STFPDUX	c01,   B, INC2
684	STFPDUX	c02,   B, INC2
685	STFPDUX	c03,   B, INC2
686	STFPDUX	c04,   B, INC2
687	STFPDUX	c05,   B, INC2
688	STFPDUX	c06,   B, INC2
689	STFPDUX	c07,   B, INC2
690	STFPDUX	c08,   B, INC2
691
692	STFPDUX	c09,   B, INC2
693	STFPDUX	c10,   B, INC2
694	STFPDUX	c11,   B, INC2
695	STFPDUX	c12,   B, INC2
696	STFPDUX	c13,   B, INC2
697	STFPDUX	c14,   B, INC2
698	STFPDUX	c15,   B, INC2
699	STFPDUX	c16,   B, INC2
700
701	STFPDUX	c17,   B, INC2
702	STFPDUX	c18,   B, INC2
703	STFPDUX	c19,   B, INC2
704	STFPDUX	c20,   B, INC2
705	STFPDUX	c21,   B, INC2
706	STFPDUX	c22,   B, INC2
707	STFPDUX	c23,   B, INC2
708	STFPDUX	c24,   B, INC2
709
710	STFPDUX	c25,   B, INC2
711	STFPDUX	c26,   B, INC2
712	STFPDUX	c27,   B, INC2
713	STFPDUX	c28,   B, INC2
714	STFPDUX	c29,   B, INC2
715	STFPDUX	c30,   B, INC2
716	STFPDUX	c31,   B, INC2
717	STFPDUX	c32,   B, INC2
718	bdnz	.L112
719	.align 4
720
/*
 * Tail for the M & 7 leftover rows, handled as 4 + 2 + 1.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
721.L115:
722	andi.	r0,  M,  7
723	ble	.L119
724
/* M & 4: four loads per column cursor, 16 stores (c01..c16). */
725	andi.	r0,  M,  4
726	beq	.L116
727
728	LFDUX	c01,   AO1, INC
729	LFDUX	c05,   AO1, INC
730	LFDUX	c09,   AO1, INC
731	LFDUX	c13,   AO1, INC
732
733	LFSDUX	c01,   AO2, INC
734	LFSDUX	c05,   AO2, INC
735	LFSDUX	c09,   AO2, INC
736	LFSDUX	c13,   AO2, INC
737
738	LFDUX	c02,   AO3, INC
739	LFDUX	c06,   AO3, INC
740	LFDUX	c10,   AO3, INC
741	LFDUX	c14,   AO3, INC
742
743	LFSDUX	c02,   AO4, INC
744	LFSDUX	c06,   AO4, INC
745	LFSDUX	c10,   AO4, INC
746	LFSDUX	c14,   AO4, INC
747
748	LFDUX	c03,   AO5, INC
749	LFDUX	c07,   AO5, INC
750	LFDUX	c11,   AO5, INC
751	LFDUX	c15,   AO5, INC
752
753	LFSDUX	c03,   AO6, INC
754	LFSDUX	c07,   AO6, INC
755	LFSDUX	c11,   AO6, INC
756	LFSDUX	c15,   AO6, INC
757
758	LFDUX	c04,   AO7, INC
759	LFDUX	c08,   AO7, INC
760	LFDUX	c12,   AO7, INC
761	LFDUX	c16,   AO7, INC
762
763	LFSDUX	c04,   AO8, INC
764	LFSDUX	c08,   AO8, INC
765	LFSDUX	c12,   AO8, INC
766	LFSDUX	c16,   AO8, INC
767
768	STFPDUX	c01,   B, INC2
769	STFPDUX	c02,   B, INC2
770	STFPDUX	c03,   B, INC2
771	STFPDUX	c04,   B, INC2
772	STFPDUX	c05,   B, INC2
773	STFPDUX	c06,   B, INC2
774	STFPDUX	c07,   B, INC2
775	STFPDUX	c08,   B, INC2
776
777	STFPDUX	c09,   B, INC2
778	STFPDUX	c10,   B, INC2
779	STFPDUX	c11,   B, INC2
780	STFPDUX	c12,   B, INC2
781	STFPDUX	c13,   B, INC2
782	STFPDUX	c14,   B, INC2
783	STFPDUX	c15,   B, INC2
784	STFPDUX	c16,   B, INC2
785	.align 4
786
/* M & 2: two loads per column cursor, 8 stores (c01..c08). */
787.L116:
788	andi.	r0,  M,  2
789	beq	.L117
790
791	LFDUX	c01,   AO1, INC
792	LFDUX	c05,   AO1, INC
793	LFDUX	c02,   AO3, INC
794	LFDUX	c06,   AO3, INC
795
796	LFSDUX	c01,   AO2, INC
797	LFSDUX	c05,   AO2, INC
798	LFSDUX	c02,   AO4, INC
799	LFSDUX	c06,   AO4, INC
800
801	LFDUX	c03,   AO5, INC
802	LFDUX	c07,   AO5, INC
803	LFDUX	c04,   AO7, INC
804	LFDUX	c08,   AO7, INC
805
806	LFSDUX	c03,   AO6, INC
807	LFSDUX	c07,   AO6, INC
808	LFSDUX	c04,   AO8, INC
809	LFSDUX	c08,   AO8, INC
810
811	STFPDUX	c01,   B, INC2
812	STFPDUX	c02,   B, INC2
813	STFPDUX	c03,   B, INC2
814	STFPDUX	c04,   B, INC2
815	STFPDUX	c05,   B, INC2
816	STFPDUX	c06,   B, INC2
817	STFPDUX	c07,   B, INC2
818	STFPDUX	c08,   B, INC2
819	.align 4
820
/* M & 1: one load per column cursor, 4 stores (c01..c04). */
821.L117:
822	andi.	r0,  M,  1
823	beq	.L119
824
825	LFDUX	c01,   AO1, INC
826	LFDUX	c02,   AO3, INC
827	LFDUX	c03,   AO5, INC
828	LFDUX	c04,   AO7, INC
829
830	LFSDUX	c01,   AO2, INC
831	LFSDUX	c02,   AO4, INC
832	LFSDUX	c03,   AO6, INC
833	LFSDUX	c04,   AO8, INC
834
835	STFPDUX	c01,   B, INC2
836	STFPDUX	c02,   B, INC2
837	STFPDUX	c03,   B, INC2
838	STFPDUX	c04,   B, INC2
839	.align 4
840
/* Next 8-column panel while J > 0. */
841.L119:
842	addic.	J, J, -1
843	bgt	.L111
844	.align 4
845
/*
 * Unaligned path, 4-column panel. Executed once, when N & 4 is non-zero.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
846.L120:
847	andi.	J,  N,  4
848	ble	.L130
849	.align 4
/* AO1..AO4 cover four consecutive columns; A moves past them. */
850.L121:
851	mr	AO1, A
852	add	AO2, A,   LDA
853	add	AO3, AO2, LDA
854	add	AO4, AO3, LDA
855	add	A,   AO4, LDA
856
/* CTR = M >> 3; skip the main loop when that is zero. */
857	srawi.	r0,  M,  3
858	mtspr	CTR, r0
859	ble	.L125
860	.align 4
861
/*
 * Main loop: eight INC-strided loads per column cursor (32 loads) and
 * 16 paired stores per pass.
 * AO1 (LFDUX) and AO2 (LFSDUX) fill c01..c04 and c09..c12; AO3 and AO4 fill
 * c05..c08 and c13..c16 the same way. The stores alternate between the two
 * register sets (c01, c05, c02, c06, ...) so that the AO1/AO2 pair of a row
 * is followed by the AO3/AO4 pair of the same row.
 * NOTE(review): presumably LFDUX fills the primary half and LFSDUX the
 * secondary half of a register - confirm.
 */
862.L122:
863	LFDUX	c01,   AO1, INC
864	LFDUX	c02,   AO1, INC
865	LFDUX	c03,   AO1, INC
866	LFDUX	c04,   AO1, INC
867
868	LFDUX	c09,   AO1, INC
869	LFDUX	c10,   AO1, INC
870	LFDUX	c11,   AO1, INC
871	LFDUX	c12,   AO1, INC
872
873	LFSDUX	c01,   AO2, INC
874	LFSDUX	c02,   AO2, INC
875	LFSDUX	c03,   AO2, INC
876	LFSDUX	c04,   AO2, INC
877
878	LFSDUX	c09,   AO2, INC
879	LFSDUX	c10,   AO2, INC
880	LFSDUX	c11,   AO2, INC
881	LFSDUX	c12,   AO2, INC
882
883	LFDUX	c05,   AO3, INC
884	LFDUX	c06,   AO3, INC
885	LFDUX	c07,   AO3, INC
886	LFDUX	c08,   AO3, INC
887
888	LFDUX	c13,   AO3, INC
889	LFDUX	c14,   AO3, INC
890	LFDUX	c15,   AO3, INC
891	LFDUX	c16,   AO3, INC
892
893	LFSDUX	c05,   AO4, INC
894	LFSDUX	c06,   AO4, INC
895	LFSDUX	c07,   AO4, INC
896	LFSDUX	c08,   AO4, INC
897
898	LFSDUX	c13,   AO4, INC
899	LFSDUX	c14,   AO4, INC
900	LFSDUX	c15,   AO4, INC
901	LFSDUX	c16,   AO4, INC
902
903	STFPDUX	c01,   B, INC2
904	STFPDUX	c05,   B, INC2
905	STFPDUX	c02,   B, INC2
906	STFPDUX	c06,   B, INC2
907	STFPDUX	c03,   B, INC2
908	STFPDUX	c07,   B, INC2
909	STFPDUX	c04,   B, INC2
910	STFPDUX	c08,   B, INC2
911
912	STFPDUX	c09,   B, INC2
913	STFPDUX	c13,   B, INC2
914	STFPDUX	c10,   B, INC2
915	STFPDUX	c14,   B, INC2
916	STFPDUX	c11,   B, INC2
917	STFPDUX	c15,   B, INC2
918	STFPDUX	c12,   B, INC2
919	STFPDUX	c16,   B, INC2
920	bdnz	.L122
921	.align 4
922
/* Tail for the M & 7 leftover rows, handled as 4 + 2 + 1. */
923.L125:
924	andi.	r0,  M,  7
925	ble	.L130
926
/* M & 4: four loads per column cursor, 8 stores. */
927	andi.	r0,  M,  4
928	beq	.L126
929
930	LFDUX	c01,   AO1, INC
931	LFDUX	c02,   AO1, INC
932	LFDUX	c03,   AO1, INC
933	LFDUX	c04,   AO1, INC
934
935	LFSDUX	c01,   AO2, INC
936	LFSDUX	c02,   AO2, INC
937	LFSDUX	c03,   AO2, INC
938	LFSDUX	c04,   AO2, INC
939
940	LFDUX	c05,   AO3, INC
941	LFDUX	c06,   AO3, INC
942	LFDUX	c07,   AO3, INC
943	LFDUX	c08,   AO3, INC
944
945	LFSDUX	c05,   AO4, INC
946	LFSDUX	c06,   AO4, INC
947	LFSDUX	c07,   AO4, INC
948	LFSDUX	c08,   AO4, INC
949
950	STFPDUX	c01,   B, INC2
951	STFPDUX	c05,   B, INC2
952	STFPDUX	c02,   B, INC2
953	STFPDUX	c06,   B, INC2
954	STFPDUX	c03,   B, INC2
955	STFPDUX	c07,   B, INC2
956	STFPDUX	c04,   B, INC2
957	STFPDUX	c08,   B, INC2
958	.align 4
959
/* M & 2: two loads per column cursor, 4 stores. */
960.L126:
961	andi.	r0,  M,  2
962	beq	.L127
963
964	LFDUX	c01,   AO1, INC
965	LFDUX	c02,   AO1, INC
966
967	LFSDUX	c01,   AO2, INC
968	LFSDUX	c02,   AO2, INC
969
970	LFDUX	c05,   AO3, INC
971	LFDUX	c06,   AO3, INC
972
973	LFSDUX	c05,   AO4, INC
974	LFSDUX	c06,   AO4, INC
975
976	STFPDUX	c01,   B, INC2
977	STFPDUX	c05,   B, INC2
978	STFPDUX	c02,   B, INC2
979	STFPDUX	c06,   B, INC2
980	.align 4
981
/* M & 1: one load per column cursor, 2 stores. */
982.L127:
983	andi.	r0,  M,  1
984	beq	.L130
985
986	LFDUX	c01,   AO1, INC
987	LFDUX	c05,   AO3, INC
988
/* NOTE(review): the purpose of these two nops is not evident from this file (possibly scheduling padding) - confirm before removing. */
989	nop
990	nop
991
992	LFSDUX	c01,   AO2, INC
993	LFSDUX	c05,   AO4, INC
994
995	STFPDUX	c01,   B, INC2
996	STFPDUX	c05,   B, INC2
997	.align 4
998
999
/*
 * Unaligned path, 2-column panel. Executed once, when N & 2 is non-zero.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
1000.L130:
1001	andi.	J,  N,  2
1002	ble	.L140
1003
/* AO1/AO2 cover two consecutive columns; A moves past them. */
1004	mr	AO1, A
1005	add	AO2, A,   LDA
1006	add	A,   AO2, LDA
1007
/* CTR = M >> 3; skip the main loop when that is zero. */
1008	srawi.	r0,  M,  3
1009	mtspr	CTR, r0
1010	ble	.L135
1011	.align 4
1012
/*
 * Main loop: eight LFDUX loads from AO1 and eight LFSDUX loads from AO2
 * into the same eight registers, followed by eight paired stores.
 * NOTE(review): presumably LFDUX fills the primary half and LFSDUX the
 * secondary half, so each store is one row (col 1, col 2) - confirm.
 */
1013.L132:
1014	LFDUX	c01,   AO1, INC
1015	LFDUX	c02,   AO1, INC
1016	LFDUX	c03,   AO1, INC
1017	LFDUX	c04,   AO1, INC
1018
1019	LFDUX	c09,   AO1, INC
1020	LFDUX	c10,   AO1, INC
1021	LFDUX	c11,   AO1, INC
1022	LFDUX	c12,   AO1, INC
1023
1024	LFSDUX	c01,   AO2, INC
1025	LFSDUX	c02,   AO2, INC
1026	LFSDUX	c03,   AO2, INC
1027	LFSDUX	c04,   AO2, INC
1028
1029	LFSDUX	c09,   AO2, INC
1030	LFSDUX	c10,   AO2, INC
1031	LFSDUX	c11,   AO2, INC
1032	LFSDUX	c12,   AO2, INC
1033
1034	STFPDUX	c01,   B, INC2
1035	STFPDUX	c02,   B, INC2
1036	STFPDUX	c03,   B, INC2
1037	STFPDUX	c04,   B, INC2
1038
1039	STFPDUX	c09,   B, INC2
1040	STFPDUX	c10,   B, INC2
1041	STFPDUX	c11,   B, INC2
1042	STFPDUX	c12,   B, INC2
1043	bdnz	.L132
1044	.align 4
1045
/* Tail for the M & 7 leftover rows, handled as 4 + 2 + 1. */
1046.L135:
1047	andi.	r0,  M,  7
1048	ble	.L140
1049
/* M & 4: four loads per column cursor, 4 stores. */
1050	andi.	r0,  M,  4
1051	beq	.L136
1052
1053	LFDUX	c01,   AO1, INC
1054	LFDUX	c02,   AO1, INC
1055	LFDUX	c03,   AO1, INC
1056	LFDUX	c04,   AO1, INC
1057
1058	LFSDUX	c01,   AO2, INC
1059	LFSDUX	c02,   AO2, INC
1060	LFSDUX	c03,   AO2, INC
1061	LFSDUX	c04,   AO2, INC
1062
1063	STFPDUX	c01,   B, INC2
1064	STFPDUX	c02,   B, INC2
1065	STFPDUX	c03,   B, INC2
1066	STFPDUX	c04,   B, INC2
1067	.align 4
1068
/* M & 2: two loads per column cursor, 2 stores. */
1069.L136:
1070	andi.	r0,  M,  2
1071	beq	.L137
1072
1073	LFDUX	c01,   AO1, INC
1074	LFDUX	c02,   AO1, INC
1075
1076	LFSDUX	c01,   AO2, INC
1077	LFSDUX	c02,   AO2, INC
1078
1079	STFPDUX	c01,   B, INC2
1080	STFPDUX	c02,   B, INC2
1081	.align 4
1082
/*
 * M & 1: one LFDUX per column, combined by fsmfp into c01 and written with
 * a single STFPDUX.
 * NOTE(review): presumably fsmfp moves c02's primary half into c01's
 * secondary half - confirm.
 */
1083.L137:
1084	andi.	r0,  M,  1
1085	beq	.L140
1086
1087	LFDUX	c01,   AO1, INC
1088	LFDUX	c02,   AO2, INC
1089
1090	fsmfp	c01, c02
1091	STFPDUX	c01,   B, INC2
1092	.align 4
1093
/*
 * Unaligned path, last single column. Executed when N & 1 is non-zero.
 * Elements are loaded one at a time, paired up with fsmfp and written with
 * paired stores.
 * (andi. never yields a negative value, so "ble" acts as "branch if zero".)
 */
1094.L140:
1095	andi.	J,  N,  1
1096	ble	.L999
1097
1098	mr	AO1, A
1099
/* CTR = M >> 3; skip the main loop when that is zero. */
1100	srawi.	r0,  M,  3
1101	mtspr	CTR, r0
1102	ble	.L145
1103	.align 4
1104
/*
 * Main loop: eight INC-strided scalar loads, four fsmfp merges of
 * neighbouring registers, four paired stores of the odd-numbered registers.
 * NOTE(review): presumably fsmfp moves the second operand's primary half
 * into the first operand's secondary half - confirm.
 */
1105.L142:
1106	LFDUX	c01,   AO1, INC
1107	LFDUX	c02,   AO1, INC
1108	LFDUX	c03,   AO1, INC
1109	LFDUX	c04,   AO1, INC
1110
1111	LFDUX	c05,   AO1, INC
1112	LFDUX	c06,   AO1, INC
1113	LFDUX	c07,   AO1, INC
1114	LFDUX	c08,   AO1, INC
1115
1116	fsmfp	c01, c02
1117	fsmfp	c03, c04
1118	fsmfp	c05, c06
1119	fsmfp	c07, c08
1120
1121	STFPDUX	c01,   B, INC2
1122	STFPDUX	c03,   B, INC2
1123	STFPDUX	c05,   B, INC2
1124	STFPDUX	c07,   B, INC2
1125	bdnz	.L142
1126	.align 4
1127
/* Tail for the M & 7 leftover rows, handled as 4 + 2 + 1. */
1128.L145:
1129	andi.	r0,  M,  7
1130	ble	.L999
1131
/* M & 4: four scalar loads, two merges, two paired stores. */
1132	andi.	r0,  M,  4
1133	beq	.L146
1134
1135	LFDUX	c01,   AO1, INC
1136	LFDUX	c02,   AO1, INC
1137	LFDUX	c03,   AO1, INC
1138	LFDUX	c04,   AO1, INC
1139
1140	fsmfp	c01, c02
1141	fsmfp	c03, c04
1142
1143	STFPDUX	c01,   B, INC2
1144	STFPDUX	c03,   B, INC2
1145	.align 4
1146
/* M & 2: two scalar loads, one merge, one paired store. */
1147.L146:
1148	andi.	r0,  M,  2
1149	beq	.L147
1150
1151	LFDUX	c01,   AO1, INC
1152	LFDUX	c02,   AO1, INC
1153
1154	fsmfp	c01, c02
1155	STFPDUX	c01,   B, INC2
1156	.align 4
1157
/*
 * M & 1: final element. The load uses offset INC (A was biased by 1 * SIZE
 * on this path) and the store uses offset INC2 (B was biased by 2 * SIZE).
 * NOTE(review): LFDX/STFDX are presumably the non-updating scalar forms;
 * AO1 and B are not used again - confirm.
 * Execution then falls through into .L999.
 */
1158.L147:
1159	andi.	r0,  M,  1
1160	beq	.L999
1161
1162	LFDX	c01,   AO1, INC
1163	STFDX	c01,   B,  INC2
1164	.align 4
1165
/*
 * Common exit: restore r26..r31 and f31..f14 in the reverse order of the
 * pushes done on entry, then return.
 *
 * The entry code pushed four constant words last and then loaded sel_s with
 * an index of 8.
 * NOTE(review): this sequence assumes SP sits 8 bytes above the lowest
 * pushed word on arrival (as an update-form lfpsux would leave it). Adding 4
 * and then pre-incrementing by 4 in the first lwzu reaches offset 16, the
 * slot where r26 was stored - confirm against the ISA documentation.
 */
1166.L999:
1167	addi	SP, SP, 4
1168
1169	lwzu	r26,   4(SP)
1170	lwzu	r27,   4(SP)
1171
1172	lwzu	r28,   4(SP)
1173	lwzu	r29,   4(SP)
1174	lwzu	r30,   4(SP)
1175	lwzu	r31,   4(SP)
1176
/*
 * SP now addresses the r31 slot. Backing up 12 bytes makes the first
 * 16-byte indexed load hit the f31 slot, which lies 4 bytes above it.
 */
1177	subi	SP, SP, 12
1178	li	r0, 16
1179
1180	lfpdux	f31, SP, r0
1181	lfpdux	f30, SP, r0
1182	lfpdux	f29, SP, r0
1183	lfpdux	f28, SP, r0
1184	lfpdux	f27, SP, r0
1185	lfpdux	f26, SP, r0
1186	lfpdux	f25, SP, r0
1187	lfpdux	f24, SP, r0
1188	lfpdux	f23, SP, r0
1189	lfpdux	f22, SP, r0
1190	lfpdux	f21, SP, r0
1191	lfpdux	f20, SP, r0
1192	lfpdux	f19, SP, r0
1193	lfpdux	f18, SP, r0
1194	lfpdux	f17, SP, r0
1195	lfpdux	f16, SP, r0
1196	lfpdux	f15, SP, r0
1197	lfpdux	f14, SP, r0
/* Step over the f14 slot: 18 * 16 + 6 * 4 + 4 * 4 bytes were pushed in total. */
1198	addi	SP, SP, 16
1199	blr
1200	EPILOGUE
1201