1/***************************************************************************
2Copyright (c) 2013-2019, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* Abdelrauf(quickwritereader@googlemail.com)
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*	 LAPACK-TEST		: OK
34**************************************************************************************/
35
36/*********************************************************************
37* Macros for N=4, M=16                                               *
38*********************************************************************/
/* LOAD4x16_1: prime the 4x16 operand registers and clear the accumulators
   (forwards to LOAD4x16 with Zero = 1). */
.macro LOAD4x16_1
	LOAD4x16	1
.endm
42
/* LOAD4x16_0: prime the 4x16 operand registers without touching the
   accumulators (forwards to LOAD4x16 with Zero = 0). */
.macro LOAD4x16_0
	LOAD4x16	0
.endm
/*
 * LOAD4x16 Zero
 *
 * Loads the operand registers consumed by the 4x16 kernels:
 *   vs24, vs26  <- 32 bytes at BO
 *   vs25, vs27  <- copies of vs24 / vs26 with the two doublewords swapped
 *   vs0 .. vs7  <- 128 bytes at AO
 * When Zero is 1 the accumulators vs32 .. vs63 are cleared as well.
 * AO and BO are not advanced here.
 */
.macro LOAD4x16 Zero

	/* B panel */
	lxv		vs24, 0(BO)
	lxv		vs26, 16(BO)
	xxpermdi	vs25, vs24, vs24, 2
	xxpermdi	vs27, vs26, vs26, 2

	/* A panel */
	lxv		vs0, 0(AO)
	lxv		vs1, 16(AO)
	lxv		vs2, 32(AO)
	lxv		vs3, 48(AO)
	lxv		vs4, 64(AO)
	lxv		vs5, 80(AO)
	lxv		vs6, 96(AO)
	lxv		vs7, 112(AO)

.if \Zero == 1
	/* x xor x = 0: clear every accumulator */
	xxlxor		vs32, vs32, vs32
	xxlxor		vs33, vs33, vs33
	xxlxor		vs34, vs34, vs34
	xxlxor		vs35, vs35, vs35
	xxlxor		vs36, vs36, vs36
	xxlxor		vs37, vs37, vs37
	xxlxor		vs38, vs38, vs38
	xxlxor		vs39, vs39, vs39
	xxlxor		vs40, vs40, vs40
	xxlxor		vs41, vs41, vs41
	xxlxor		vs42, vs42, vs42
	xxlxor		vs43, vs43, vs43
	xxlxor		vs44, vs44, vs44
	xxlxor		vs45, vs45, vs45
	xxlxor		vs46, vs46, vs46
	xxlxor		vs47, vs47, vs47
	xxlxor		vs48, vs48, vs48
	xxlxor		vs49, vs49, vs49
	xxlxor		vs50, vs50, vs50
	xxlxor		vs51, vs51, vs51
	xxlxor		vs52, vs52, vs52
	xxlxor		vs53, vs53, vs53
	xxlxor		vs54, vs54, vs54
	xxlxor		vs55, vs55, vs55
	xxlxor		vs56, vs56, vs56
	xxlxor		vs57, vs57, vs57
	xxlxor		vs58, vs58, vs58
	xxlxor		vs59, vs59, vs59
	xxlxor		vs60, vs60, vs60
	xxlxor		vs61, vs61, vs61
	xxlxor		vs62, vs62, vs62
	xxlxor		vs63, vs63, vs63
.endif
.endm
98
99
/*
 * Byte-displacement helpers.
 *
 * unit_size is the element size in bytes (8). DISPn(ind, disp) evaluates to
 * ind * unit_size * n + disp, i.e. the byte offset of block number ind when
 * each block holds n elements, plus an extra byte displacement.
 *
 * Both parameters are parenthesized so that expression arguments
 * (for example an index written as 1+1) keep their intended precedence.
 */
#define unit_size 8
#define DISP32(ind,disp) ((ind)*unit_size*32+(disp))
#define DISP16(ind,disp) ((ind)*unit_size*16+(disp))
#define DISP8(ind,disp) ((ind)*unit_size*8+(disp))
#define DISP4(ind,disp) ((ind)*unit_size*4+(disp))
#define DISP2(ind,disp) ((ind)*unit_size*2+(disp))
#define DISP1(ind,disp) ((ind)*unit_size+(disp))
107
/* KERNEL4x16_L1_L2 Index, IsLast
   Accumulating 4x16 double step on AO/BO with zero extra offsets
   (First = 0, Complete = 0). */
.macro KERNEL4x16_L1_L2 Index, IsLast
	KERNEL4x16_L1_L2_I	AO, BO, 0, 0, 0, \Index, \IsLast, 0
.endm
111
112
113
/* KERNEL4x16_I1_L2 OffsetA, OffsetB, Index, IsLast
   Initial 4x16 double step on AO/BO: the first half overwrites the
   accumulators (First = 1) and the next operands are reloaded (Complete = 0). */
.macro KERNEL4x16_I1_L2 OffsetA, OffsetB, Index, IsLast
	KERNEL4x16_L1_L2_I	AO, BO, 1, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
117
/* KERNEL4x16_I1_L2_2 OffsetA, OffsetB, Index, IsLast
   Accumulating 4x16 double step on AO/BO with explicit offsets
   (First = 0, Complete = 0). */
.macro KERNEL4x16_I1_L2_2 OffsetA, OffsetB, Index, IsLast
	KERNEL4x16_L1_L2_I	AO, BO, 0, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
121
/* KERNEL4x16_I1_L2_3 OffsetA, OffsetB, Index, IsLast
   Closing 4x16 double step on AO/BO: accumulates (First = 0) and does not
   reload operands for a following step (Complete = 1). */
.macro KERNEL4x16_I1_L2_3 OffsetA, OffsetB, Index, IsLast
	KERNEL4x16_L1_L2_I	AO, BO, 0, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
125
/* KERNEL4x16_I2_L2 AREG, BREG, OffsetA, OffsetB, Index, IsLast
   Same as KERNEL4x16_I1_L2 but with caller-chosen pointer registers
   (First = 1, Complete = 0). */
.macro KERNEL4x16_I2_L2 AREG, BREG, OffsetA, OffsetB, Index, IsLast
	KERNEL4x16_L1_L2_I	\AREG, \BREG, 1, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
129
/* KERNEL4x16_I2_L2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast
   Same as KERNEL4x16_I1_L2_2 but with caller-chosen pointer registers
   (First = 0, Complete = 0). */
.macro KERNEL4x16_I2_L2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast
	KERNEL4x16_L1_L2_I	\AREG, \BREG, 0, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
133
/* KERNEL4x16_I2_L2_3 AREG, BREG, OffsetA, OffsetB, Index, IsLast
   Same as KERNEL4x16_I1_L2_3 but with caller-chosen pointer registers
   (First = 0, Complete = 1). */
.macro KERNEL4x16_I2_L2_3 AREG, BREG, OffsetA, OffsetB, Index, IsLast
	KERNEL4x16_L1_L2_I	\AREG, \BREG, 0, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
137
/*
 * KERNEL4x16_L1_L2_I AREG, BREG, First, OffsetA, OffsetB, Index, IsLast, Complete
 *
 * Two back-to-back updates of the 32 accumulators vs32 .. vs63.
 *
 *   Step 1 uses the operands already held in vs0 .. vs7 (A) and
 *          vs24 .. vs27 (B). With First = 1 the products overwrite the
 *          accumulators (xvmuldp), otherwise they are added (xvmaddadp).
 *   Step 2 uses vs8 .. vs15 (A) and vs28 .. vs31 (B), which are loaded from
 *          AREG / BREG while step 1 is being issued.
 *
 * AREG, BREG       pointer registers for the A and B panels
 * OffsetA, OffsetB extra byte displacements added to every load
 * Index            block index: scales A displacements by 256 bytes (DISP32)
 *                  and B displacements by 64 bytes (DISP8)
 * Complete         0: reload vs0 .. vs7 / vs24 .. vs27 for a following call
 *                  1: skip those reloads
 * IsLast           1: advance AREG / BREG past the consumed data. With
 *                  Complete = 1 the advance includes OffsetA / OffsetB; with
 *                  Complete = 0 the offsets are not added.
 *
 * Loads are interleaved with the arithmetic; the instruction order below is
 * kept exactly as scheduled.
 */
.macro KERNEL4x16_L1_L2_I AREG, BREG, First, OffsetA, OffsetB, Index, IsLast, Complete

	/* ---- step 1, B lane vs24, A low half ---- */
.if \First == 1
	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
.else
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
.endif
	lxv		vs8, DISP32(\Index, 0+\OffsetA)(\AREG)
	lxv		vs9, DISP32(\Index, 16+\OffsetA)(\AREG)
	lxv		vs10, DISP32(\Index, 32+\OffsetA)(\AREG)
	lxv		vs11, DISP32(\Index, 48+\OffsetA)(\AREG)

	/* ---- step 1, B lane vs24, A high half ---- */
.if \First == 1
	xvmuldp		vs36, vs4, vs24
	xvmuldp		vs37, vs5, vs24
	xvmuldp		vs38, vs6, vs24
	xvmuldp		vs39, vs7, vs24
.else
	xvmaddadp	vs36, vs4, vs24
	xvmaddadp	vs37, vs5, vs24
	xvmaddadp	vs38, vs6, vs24
	xvmaddadp	vs39, vs7, vs24
.endif
	/* B operands for step 2, plus their doubleword-swapped copies */
	lxv		vs28, DISP8(\Index, 0+\OffsetB)(\BREG)
	lxv		vs30, DISP8(\Index, 16+\OffsetB)(\BREG)
	xxpermdi	vs29, vs28, vs28, 2
	xxpermdi	vs31, vs30, vs30, 2

	/* ---- step 1, B lane vs25 (all of A) and vs26 (A low half) ---- */
.if \First == 1
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25

	xvmuldp		vs44, vs4, vs25
	xvmuldp		vs45, vs5, vs25
	xvmuldp		vs46, vs6, vs25
	xvmuldp		vs47, vs7, vs25

	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs49, vs1, vs26
	xvmuldp		vs50, vs2, vs26
	xvmuldp		vs51, vs3, vs26
.else
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25

	xvmaddadp	vs44, vs4, vs25
	xvmaddadp	vs45, vs5, vs25
	xvmaddadp	vs46, vs6, vs25
	xvmaddadp	vs47, vs7, vs25

	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs49, vs1, vs26
	xvmaddadp	vs50, vs2, vs26
	xvmaddadp	vs51, vs3, vs26
.endif
	lxv		vs12, DISP32(\Index, 64+\OffsetA)(\AREG)
	lxv		vs13, DISP32(\Index, 80+\OffsetA)(\AREG)

	/* ---- step 1, B lane vs26, A high half ---- */
.if \First == 1
	xvmuldp		vs52, vs4, vs26
	xvmuldp		vs53, vs5, vs26
	xvmuldp		vs54, vs6, vs26
	xvmuldp		vs55, vs7, vs26
.else
	xvmaddadp	vs52, vs4, vs26
	xvmaddadp	vs53, vs5, vs26
	xvmaddadp	vs54, vs6, vs26
	xvmaddadp	vs55, vs7, vs26
.endif
	lxv		vs14, DISP32(\Index, 96+\OffsetA)(\AREG)
	lxv		vs15, DISP32(\Index, 112+\OffsetA)(\AREG)

	/* ---- step 1, B lane vs27 ---- */
.if \First == 1
	xvmuldp		vs56, vs0, vs27
	xvmuldp		vs57, vs1, vs27
	xvmuldp		vs58, vs2, vs27
	xvmuldp		vs59, vs3, vs27

	xvmuldp		vs60, vs4, vs27
	xvmuldp		vs61, vs5, vs27
	xvmuldp		vs62, vs6, vs27
	xvmuldp		vs63, vs7, vs27
.else
	xvmaddadp	vs56, vs0, vs27
	xvmaddadp	vs57, vs1, vs27
	xvmaddadp	vs58, vs2, vs27
	xvmaddadp	vs59, vs3, vs27

	xvmaddadp	vs60, vs4, vs27
	xvmaddadp	vs61, vs5, vs27
	xvmaddadp	vs62, vs6, vs27
	xvmaddadp	vs63, vs7, vs27
.endif

	/* ---- step 2: always accumulates; reloads for the next call are
	        interleaved when Complete is 0 ---- */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28
.if \Complete == 0
	lxv		vs0, DISP32(\Index, 128+\OffsetA)(\AREG)
	lxv		vs1, DISP32(\Index, 144+\OffsetA)(\AREG)
.endif
	xvmaddadp	vs36, vs12, vs28
	xvmaddadp	vs37, vs13, vs28
	xvmaddadp	vs38, vs14, vs28
	xvmaddadp	vs39, vs15, vs28
.if \Complete == 0
	lxv		vs24, DISP8(\Index, 32+\OffsetB)(\BREG)
	lxv		vs26, DISP8(\Index, 48+\OffsetB)(\BREG)
	xxpermdi	vs25, vs24, vs24, 2
	xxpermdi	vs27, vs26, vs26, 2
.endif
	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
	xvmaddadp	vs42, vs10, vs29
	xvmaddadp	vs43, vs11, vs29
.if \Complete == 0
	lxv		vs2, DISP32(\Index, 160+\OffsetA)(\AREG)
	lxv		vs3, DISP32(\Index, 176+\OffsetA)(\AREG)
.endif
	xvmaddadp	vs44, vs12, vs29
	xvmaddadp	vs45, vs13, vs29
	xvmaddadp	vs46, vs14, vs29
	xvmaddadp	vs47, vs15, vs29

	xvmaddadp	vs48, vs8, vs30
	xvmaddadp	vs49, vs9, vs30
	xvmaddadp	vs50, vs10, vs30
	xvmaddadp	vs51, vs11, vs30
.if \Complete == 0
	lxv		vs4, DISP32(\Index, 192+\OffsetA)(\AREG)
	lxv		vs5, DISP32(\Index, 208+\OffsetA)(\AREG)
.endif
	xvmaddadp	vs52, vs12, vs30
	xvmaddadp	vs53, vs13, vs30
	xvmaddadp	vs54, vs14, vs30
	xvmaddadp	vs55, vs15, vs30
.if \Complete == 0
	lxv		vs6, DISP32(\Index, 224+\OffsetA)(\AREG)
	lxv		vs7, DISP32(\Index, 240+\OffsetA)(\AREG)
.endif
	xvmaddadp	vs56, vs8, vs31
	xvmaddadp	vs57, vs9, vs31
	xvmaddadp	vs58, vs10, vs31
	xvmaddadp	vs59, vs11, vs31

	xvmaddadp	vs60, vs12, vs31
	xvmaddadp	vs61, vs13, vs31
	xvmaddadp	vs62, vs14, vs31
	xvmaddadp	vs63, vs15, vs31

	/* ---- pointer advance on the last unrolled step ---- */
.if \IsLast == 1
.if \Complete == 1
	addi		\AREG, \AREG, DISP32(\Index, 128+\OffsetA)
	addi		\BREG, \BREG, DISP8(\Index, 32+\OffsetB)
.else
	addi		\AREG, \AREG, DISP32(\Index, 256)
	addi		\BREG, \BREG, DISP8(\Index, 64)
.endif
.endif

.endm
323
324
325
/*
 * KERNEL4x16 First
 *
 * Single self-contained 4x16 update: loads 32 bytes of B and 128 bytes of A,
 * advances BO by 32 and AO by 128, then updates vs32 .. vs63.
 * First = 1 overwrites the accumulators with the products (xvmuldp);
 * any other value accumulates into them (xvmaddadp).
 */
.macro KERNEL4x16 First

	lxv		vs24, 0(BO)
	lxv		vs26, 16(BO)
	xxpermdi	vs25, vs24, vs24, 2
	xxpermdi	vs27, vs26, vs26, 2

	lxv		vs0, 0(AO)
	lxv		vs1, 16(AO)
	lxv		vs2, 32(AO)
	lxv		vs3, 48(AO)

	lxv		vs4, 64(AO)
	lxv		vs5, 80(AO)
	lxv		vs6, 96(AO)
	lxv		vs7, 112(AO)

	addi		BO, BO, 32
	addi		AO, AO, 128

.if \First == 1
	/* B lane vs24 */
	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
	xvmuldp		vs36, vs4, vs24
	xvmuldp		vs37, vs5, vs24
	xvmuldp		vs38, vs6, vs24
	xvmuldp		vs39, vs7, vs24

	/* B lane vs25 */
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25
	xvmuldp		vs44, vs4, vs25
	xvmuldp		vs45, vs5, vs25
	xvmuldp		vs46, vs6, vs25
	xvmuldp		vs47, vs7, vs25

	/* B lane vs26 */
	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs49, vs1, vs26
	xvmuldp		vs50, vs2, vs26
	xvmuldp		vs51, vs3, vs26
	xvmuldp		vs52, vs4, vs26
	xvmuldp		vs53, vs5, vs26
	xvmuldp		vs54, vs6, vs26
	xvmuldp		vs55, vs7, vs26

	/* B lane vs27 */
	xvmuldp		vs56, vs0, vs27
	xvmuldp		vs57, vs1, vs27
	xvmuldp		vs58, vs2, vs27
	xvmuldp		vs59, vs3, vs27
	xvmuldp		vs60, vs4, vs27
	xvmuldp		vs61, vs5, vs27
	xvmuldp		vs62, vs6, vs27
	xvmuldp		vs63, vs7, vs27
.else
	/* B lane vs24 */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
	xvmaddadp	vs36, vs4, vs24
	xvmaddadp	vs37, vs5, vs24
	xvmaddadp	vs38, vs6, vs24
	xvmaddadp	vs39, vs7, vs24

	/* B lane vs25 */
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25
	xvmaddadp	vs44, vs4, vs25
	xvmaddadp	vs45, vs5, vs25
	xvmaddadp	vs46, vs6, vs25
	xvmaddadp	vs47, vs7, vs25

	/* B lane vs26 */
	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs49, vs1, vs26
	xvmaddadp	vs50, vs2, vs26
	xvmaddadp	vs51, vs3, vs26
	xvmaddadp	vs52, vs4, vs26
	xvmaddadp	vs53, vs5, vs26
	xvmaddadp	vs54, vs6, vs26
	xvmaddadp	vs55, vs7, vs26

	/* B lane vs27 */
	xvmaddadp	vs56, vs0, vs27
	xvmaddadp	vs57, vs1, vs27
	xvmaddadp	vs58, vs2, vs27
	xvmaddadp	vs59, vs3, vs27
	xvmaddadp	vs60, vs4, vs27
	xvmaddadp	vs61, vs5, vs27
	xvmaddadp	vs62, vs6, vs27
	xvmaddadp	vs63, vs7, vs27
.endif
.endm
425
/* SAVE4x16_REGS: derive the three further output pointers used by SAVE4x16,
   each one LDC past the previous: C2 = CO + LDC, C3 = C2 + LDC, C4 = C3 + LDC. */
.macro SAVE4x16_REGS
	add		C2, CO, LDC
	add		C3, C2, LDC
	add		C4, C3, LDC
.endm
431
/*
 * SAVE4x16
 *
 * Writes the 4x16 accumulator tile (vs32 .. vs63) to the four output rows
 * addressed by CO, C2, C3 and C4 (128 bytes each), then advances CO by 128.
 * C2 .. C4 are read here but not computed here.
 *
 * Each pair of accumulators (vs32+k, vs40+k) and (vs48+k, vs56+k) is
 * recombined with xxpermdi (selector 1) into one vector for the first row of
 * the pair and one for the second row.
 *
 * Without TRMMKERNEL: out = old_out + recombined * alpha_r (xvmaddadp).
 * With TRMMKERNEL:    out = recombined * alpha_r (xvmuldp); memory is not read.
 *
 * Clobbers vs0 .. vs15 and vs24 .. vs31.
 */
.macro SAVE4x16

	/* ================= rows CO and C2 ================= */
#ifndef TRMMKERNEL
	lxv		vs0, 0(CO)
	lxv		vs2, 16(CO)
	lxv		vs4, 32(CO)
	lxv		vs6, 48(CO)
#endif
	xxpermdi	vs8, vs40, vs32, 1
	xxpermdi	vs9, vs32, vs40, 1
#ifndef TRMMKERNEL
	lxv		vs24, 64(CO)
	lxv		vs26, 80(CO)
	lxv		vs28, 96(CO)
	lxv		vs30, 112(CO)
#endif
	xxpermdi	vs10, vs41, vs33, 1
	xxpermdi	vs11, vs33, vs41, 1
#ifndef TRMMKERNEL
	lxv		vs1, 0(C2)
	lxv		vs3, 16(C2)
	lxv		vs5, 32(C2)
	lxv		vs7, 48(C2)
#endif
	xxpermdi	vs12, vs42, vs34, 1
	xxpermdi	vs13, vs34, vs42, 1
#ifndef TRMMKERNEL
	lxv		vs25, 64(C2)
	lxv		vs27, 80(C2)
#endif
	xxpermdi	vs14, vs43, vs35, 1
	xxpermdi	vs15, vs35, vs43, 1

#ifndef TRMMKERNEL
	lxv		vs29, 96(C2)
	lxv		vs31, 112(C2)

	xvmaddadp	vs0, vs8, alpha_r
	xvmaddadp	vs1, vs9, alpha_r
	xvmaddadp	vs2, vs10, alpha_r
	xvmaddadp	vs3, vs11, alpha_r
#else
	xvmuldp		vs0, vs8, alpha_r
	xvmuldp		vs1, vs9, alpha_r
	xvmuldp		vs2, vs10, alpha_r
	xvmuldp		vs3, vs11, alpha_r
#endif
	/* vs8 .. vs11 are free again: recombine the next accumulator pairs */
	xxpermdi	vs8, vs44, vs36, 1
	xxpermdi	vs9, vs36, vs44, 1
	xxpermdi	vs10, vs45, vs37, 1
	xxpermdi	vs11, vs37, vs45, 1
#ifndef TRMMKERNEL
	xvmaddadp	vs4, vs12, alpha_r
	xvmaddadp	vs5, vs13, alpha_r
	xvmaddadp	vs6, vs14, alpha_r
	xvmaddadp	vs7, vs15, alpha_r
#else
	xvmuldp		vs4, vs12, alpha_r
	xvmuldp		vs5, vs13, alpha_r
	xvmuldp		vs6, vs14, alpha_r
	xvmuldp		vs7, vs15, alpha_r
#endif
	xxpermdi	vs12, vs46, vs38, 1
	xxpermdi	vs13, vs38, vs46, 1
	xxpermdi	vs14, vs47, vs39, 1
	xxpermdi	vs15, vs39, vs47, 1

#ifndef TRMMKERNEL
	xvmaddadp	vs24, vs8, alpha_r
	xvmaddadp	vs25, vs9, alpha_r
	xvmaddadp	vs26, vs10, alpha_r
	xvmaddadp	vs27, vs11, alpha_r

	xvmaddadp	vs28, vs12, alpha_r
	xvmaddadp	vs29, vs13, alpha_r
	xvmaddadp	vs30, vs14, alpha_r
	xvmaddadp	vs31, vs15, alpha_r
#else
	xvmuldp		vs24, vs8, alpha_r
	xvmuldp		vs25, vs9, alpha_r
	xvmuldp		vs26, vs10, alpha_r
	xvmuldp		vs27, vs11, alpha_r

	xvmuldp		vs28, vs12, alpha_r
	xvmuldp		vs29, vs13, alpha_r
	xvmuldp		vs30, vs14, alpha_r
	xvmuldp		vs31, vs15, alpha_r
#endif

	stxv		vs0, 0(CO)
	stxv		vs2, 16(CO)
	stxv		vs4, 32(CO)
	stxv		vs6, 48(CO)

	stxv		vs24, 64(CO)
	stxv		vs26, 80(CO)
	stxv		vs28, 96(CO)
	stxv		vs30, 112(CO)

	stxv		vs1, 0(C2)
	stxv		vs3, 16(C2)
	stxv		vs5, 32(C2)
	stxv		vs7, 48(C2)

	stxv		vs25, 64(C2)
	stxv		vs27, 80(C2)
	stxv		vs29, 96(C2)
	stxv		vs31, 112(C2)

	/* ================= rows C3 and C4 ================= */
#ifndef TRMMKERNEL
	lxv		vs0, 0(C3)
	lxv		vs2, 16(C3)
	lxv		vs4, 32(C3)
	lxv		vs6, 48(C3)
#endif
	xxpermdi	vs8, vs56, vs48, 1
	xxpermdi	vs9, vs48, vs56, 1
#ifndef TRMMKERNEL
	lxv		vs24, 64(C3)
	lxv		vs26, 80(C3)
#endif
	xxpermdi	vs10, vs57, vs49, 1
	xxpermdi	vs11, vs49, vs57, 1
#ifndef TRMMKERNEL
	lxv		vs28, 96(C3)
	lxv		vs30, 112(C3)
#endif
	xxpermdi	vs12, vs58, vs50, 1
	xxpermdi	vs13, vs50, vs58, 1
#ifndef TRMMKERNEL
	lxv		vs1, 0(C4)
	lxv		vs3, 16(C4)
#endif
	xxpermdi	vs14, vs59, vs51, 1
	xxpermdi	vs15, vs51, vs59, 1

#ifndef TRMMKERNEL
	lxv		vs5, 32(C4)
	lxv		vs7, 48(C4)

	lxv		vs25, 64(C4)
	lxv		vs27, 80(C4)
	lxv		vs29, 96(C4)
	lxv		vs31, 112(C4)

	xvmaddadp	vs0, vs8, alpha_r
	xvmaddadp	vs1, vs9, alpha_r
	xvmaddadp	vs2, vs10, alpha_r
	xvmaddadp	vs3, vs11, alpha_r
#else
	xvmuldp		vs0, vs8, alpha_r
	xvmuldp		vs1, vs9, alpha_r
	xvmuldp		vs2, vs10, alpha_r
	xvmuldp		vs3, vs11, alpha_r
#endif

	xxpermdi	vs8, vs60, vs52, 1
	xxpermdi	vs9, vs52, vs60, 1
	xxpermdi	vs10, vs61, vs53, 1
	xxpermdi	vs11, vs53, vs61, 1
#ifndef TRMMKERNEL
	xvmaddadp	vs4, vs12, alpha_r
	xvmaddadp	vs5, vs13, alpha_r
	xvmaddadp	vs6, vs14, alpha_r
	xvmaddadp	vs7, vs15, alpha_r
#else
	xvmuldp		vs4, vs12, alpha_r
	xvmuldp		vs5, vs13, alpha_r
	xvmuldp		vs6, vs14, alpha_r
	xvmuldp		vs7, vs15, alpha_r
#endif

	xxpermdi	vs12, vs62, vs54, 1
	xxpermdi	vs13, vs54, vs62, 1
	xxpermdi	vs14, vs63, vs55, 1
	xxpermdi	vs15, vs55, vs63, 1
#ifndef TRMMKERNEL
	xvmaddadp	vs24, vs8, alpha_r
	xvmaddadp	vs25, vs9, alpha_r
	xvmaddadp	vs26, vs10, alpha_r
	xvmaddadp	vs27, vs11, alpha_r

	xvmaddadp	vs28, vs12, alpha_r
	xvmaddadp	vs29, vs13, alpha_r
	xvmaddadp	vs30, vs14, alpha_r
	xvmaddadp	vs31, vs15, alpha_r
#else
	xvmuldp		vs24, vs8, alpha_r
	xvmuldp		vs25, vs9, alpha_r
	xvmuldp		vs26, vs10, alpha_r
	xvmuldp		vs27, vs11, alpha_r

	xvmuldp		vs28, vs12, alpha_r
	xvmuldp		vs29, vs13, alpha_r
	xvmuldp		vs30, vs14, alpha_r
	xvmuldp		vs31, vs15, alpha_r
#endif

	stxv		vs0, 0(C3)
	stxv		vs2, 16(C3)
	stxv		vs4, 32(C3)
	stxv		vs6, 48(C3)

	stxv		vs24, 64(C3)
	stxv		vs26, 80(C3)
	stxv		vs28, 96(C3)
	stxv		vs30, 112(C3)

	stxv		vs1, 0(C4)
	stxv		vs3, 16(C4)
	stxv		vs5, 32(C4)
	stxv		vs7, 48(C4)

	stxv		vs25, 64(C4)
	stxv		vs27, 80(C4)
	stxv		vs29, 96(C4)
	stxv		vs31, 112(C4)

	/* move on to the next 16-element block of the output row */
	addi		CO, CO, 128
.endm
654
655/*********************************************************************
656* Macros for N=4, M=8                                                *
657*********************************************************************/
658
/* LOAD4x8_1: prime the 4x8 operand registers and clear the accumulators
   (forwards to LOAD4x8 with Zero = 1). */
.macro LOAD4x8_1
	LOAD4x8		1
.endm
662
/* LOAD4x8_0: prime the 4x8 operand registers without touching the
   accumulators (forwards to LOAD4x8 with Zero = 0). */
.macro LOAD4x8_0
	LOAD4x8		0
.endm
/*
 * LOAD4x8 Zero
 *
 * Loads the operand registers consumed by the 4x8 kernels:
 *   vs24, vs26  <- 32 bytes at BO
 *   vs25, vs27  <- copies of vs24 / vs26 with the two doublewords swapped
 *   vs0 .. vs3  <- 64 bytes at AO
 * When Zero is 1 the sixteen accumulators used by the 4x8 tile
 * (vs32-35, vs40-43, vs48-51, vs56-59) are cleared as well.
 * AO and BO are not advanced here.
 */
.macro LOAD4x8 Zero

	/* B panel */
	lxv		vs24, 0(BO)
	lxv		vs26, 16(BO)
	xxpermdi	vs25, vs24, vs24, 2
	xxpermdi	vs27, vs26, vs26, 2

	/* A panel */
	lxv		vs0, 0(AO)
	lxv		vs1, 16(AO)
	lxv		vs2, 32(AO)
	lxv		vs3, 48(AO)

.if \Zero == 1
	xxlxor		vs32, vs32, vs32
	xxlxor		vs33, vs33, vs33
	xxlxor		vs34, vs34, vs34
	xxlxor		vs35, vs35, vs35

	xxlxor		vs40, vs40, vs40
	xxlxor		vs41, vs41, vs41
	xxlxor		vs42, vs42, vs42
	xxlxor		vs43, vs43, vs43

	xxlxor		vs48, vs48, vs48
	xxlxor		vs49, vs49, vs49
	xxlxor		vs50, vs50, vs50
	xxlxor		vs51, vs51, vs51

	xxlxor		vs56, vs56, vs56
	xxlxor		vs57, vs57, vs57
	xxlxor		vs58, vs58, vs58
	xxlxor		vs59, vs59, vs59
.endif
.endm
703
704
705
/* KERNEL4x8_L1_L2 Index, IsLast
   Accumulating 4x8 double step with zero extra offsets
   (First = 0, Complete = 0). */
.macro KERNEL4x8_L1_L2 Index, IsLast
	KERNEL4x8_L1_L2_I	0, 0, 0, \Index, \IsLast, 0
.endm
709
710
711
/* KERNEL4x8_I1_L2 OffsetA, OffsetB, Index, IsLast
   Initial 4x8 double step: the first half overwrites the accumulators
   (First = 1) and the next operands are reloaded (Complete = 0). */
.macro KERNEL4x8_I1_L2 OffsetA, OffsetB, Index, IsLast
	KERNEL4x8_L1_L2_I	1, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
715
/* KERNEL4x8_I1_L2_2 OffsetA, OffsetB, Index, IsLast
   Accumulating 4x8 double step with explicit offsets
   (First = 0, Complete = 0). */
.macro KERNEL4x8_I1_L2_2 OffsetA, OffsetB, Index, IsLast
	KERNEL4x8_L1_L2_I	0, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
719
/* KERNEL4x8_I1_L2_3 OffsetA, OffsetB, Index, IsLast
   Closing 4x8 double step: accumulates (First = 0) and does not reload
   operands for a following step (Complete = 1). */
.macro KERNEL4x8_I1_L2_3 OffsetA, OffsetB, Index, IsLast
	KERNEL4x8_L1_L2_I	0, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
723
/*
 * KERNEL4x8_L1_L2_I First, OffsetA, OffsetB, Index, IsLast, Complete
 *
 * Two back-to-back updates of the sixteen 4x8 accumulators
 * (vs32-35, vs40-43, vs48-51, vs56-59), always addressing through AO / BO.
 *
 *   Step 1 uses the operands already held in vs0 .. vs3 (A) and
 *          vs24 .. vs27 (B). With First = 1 the products overwrite the
 *          accumulators (xvmuldp), otherwise they are added (xvmaddadp).
 *   Step 2 uses vs8 .. vs11 (A) and vs28 .. vs31 (B), loaded inside this
 *          macro while step 1 is being issued.
 *
 * OffsetA, OffsetB extra byte displacements added to every load
 * Index            block index: scales A displacements by 128 bytes (DISP16)
 *                  and B displacements by 64 bytes (DISP8)
 * Complete         0: reload vs0 .. vs3 / vs24 .. vs27 for a following call
 *                  1: skip those reloads
 * IsLast           1: advance AO / BO past the consumed data. With
 *                  Complete = 1 the advance includes OffsetA / OffsetB; with
 *                  Complete = 0 the offsets are not added.
 *
 * The step-2 B loads (vs28, vs30) are issued for every value of First.
 * They used to sit only in the accumulate branch, so a First = 1 expansion
 * ran step 2 on whatever vs28 .. vs31 happened to contain. For First = 0 the
 * emitted instruction sequence is unchanged.
 */
.macro KERNEL4x8_L1_L2_I First, OffsetA, OffsetB, Index, IsLast, Complete

	lxv		vs8, DISP16(\Index, 0+\OffsetA)(AO)
	lxv		vs9, DISP16(\Index, 16+\OffsetA)(AO)

	/* ---- step 1, B lane vs24 ---- */
.if \First == 1
	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
.else
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
.endif

	lxv		vs10, DISP16(\Index, 32+\OffsetA)(AO)
	lxv		vs11, DISP16(\Index, 48+\OffsetA)(AO)

	/* B operands for step 2: needed whether or not this is the first call */
	lxv		vs28, DISP8(\Index, 0+\OffsetB)(BO)
	lxv		vs30, DISP8(\Index, 16+\OffsetB)(BO)

	/* ---- step 1, B lanes vs25 and vs26 ---- */
.if \First == 1
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25

	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs49, vs1, vs26
	xvmuldp		vs50, vs2, vs26
	xvmuldp		vs51, vs3, vs26
.else
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25

	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs49, vs1, vs26
	xvmaddadp	vs50, vs2, vs26
	xvmaddadp	vs51, vs3, vs26
.endif
	/* doubleword-swapped copies of the step-2 B operands */
	xxpermdi	vs29, vs28, vs28, 2
	xxpermdi	vs31, vs30, vs30, 2

	/* ---- step 1, B lane vs27 ---- */
.if \First == 1
	xvmuldp		vs56, vs0, vs27
	xvmuldp		vs57, vs1, vs27
	xvmuldp		vs58, vs2, vs27
	xvmuldp		vs59, vs3, vs27
.else
	xvmaddadp	vs56, vs0, vs27
	xvmaddadp	vs57, vs1, vs27
	xvmaddadp	vs58, vs2, vs27
	xvmaddadp	vs59, vs3, vs27
.endif

	/* ---- step 2: always accumulates; reloads for the next call are
	        interleaved when Complete is 0 ---- */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28
.if \Complete == 0
	lxv		vs0, DISP16(\Index, 64+\OffsetA)(AO)
	lxv		vs1, DISP16(\Index, 80+\OffsetA)(AO)
.endif

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
	xvmaddadp	vs42, vs10, vs29
	xvmaddadp	vs43, vs11, vs29
.if \Complete == 0
	lxv		vs2, DISP16(\Index, 96+\OffsetA)(AO)
	lxv		vs3, DISP16(\Index, 112+\OffsetA)(AO)
.endif

	xvmaddadp	vs48, vs8, vs30
	xvmaddadp	vs49, vs9, vs30
	xvmaddadp	vs50, vs10, vs30
	xvmaddadp	vs51, vs11, vs30
.if \Complete == 0
	lxv		vs24, DISP8(\Index, 32+\OffsetB)(BO)
	lxv		vs26, DISP8(\Index, 48+\OffsetB)(BO)
.endif

	xvmaddadp	vs56, vs8, vs31
	xvmaddadp	vs57, vs9, vs31
	xvmaddadp	vs58, vs10, vs31
	xvmaddadp	vs59, vs11, vs31
.if \Complete == 0
	xxpermdi	vs25, vs24, vs24, 2
	xxpermdi	vs27, vs26, vs26, 2
.endif

	/* ---- pointer advance on the last unrolled step ---- */
.if \IsLast == 1
.if \Complete == 1
	addi		AO, AO, DISP16(\Index, 64+\OffsetA)
	addi		BO, BO, DISP8(\Index, 32+\OffsetB)
.else
	addi		AO, AO, DISP16(\Index, 128)
	addi		BO, BO, DISP8(\Index, 64)
.endif
.endif

.endm
842
843
844
/*
 * KERNEL4x8 First
 *
 * Single self-contained 4x8 update: loads 32 bytes of B and 64 bytes of A,
 * advances BO by 32 and AO by 64, then updates the sixteen accumulators
 * vs32-35, vs40-43, vs48-51 and vs56-59.
 * First = 1 overwrites the accumulators with the products (xvmuldp);
 * any other value accumulates into them (xvmaddadp).
 */
.macro KERNEL4x8 First

	lxv		vs24, 0(BO)
	lxv		vs26, 16(BO)
	xxpermdi	vs25, vs24, vs24, 2
	xxpermdi	vs27, vs26, vs26, 2

	lxv		vs0, 0(AO)
	lxv		vs1, 16(AO)
	lxv		vs2, 32(AO)
	lxv		vs3, 48(AO)

	addi		BO, BO, 32
	addi		AO, AO, 64

.if \First == 1
	/* B lane vs24 */
	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24

	/* B lane vs25 */
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25

	/* B lane vs26 */
	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs49, vs1, vs26
	xvmuldp		vs50, vs2, vs26
	xvmuldp		vs51, vs3, vs26

	/* B lane vs27 */
	xvmuldp		vs56, vs0, vs27
	xvmuldp		vs57, vs1, vs27
	xvmuldp		vs58, vs2, vs27
	xvmuldp		vs59, vs3, vs27
.else
	/* B lane vs24 */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24

	/* B lane vs25 */
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25

	/* B lane vs26 */
	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs49, vs1, vs26
	xvmaddadp	vs50, vs2, vs26
	xvmaddadp	vs51, vs3, vs26

	/* B lane vs27 */
	xvmaddadp	vs56, vs0, vs27
	xvmaddadp	vs57, vs1, vs27
	xvmaddadp	vs58, vs2, vs27
	xvmaddadp	vs59, vs3, vs27
.endif
.endm
916
917
918
/*
 * SAVE4x8
 *
 * Writes the 4x8 accumulator tile (vs32-35, vs40-43, vs48-51, vs56-59) to
 * four output rows of 64 bytes each, then advances CO by 64. The row
 * pointers are computed here: T2 = CO + LDC, T3 = T2 + LDC, T4 = T3 + LDC.
 *
 * Accumulator pairs (vs32+k, vs40+k) and (vs48+k, vs56+k) are recombined
 * with xxpermdi (selector 1) into one vector per output row.
 *
 * Without TRMMKERNEL: out = old_out + recombined * alpha_r (xvmaddadp).
 * With TRMMKERNEL:    out = recombined * alpha_r (xvmuldp); memory is not read.
 *
 * Clobbers T2, T3, T4 and vs0 .. vs15.
 */
.macro SAVE4x8
	add		T2, CO, LDC
	add		T3, T2, LDC
	add		T4, T3, LDC

	/* ================= rows CO and T2 ================= */
#ifndef TRMMKERNEL
	lxv		vs0, 0(CO)
	lxv		vs2, 16(CO)
#endif
	xxpermdi	vs8, vs40, vs32, 1
	xxpermdi	vs9, vs32, vs40, 1
#ifndef TRMMKERNEL
	lxv		vs4, 32(CO)
	lxv		vs6, 48(CO)
#endif
	xxpermdi	vs10, vs41, vs33, 1
	xxpermdi	vs11, vs33, vs41, 1
#ifndef TRMMKERNEL
	lxv		vs1, 0(T2)
	lxv		vs3, 16(T2)
#endif
	xxpermdi	vs12, vs42, vs34, 1
	xxpermdi	vs13, vs34, vs42, 1
#ifndef TRMMKERNEL
	lxv		vs5, 32(T2)
	lxv		vs7, 48(T2)
#endif
	xxpermdi	vs14, vs43, vs35, 1
	xxpermdi	vs15, vs35, vs43, 1

#ifndef TRMMKERNEL
	xvmaddadp	vs0, vs8, alpha_r
	xvmaddadp	vs1, vs9, alpha_r
	xvmaddadp	vs2, vs10, alpha_r
	xvmaddadp	vs3, vs11, alpha_r

	xvmaddadp	vs4, vs12, alpha_r
	xvmaddadp	vs5, vs13, alpha_r
	xvmaddadp	vs6, vs14, alpha_r
	xvmaddadp	vs7, vs15, alpha_r
#else
	xvmuldp		vs0, vs8, alpha_r
	xvmuldp		vs1, vs9, alpha_r
	xvmuldp		vs2, vs10, alpha_r
	xvmuldp		vs3, vs11, alpha_r

	xvmuldp		vs4, vs12, alpha_r
	xvmuldp		vs5, vs13, alpha_r
	xvmuldp		vs6, vs14, alpha_r
	xvmuldp		vs7, vs15, alpha_r
#endif

	stxv		vs0, 0(CO)
	stxv		vs2, 16(CO)
	stxv		vs4, 32(CO)
	stxv		vs6, 48(CO)

	stxv		vs1, 0(T2)
	stxv		vs3, 16(T2)
	stxv		vs5, 32(T2)
	stxv		vs7, 48(T2)

	/* ================= rows T3 and T4 ================= */
	xxpermdi	vs8, vs56, vs48, 1
	xxpermdi	vs9, vs48, vs56, 1
#ifndef TRMMKERNEL
	lxv		vs0, 0(T3)
	lxv		vs2, 16(T3)
#endif
	xxpermdi	vs10, vs57, vs49, 1
	xxpermdi	vs11, vs49, vs57, 1
#ifndef TRMMKERNEL
	lxv		vs4, 32(T3)
	lxv		vs6, 48(T3)
#endif
	xxpermdi	vs12, vs58, vs50, 1
	xxpermdi	vs13, vs50, vs58, 1
#ifndef TRMMKERNEL
	lxv		vs1, 0(T4)
	lxv		vs3, 16(T4)
#endif
	xxpermdi	vs14, vs59, vs51, 1
	xxpermdi	vs15, vs51, vs59, 1
#ifndef TRMMKERNEL
	lxv		vs5, 32(T4)
	lxv		vs7, 48(T4)
#endif

#ifndef TRMMKERNEL
	xvmaddadp	vs0, vs8, alpha_r
	xvmaddadp	vs1, vs9, alpha_r
	xvmaddadp	vs2, vs10, alpha_r
	xvmaddadp	vs3, vs11, alpha_r

	xvmaddadp	vs4, vs12, alpha_r
	xvmaddadp	vs5, vs13, alpha_r
	xvmaddadp	vs6, vs14, alpha_r
	xvmaddadp	vs7, vs15, alpha_r
#else
	xvmuldp		vs0, vs8, alpha_r
	xvmuldp		vs1, vs9, alpha_r
	xvmuldp		vs2, vs10, alpha_r
	xvmuldp		vs3, vs11, alpha_r

	xvmuldp		vs4, vs12, alpha_r
	xvmuldp		vs5, vs13, alpha_r
	xvmuldp		vs6, vs14, alpha_r
	xvmuldp		vs7, vs15, alpha_r
#endif

	stxv		vs0, 0(T3)
	stxv		vs2, 16(T3)
	stxv		vs4, 32(T3)
	stxv		vs6, 48(T3)

	stxv		vs1, 0(T4)
	stxv		vs3, 16(T4)
	stxv		vs5, 32(T4)
	stxv		vs7, 48(T4)

	/* move on to the next 8-element block of the output row */
	addi		CO, CO, 64
.endm
1053
1054
1055/*********************************************************************
1056* Macros for N=4, M=4                                                *
1057*********************************************************************/
1058
/*
 * LOAD4x4_1
 *
 * Loads the first operand set for the 4x4 kernels and advances both pointers:
 *   vs0, vs1     <- 32 bytes at AO
 *   vs24 .. vs27 <- four doublewords at BO, each splatted across a vector
 * AO += 32, BO += 32.
 * o8 / o16 / o24 are index registers defined outside this macro
 * (presumably holding 8, 16 and 24 - confirm against the including file).
 */
.macro LOAD4x4_1

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO

	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO

	addi		AO, AO, 32
	addi		BO, BO, 32

.endm
1073
/*
 * KERNEL4x4_I1
 *
 * Initial pipelined 4x4 step: loads the second operand set
 * (vs8, vs9 / vs28 .. vs31), advances AO and BO by 32, and initialises the
 * accumulators vs32/33, vs40/41, vs48/49, vs56/57 with the products of the
 * first operand set (vs0, vs1 / vs24 .. vs27).
 */
.macro KERNEL4x4_I1

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO

	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	lxvdsx		vs30, o16, BO
	lxvdsx		vs31, o24, BO

	addi		AO, AO, 32
	addi		BO, BO, 32

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24

	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25

	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs49, vs1, vs26

	xvmuldp		vs56, vs0, vs27
	xvmuldp		vs57, vs1, vs27

.endm
1101
/*
 * KERNEL4x4_1
 *
 * Pipelined 4x4 step, first phase: loads the second operand set
 * (vs8, vs9 / vs28 .. vs31), advances AO and BO by 32, and accumulates the
 * products of the first operand set (vs0, vs1 / vs24 .. vs27).
 */
.macro KERNEL4x4_1

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO

	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	lxvdsx		vs30, o16, BO
	lxvdsx		vs31, o24, BO

	addi		AO, AO, 32
	addi		BO, BO, 32

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24

	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25

	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs49, vs1, vs26

	xvmaddadp	vs56, vs0, vs27
	xvmaddadp	vs57, vs1, vs27

.endm
1129
/*
 * KERNEL4x4_2
 *
 * Pipelined 4x4 step, second phase: reloads the first operand set
 * (vs0, vs1 / vs24 .. vs27), advances AO and BO by 32, and accumulates the
 * products of the second operand set (vs8, vs9 / vs28 .. vs31).
 */
.macro KERNEL4x4_2

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO

	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO

	addi		AO, AO, 32
	addi		BO, BO, 32

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29

	xvmaddadp	vs48, vs8, vs30
	xvmaddadp	vs49, vs9, vs30

	xvmaddadp	vs56, vs8, vs31
	xvmaddadp	vs57, vs9, vs31

.endm
1157
/*
 * KERNEL4x4_E2
 *
 * Pipeline drain for the 4x4 kernel: accumulates the products of the second
 * operand set (vs8, vs9 / vs28 .. vs31) without loading anything and without
 * moving AO or BO.
 */
.macro KERNEL4x4_E2

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29

	xvmaddadp	vs48, vs8, vs30
	xvmaddadp	vs49, vs9, vs30

	xvmaddadp	vs56, vs8, vs31
	xvmaddadp	vs57, vs9, vs31

.endm
1174
/*
 * KERNEL4x4_SUBI1
 *
 * Stand-alone initial 4x4 step: loads one operand set
 * (vs0, vs1 / vs24 .. vs27), advances AO and BO by 32, and initialises the
 * accumulators vs32/33, vs40/41, vs48/49, vs56/57 with its products.
 */
.macro KERNEL4x4_SUBI1

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO

	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO

	addi		AO, AO, 32
	addi		BO, BO, 32

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24

	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25

	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs49, vs1, vs26

	xvmuldp		vs56, vs0, vs27
	xvmuldp		vs57, vs1, vs27

.endm
1202
/*
 * KERNEL4x4_SUB1
 *
 * Stand-alone accumulating 4x4 step: loads one operand set
 * (vs0, vs1 / vs24 .. vs27), advances AO and BO by 32, and adds its products
 * to the accumulators vs32/33, vs40/41, vs48/49, vs56/57.
 */
.macro KERNEL4x4_SUB1

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO

	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO

	addi		AO, AO, 32
	addi		BO, BO, 32

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24

	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25

	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs49, vs1, vs26

	xvmaddadp	vs56, vs0, vs27
	xvmaddadp	vs57, vs1, vs27

.endm
1230
/*
 * SAVE4x4
 *
 * Writes the 4x4 accumulator tile to four output rows of 32 bytes each.
 * T1 starts at CO and is stepped by LDC after every row; CO is advanced by
 * 32 at the end.
 *
 *   row 0 <- vs32, vs33      row 2 <- vs48, vs49
 *   row 1 <- vs40, vs41      row 3 <- vs56, vs57
 *
 * Without TRMMKERNEL: out = old_out + acc * alpha_r (xvmaddadp).
 * With TRMMKERNEL:    out = acc * alpha_r (xvmuldp); memory is not read.
 *
 * Clobbers T1, vs0, vs1, vs8 and vs9.
 */
.macro SAVE4x4

	mr		T1, CO

	/* ---- row 0 ---- */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	lxvd2x		vs1, o16, T1
	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
	xvmuldp		vs1, vs33, alpha_r
#endif
	stxvd2x		vs0, 0, T1
	stxvd2x		vs1, o16, T1

	add		T1, T1, LDC

	/* ---- row 1 ---- */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	lxvd2x		vs9, o16, T1
	xvmaddadp	vs8, vs40, alpha_r
	xvmaddadp	vs9, vs41, alpha_r
#else
	xvmuldp		vs8, vs40, alpha_r
	xvmuldp		vs9, vs41, alpha_r
#endif
	stxvd2x		vs8, 0, T1
	stxvd2x		vs9, o16, T1

	add		T1, T1, LDC

	/* ---- row 2 ---- */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	lxvd2x		vs1, o16, T1
	xvmaddadp	vs0, vs48, alpha_r
	xvmaddadp	vs1, vs49, alpha_r
#else
	xvmuldp		vs0, vs48, alpha_r
	xvmuldp		vs1, vs49, alpha_r
#endif
	stxvd2x		vs0, 0, T1
	stxvd2x		vs1, o16, T1

	add		T1, T1, LDC

	/* ---- row 3 ---- */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	lxvd2x		vs9, o16, T1
	xvmaddadp	vs8, vs56, alpha_r
	xvmaddadp	vs9, vs57, alpha_r
#else
	xvmuldp		vs8, vs56, alpha_r
	xvmuldp		vs9, vs57, alpha_r
#endif
	stxvd2x		vs8, 0, T1
	stxvd2x		vs9, o16, T1

	/* move on to the next 4-element block of the output row */
	addi		CO, CO, 32

.endm
1308
1309/*********************************************************************
1310* Macros for N=4, M=2                                                *
1311*********************************************************************/
1312
/*
 * LOAD4x2_1
 *
 * Loads the first operand set for the 4x2 kernels and advances both pointers:
 *   vs0          <- 16 bytes at AO
 *   vs24 .. vs27 <- four doublewords at BO, each splatted across a vector
 * AO += 16, BO += 32.
 */
.macro LOAD4x2_1

	lxvd2x		vs0, 0, AO

	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO

	addi		AO, AO, 16
	addi		BO, BO, 32

.endm
1326
/*
 * KERNEL4x2_I1
 *
 * Initial pipelined 4x2 step: loads the second operand set
 * (vs8 / vs28 .. vs31), advances AO by 16 and BO by 32, and initialises the
 * accumulators vs32, vs40, vs48, vs56 with the products of the first operand
 * set (vs0 / vs24 .. vs27).
 */
.macro KERNEL4x2_I1

	lxvd2x		vs8, 0, AO

	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	lxvdsx		vs30, o16, BO
	lxvdsx		vs31, o24, BO

	addi		AO, AO, 16
	addi		BO, BO, 32

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs56, vs0, vs27

.endm
1349
/*
 * KERNEL4x2_1
 *
 * Pipelined 4x2 step, first phase: loads the second operand set
 * (vs8 / vs28 .. vs31), advances AO by 16 and BO by 32, and accumulates the
 * products of the first operand set (vs0 / vs24 .. vs27).
 */
.macro KERNEL4x2_1

	lxvd2x		vs8, 0, AO

	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	lxvdsx		vs30, o16, BO
	lxvdsx		vs31, o24, BO

	addi		AO, AO, 16
	addi		BO, BO, 32

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs56, vs0, vs27

.endm
1372
.macro KERNEL4x2_2
	/*
	 * Counterpart of KERNEL4x2_1 with the register sets swapped: reload
	 * vs0 (from AO) and vs24-vs27 (splats from BO), then add
	 * vs8 * vs28..vs31 to the accumulators vs32/vs40/vs48/vs56.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO
	addi		BO, BO, 32

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs48, vs8, vs30
	xvmaddadp	vs56, vs8, vs31
.endm
1395
.macro KERNEL4x2_E2
	/*
	 * Arithmetic-only step: adds vs8 * vs28..vs31 to the accumulators
	 * vs32/vs40/vs48/vs56. Nothing is loaded and AO/BO are untouched.
	 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs48, vs8, vs30
	xvmaddadp	vs56, vs8, vs31
.endm
1408
.macro KERNEL4x2_SUBI1
	/*
	 * Self-contained initialising step: load vs0 from AO and splat
	 * vs24-vs27 from BO, advance both pointers, then overwrite the
	 * accumulators vs32/vs40/vs48/vs56 with the products.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO
	addi		BO, BO, 32

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs48, vs0, vs26
	xvmuldp		vs56, vs0, vs27
.endm
1431
.macro KERNEL4x2_SUB1
	/*
	 * Self-contained accumulating step: load vs0 from AO and splat
	 * vs24-vs27 from BO, advance both pointers, then add the products
	 * to the accumulators vs32/vs40/vs48/vs56.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	lxvdsx		vs26, o16, BO
	lxvdsx		vs27, o24, BO
	addi		BO, BO, 32

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs48, vs0, vs26
	xvmaddadp	vs56, vs0, vs27
.endm
1454
.macro SAVE4x2
	/*
	 * Store the accumulators vs32, vs40, vs48 and vs56 through T1, which
	 * starts at CO and is stepped by LDC after each of the first three
	 * stores. Without TRMMKERNEL the 16 bytes at the destination are
	 * loaded first and accumulator * alpha_r is added to them; with
	 * TRMMKERNEL only accumulator * alpha_r is written. CO finally
	 * advances by 16 bytes. vs0 and vs8 are used as scratch.
	 */
	mr		T1, CO

	/* vs32 */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	xvmaddadp	vs0, vs32, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
#endif
	stxvd2x		vs0, 0, T1
	add		T1, T1, LDC

	/* vs40 */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	xvmaddadp	vs8, vs40, alpha_r
#else
	xvmuldp		vs8, vs40, alpha_r
#endif
	stxvd2x		vs8, 0, T1
	add		T1, T1, LDC

	/* vs48 */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	xvmaddadp	vs0, vs48, alpha_r
#else
	xvmuldp		vs0, vs48, alpha_r
#endif
	stxvd2x		vs0, 0, T1
	add		T1, T1, LDC

	/* vs56 */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	xvmaddadp	vs8, vs56, alpha_r
#else
	xvmuldp		vs8, vs56, alpha_r
#endif
	stxvd2x		vs8, 0, T1

	addi		CO, CO, 16
.endm
1516
1517/*********************************************************************
1518* Macros for N=4, M=1                                                *
1519*********************************************************************/
1520
.macro LOAD4x1_1
	/*
	 * Load-only step for the N=4, M=1 macros (scalar forms): four
	 * doublewords at BO+0/o8/o16/o24 go to vs24-vs27 and one doubleword
	 * from AO goes to vs0. BO advances by 32 bytes, AO by 8 bytes.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	lxsdx		vs26, o16, BO
	lxsdx		vs27, o24, BO
	addi		BO, BO, 32

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8
.endm
1534
.macro KERNEL4x1_I1
	/*
	 * Fetch the next scalar operands into the alternate registers (vs8
	 * from AO, vs28-vs31 from BO) and overwrite the accumulators
	 * vs32/vs40/vs48/vs56 with vs0 * vs24..vs27.
	 */
	lxsdx		vs28, 0, BO
	lxsdx		vs29, o8, BO
	lxsdx		vs30, o16, BO
	lxsdx		vs31, o24, BO
	addi		BO, BO, 32

	lxsdx		vs8, 0, AO
	addi		AO, AO, 8

	xsmuldp		vs32, vs0, vs24
	xsmuldp		vs40, vs0, vs25
	xsmuldp		vs48, vs0, vs26
	xsmuldp		vs56, vs0, vs27
.endm
1557
.macro KERNEL4x1_1
	/*
	 * Load the next scalar operands into vs8 (from AO) and vs28-vs31
	 * (from BO), then add vs0 * vs24..vs27 to the accumulators
	 * vs32/vs40/vs48/vs56.
	 */
	lxsdx		vs28, 0, BO
	lxsdx		vs29, o8, BO
	lxsdx		vs30, o16, BO
	lxsdx		vs31, o24, BO
	addi		BO, BO, 32

	lxsdx		vs8, 0, AO
	addi		AO, AO, 8

	xsmaddadp	vs32, vs0, vs24
	xsmaddadp	vs40, vs0, vs25
	xsmaddadp	vs48, vs0, vs26
	xsmaddadp	vs56, vs0, vs27
.endm
1580
.macro KERNEL4x1_2
	/*
	 * Counterpart of KERNEL4x1_1 with the register sets swapped: reload
	 * vs0 (from AO) and vs24-vs27 (from BO), then add vs8 * vs28..vs31
	 * to the accumulators vs32/vs40/vs48/vs56.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	lxsdx		vs26, o16, BO
	lxsdx		vs27, o24, BO
	addi		BO, BO, 32

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8

	xsmaddadp	vs32, vs8, vs28
	xsmaddadp	vs40, vs8, vs29
	xsmaddadp	vs48, vs8, vs30
	xsmaddadp	vs56, vs8, vs31
.endm
1603
.macro KERNEL4x1_E2
	/*
	 * Arithmetic-only scalar step: adds vs8 * vs28..vs31 to the
	 * accumulators vs32/vs40/vs48/vs56. No loads, no pointer updates.
	 */
	xsmaddadp	vs32, vs8, vs28
	xsmaddadp	vs40, vs8, vs29
	xsmaddadp	vs48, vs8, vs30
	xsmaddadp	vs56, vs8, vs31
.endm
1616
.macro KERNEL4x1_SUBI1
	/*
	 * Self-contained initialising scalar step: load vs0 from AO and
	 * vs24-vs27 from BO, advance both pointers, then overwrite the
	 * accumulators vs32/vs40/vs48/vs56 with the products.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	lxsdx		vs26, o16, BO
	lxsdx		vs27, o24, BO
	addi		BO, BO, 32

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8

	xsmuldp		vs32, vs0, vs24
	xsmuldp		vs40, vs0, vs25
	xsmuldp		vs48, vs0, vs26
	xsmuldp		vs56, vs0, vs27
.endm
1639
.macro KERNEL4x1_SUB1
	/*
	 * Self-contained accumulating scalar step: load vs0 from AO and
	 * vs24-vs27 from BO, advance both pointers, then add the products
	 * to the accumulators vs32/vs40/vs48/vs56.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	lxsdx		vs26, o16, BO
	lxsdx		vs27, o24, BO
	addi		BO, BO, 32

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8

	xsmaddadp	vs32, vs0, vs24
	xsmaddadp	vs40, vs0, vs25
	xsmaddadp	vs48, vs0, vs26
	xsmaddadp	vs56, vs0, vs27
.endm
1662
.macro SAVE4x1
	/*
	 * Scalar store of the accumulators vs32, vs40, vs48 and vs56 through
	 * T1, which starts at CO and is stepped by LDC after each of the
	 * first three stores. Without TRMMKERNEL the doubleword at the
	 * destination is loaded and accumulator * alpha_r is added to it;
	 * with TRMMKERNEL only the product is written. CO finally advances
	 * by 8 bytes. vs0 and vs8 are used as scratch.
	 */
	mr		T1, CO

	/* vs32 */
#ifndef TRMMKERNEL
	lxsdx		vs0, 0, T1
	xsmaddadp	vs0, vs32, alpha_r
#else
	xsmuldp		vs0, vs32, alpha_r
#endif
	stxsdx		vs0, 0, T1
	add		T1, T1, LDC

	/* vs40 */
#ifndef TRMMKERNEL
	lxsdx		vs8, 0, T1
	xsmaddadp	vs8, vs40, alpha_r
#else
	xsmuldp		vs8, vs40, alpha_r
#endif
	stxsdx		vs8, 0, T1
	add		T1, T1, LDC

	/* vs48 */
#ifndef TRMMKERNEL
	lxsdx		vs0, 0, T1
	xsmaddadp	vs0, vs48, alpha_r
#else
	xsmuldp		vs0, vs48, alpha_r
#endif
	stxsdx		vs0, 0, T1
	add		T1, T1, LDC

	/* vs56 */
#ifndef TRMMKERNEL
	lxsdx		vs8, 0, T1
	xsmaddadp	vs8, vs56, alpha_r
#else
	xsmuldp		vs8, vs56, alpha_r
#endif
	stxsdx		vs8, 0, T1

	addi		CO, CO, 8
.endm
1724
1725/*********************************************************************
1726* Macros for N=2, M=16                                               *
1727*********************************************************************/
1728
.macro LOAD2x16_1
	/*
	 * Load-only step for the N=2, M=16 macros: splat the doublewords at
	 * BO+0/o8 into vs24/vs25 and load eight 16-byte vectors from AO
	 * into vs0-vs7. BO advances by 16 bytes; AO advances by 64 bytes
	 * after each group of four vectors (128 bytes in total).
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64
.endm
1750
.macro KERNEL2x16_I1
	/*
	 * Fetch the next operands into the alternate registers (vs8-vs15
	 * from AO, vs28/vs29 splatted from BO) and overwrite the
	 * accumulators: vs32-vs39 = vs0-vs7 * vs24, vs40-vs47 = vs0-vs7 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs12, 0, AO
	lxvd2x		vs13, o16, AO
	lxvd2x		vs14, o32, AO
	lxvd2x		vs15, o48, AO
	addi		AO, AO, 64

	/* products with vs24 */
	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
	xvmuldp		vs36, vs4, vs24
	xvmuldp		vs37, vs5, vs24
	xvmuldp		vs38, vs6, vs24
	xvmuldp		vs39, vs7, vs24

	/* products with vs25 */
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25
	xvmuldp		vs44, vs4, vs25
	xvmuldp		vs45, vs5, vs25
	xvmuldp		vs46, vs6, vs25
	xvmuldp		vs47, vs7, vs25
.endm
1791
.macro KERNEL2x16_1
	/*
	 * Load the next operands into vs8-vs15 (from AO) and vs28/vs29
	 * (splats from BO), then accumulate the current ones:
	 * vs32-vs39 += vs0-vs7 * vs24, vs40-vs47 += vs0-vs7 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs12, 0, AO
	lxvd2x		vs13, o16, AO
	lxvd2x		vs14, o32, AO
	lxvd2x		vs15, o48, AO
	addi		AO, AO, 64

	/* multiply-adds with vs24 */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
	xvmaddadp	vs36, vs4, vs24
	xvmaddadp	vs37, vs5, vs24
	xvmaddadp	vs38, vs6, vs24
	xvmaddadp	vs39, vs7, vs24

	/* multiply-adds with vs25 */
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25
	xvmaddadp	vs44, vs4, vs25
	xvmaddadp	vs45, vs5, vs25
	xvmaddadp	vs46, vs6, vs25
	xvmaddadp	vs47, vs7, vs25
.endm
1832
.macro KERNEL2x16_2
	/*
	 * Counterpart of KERNEL2x16_1 with the register sets swapped:
	 * reload vs0-vs7 (from AO) and vs24/vs25 (splats from BO), then
	 * vs32-vs39 += vs8-vs15 * vs28, vs40-vs47 += vs8-vs15 * vs29.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64

	/* multiply-adds with vs28 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28
	xvmaddadp	vs36, vs12, vs28
	xvmaddadp	vs37, vs13, vs28
	xvmaddadp	vs38, vs14, vs28
	xvmaddadp	vs39, vs15, vs28

	/* multiply-adds with vs29 */
	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
	xvmaddadp	vs42, vs10, vs29
	xvmaddadp	vs43, vs11, vs29
	xvmaddadp	vs44, vs12, vs29
	xvmaddadp	vs45, vs13, vs29
	xvmaddadp	vs46, vs14, vs29
	xvmaddadp	vs47, vs15, vs29
.endm
1873
.macro KERNEL2x16_E2
	/*
	 * Arithmetic-only step: vs32-vs39 += vs8-vs15 * vs28 and
	 * vs40-vs47 += vs8-vs15 * vs29. No loads, no pointer updates.
	 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28
	xvmaddadp	vs36, vs12, vs28
	xvmaddadp	vs37, vs13, vs28
	xvmaddadp	vs38, vs14, vs28
	xvmaddadp	vs39, vs15, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
	xvmaddadp	vs42, vs10, vs29
	xvmaddadp	vs43, vs11, vs29
	xvmaddadp	vs44, vs12, vs29
	xvmaddadp	vs45, vs13, vs29
	xvmaddadp	vs46, vs14, vs29
	xvmaddadp	vs47, vs15, vs29
.endm
1896
.macro KERNEL2x16_SUBI1
	/*
	 * Self-contained initialising step: load vs0-vs7 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then overwrite the
	 * accumulators: vs32-vs39 = vs0-vs7 * vs24, vs40-vs47 = vs0-vs7 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64

	/* products with vs24 */
	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
	xvmuldp		vs36, vs4, vs24
	xvmuldp		vs37, vs5, vs24
	xvmuldp		vs38, vs6, vs24
	xvmuldp		vs39, vs7, vs24

	/* products with vs25 */
	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25
	xvmuldp		vs44, vs4, vs25
	xvmuldp		vs45, vs5, vs25
	xvmuldp		vs46, vs6, vs25
	xvmuldp		vs47, vs7, vs25
.endm
1937
.macro KERNEL2x16_SUB1
	/*
	 * Self-contained accumulating step: load vs0-vs7 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then
	 * vs32-vs39 += vs0-vs7 * vs24 and vs40-vs47 += vs0-vs7 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64

	/* multiply-adds with vs24 */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
	xvmaddadp	vs36, vs4, vs24
	xvmaddadp	vs37, vs5, vs24
	xvmaddadp	vs38, vs6, vs24
	xvmaddadp	vs39, vs7, vs24

	/* multiply-adds with vs25 */
	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25
	xvmaddadp	vs44, vs4, vs25
	xvmaddadp	vs45, vs5, vs25
	xvmaddadp	vs46, vs6, vs25
	xvmaddadp	vs47, vs7, vs25
.endm
1978
.macro SAVE2x16
	/*
	 * Store the accumulators vs32-vs39 and then vs40-vs47 through the
	 * pointer pair T1 (= CO) and T2 (= T1 + 64); both pointers are
	 * stepped by LDC between the two groups. Without TRMMKERNEL the
	 * destination data is loaded first and accumulator * alpha_r is
	 * added to it; with TRMMKERNEL only the product is written.
	 * CO finally advances by 128 bytes. vs0-vs15 are used as scratch.
	 */
	mr		T1, CO
	addi		T2, T1, 64

	/* first group: vs32-vs39 */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	lxvd2x		vs1, o16, T1
	lxvd2x		vs2, o32, T1
	lxvd2x		vs3, o48, T1

	lxvd2x		vs4, 0, T2
	lxvd2x		vs5, o16, T2
	lxvd2x		vs6, o32, T2
	lxvd2x		vs7, o48, T2

	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
	xvmaddadp	vs2, vs34, alpha_r
	xvmaddadp	vs3, vs35, alpha_r
	xvmaddadp	vs4, vs36, alpha_r
	xvmaddadp	vs5, vs37, alpha_r
	xvmaddadp	vs6, vs38, alpha_r
	xvmaddadp	vs7, vs39, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
	xvmuldp		vs1, vs33, alpha_r
	xvmuldp		vs2, vs34, alpha_r
	xvmuldp		vs3, vs35, alpha_r
	xvmuldp		vs4, vs36, alpha_r
	xvmuldp		vs5, vs37, alpha_r
	xvmuldp		vs6, vs38, alpha_r
	xvmuldp		vs7, vs39, alpha_r
#endif

	stxvd2x		vs0, 0, T1
	stxvd2x		vs1, o16, T1
	stxvd2x		vs2, o32, T1
	stxvd2x		vs3, o48, T1

	stxvd2x		vs4, 0, T2
	stxvd2x		vs5, o16, T2
	stxvd2x		vs6, o32, T2
	stxvd2x		vs7, o48, T2

	add		T1, T1, LDC
	add		T2, T2, LDC

	/* second group: vs40-vs47 */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	lxvd2x		vs9, o16, T1
	lxvd2x		vs10, o32, T1
	lxvd2x		vs11, o48, T1

	lxvd2x		vs12, 0, T2
	lxvd2x		vs13, o16, T2
	lxvd2x		vs14, o32, T2
	lxvd2x		vs15, o48, T2

	xvmaddadp	vs8, vs40, alpha_r
	xvmaddadp	vs9, vs41, alpha_r
	xvmaddadp	vs10, vs42, alpha_r
	xvmaddadp	vs11, vs43, alpha_r
	xvmaddadp	vs12, vs44, alpha_r
	xvmaddadp	vs13, vs45, alpha_r
	xvmaddadp	vs14, vs46, alpha_r
	xvmaddadp	vs15, vs47, alpha_r
#else
	xvmuldp		vs8, vs40, alpha_r
	xvmuldp		vs9, vs41, alpha_r
	xvmuldp		vs10, vs42, alpha_r
	xvmuldp		vs11, vs43, alpha_r
	xvmuldp		vs12, vs44, alpha_r
	xvmuldp		vs13, vs45, alpha_r
	xvmuldp		vs14, vs46, alpha_r
	xvmuldp		vs15, vs47, alpha_r
#endif

	stxvd2x		vs8, 0, T1
	stxvd2x		vs9, o16, T1
	stxvd2x		vs10, o32, T1
	stxvd2x		vs11, o48, T1

	stxvd2x		vs12, 0, T2
	stxvd2x		vs13, o16, T2
	stxvd2x		vs14, o32, T2
	stxvd2x		vs15, o48, T2

	addi		CO, CO, 128
.endm
2074
2075/*********************************************************************
* Macros for N=2, M=8                                                *
2077*********************************************************************/
2078
.macro LOAD2x8_1
	/*
	 * Load-only step for the 2x8 macros: splat the doublewords at
	 * BO+0/o8 into vs24/vs25 and load four 16-byte vectors from AO into
	 * vs0-vs3. BO advances by 16 bytes, AO by 64 bytes.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64
.endm
2093
.macro KERNEL2x8_I1
	/*
	 * Fetch the next operands into the alternate registers (vs8-vs11
	 * from AO, vs28/vs29 splatted from BO) and overwrite the
	 * accumulators: vs32-vs35 = vs0-vs3 * vs24, vs40-vs43 = vs0-vs3 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24

	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25
.endm
2119
.macro KERNEL2x8_1
	/*
	 * Load the next operands into vs8-vs11 (from AO) and vs28/vs29
	 * (splats from BO), then accumulate the current ones:
	 * vs32-vs35 += vs0-vs3 * vs24, vs40-vs43 += vs0-vs3 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24

	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25
.endm
2145
.macro KERNEL2x8_2
	/*
	 * Counterpart of KERNEL2x8_1 with the register sets swapped: reload
	 * vs0-vs3 (from AO) and vs24/vs25 (splats from BO), then
	 * vs32-vs35 += vs8-vs11 * vs28, vs40-vs43 += vs8-vs11 * vs29.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
	xvmaddadp	vs42, vs10, vs29
	xvmaddadp	vs43, vs11, vs29
.endm
2171
.macro KERNEL2x8_E2
	/*
	 * Arithmetic-only step: vs32-vs35 += vs8-vs11 * vs28 and
	 * vs40-vs43 += vs8-vs11 * vs29. No loads, no pointer updates.
	 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
	xvmaddadp	vs42, vs10, vs29
	xvmaddadp	vs43, vs11, vs29
.endm
2186
.macro KERNEL2x8_SUBI1
	/*
	 * Self-contained initialising step: load vs0-vs3 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then overwrite the
	 * accumulators: vs32-vs35 = vs0-vs3 * vs24, vs40-vs43 = vs0-vs3 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24

	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
	xvmuldp		vs42, vs2, vs25
	xvmuldp		vs43, vs3, vs25
.endm
2212
.macro KERNEL2x8_SUB1
	/*
	 * Self-contained accumulating step: load vs0-vs3 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then
	 * vs32-vs35 += vs0-vs3 * vs24 and vs40-vs43 += vs0-vs3 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24

	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
	xvmaddadp	vs42, vs2, vs25
	xvmaddadp	vs43, vs3, vs25
.endm
2238
.macro SAVE2x8
	/*
	 * Store the accumulators vs32-vs35 and then vs40-vs43 through T1,
	 * which starts at CO and is stepped by LDC between the two groups.
	 * Without TRMMKERNEL the destination data is loaded first and
	 * accumulator * alpha_r is added to it; with TRMMKERNEL only the
	 * product is written. CO finally advances by 64 bytes.
	 * vs0-vs3 and vs8-vs11 are used as scratch.
	 */
	mr		T1, CO

	/* first group: vs32-vs35 */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	lxvd2x		vs1, o16, T1
	lxvd2x		vs2, o32, T1
	lxvd2x		vs3, o48, T1

	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
	xvmaddadp	vs2, vs34, alpha_r
	xvmaddadp	vs3, vs35, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
	xvmuldp		vs1, vs33, alpha_r
	xvmuldp		vs2, vs34, alpha_r
	xvmuldp		vs3, vs35, alpha_r
#endif

	stxvd2x		vs0, 0, T1
	stxvd2x		vs1, o16, T1
	stxvd2x		vs2, o32, T1
	stxvd2x		vs3, o48, T1

	add		T1, T1, LDC

	/* second group: vs40-vs43 */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	lxvd2x		vs9, o16, T1
	lxvd2x		vs10, o32, T1
	lxvd2x		vs11, o48, T1

	xvmaddadp	vs8, vs40, alpha_r
	xvmaddadp	vs9, vs41, alpha_r
	xvmaddadp	vs10, vs42, alpha_r
	xvmaddadp	vs11, vs43, alpha_r
#else
	xvmuldp		vs8, vs40, alpha_r
	xvmuldp		vs9, vs41, alpha_r
	xvmuldp		vs10, vs42, alpha_r
	xvmuldp		vs11, vs43, alpha_r
#endif

	stxvd2x		vs8, 0, T1
	stxvd2x		vs9, o16, T1
	stxvd2x		vs10, o32, T1
	stxvd2x		vs11, o48, T1

	addi		CO, CO, 64
.endm
2296
2297/*********************************************************************
2298* Macros for N=2, M=4                                                *
2299*********************************************************************/
2300
.macro LOAD2x4_1
	/*
	 * Load-only step for the N=2, M=4 macros: splat the doublewords at
	 * BO+0/o8 into vs24/vs25 and load two 16-byte vectors from AO into
	 * vs0/vs1. BO advances by 16 bytes, AO by 32 bytes.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	addi		AO, AO, 32
.endm
2313
.macro KERNEL2x4_I1
	/*
	 * Fetch the next operands into the alternate registers (vs8/vs9
	 * from AO, vs28/vs29 splatted from BO) and overwrite the
	 * accumulators: vs32/vs33 = vs0/vs1 * vs24, vs40/vs41 = vs0/vs1 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	addi		AO, AO, 32

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24

	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
.endm
2333
.macro KERNEL2x4_1
	/*
	 * Load the next operands into vs8/vs9 (from AO) and vs28/vs29
	 * (splats from BO), then accumulate the current ones:
	 * vs32/vs33 += vs0/vs1 * vs24, vs40/vs41 += vs0/vs1 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	addi		AO, AO, 32

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24

	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
.endm
2353
.macro KERNEL2x4_2
	/*
	 * Counterpart of KERNEL2x4_1 with the register sets swapped: reload
	 * vs0/vs1 (from AO) and vs24/vs25 (splats from BO), then
	 * vs32/vs33 += vs8/vs9 * vs28, vs40/vs41 += vs8/vs9 * vs29.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	addi		AO, AO, 32

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
.endm
2373
.macro KERNEL2x4_E2
	/*
	 * Arithmetic-only step: vs32/vs33 += vs8/vs9 * vs28 and
	 * vs40/vs41 += vs8/vs9 * vs29. No loads, no pointer updates.
	 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28

	xvmaddadp	vs40, vs8, vs29
	xvmaddadp	vs41, vs9, vs29
.endm
2384
.macro KERNEL2x4_SUBI1
	/*
	 * Self-contained initialising step: load vs0/vs1 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then overwrite the
	 * accumulators: vs32/vs33 = vs0/vs1 * vs24, vs40/vs41 = vs0/vs1 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	addi		AO, AO, 32

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24

	xvmuldp		vs40, vs0, vs25
	xvmuldp		vs41, vs1, vs25
.endm
2404
.macro KERNEL2x4_SUB1
	/*
	 * Self-contained accumulating step: load vs0/vs1 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then
	 * vs32/vs33 += vs0/vs1 * vs24 and vs40/vs41 += vs0/vs1 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	addi		AO, AO, 32

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24

	xvmaddadp	vs40, vs0, vs25
	xvmaddadp	vs41, vs1, vs25
.endm
2424
.macro SAVE2x4
	/*
	 * Store the accumulators vs32/vs33 and then vs40/vs41 through T1,
	 * which starts at CO and is stepped by LDC between the two groups.
	 * Without TRMMKERNEL the destination data is loaded first and
	 * accumulator * alpha_r is added to it; with TRMMKERNEL only the
	 * product is written. CO finally advances by 32 bytes.
	 * vs0/vs1 and vs8/vs9 are used as scratch.
	 */
	mr		T1, CO

	/* first group: vs32/vs33 */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	lxvd2x		vs1, o16, T1

	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
	xvmuldp		vs1, vs33, alpha_r
#endif

	stxvd2x		vs0, 0, T1
	stxvd2x		vs1, o16, T1

	add		T1, T1, LDC

	/* second group: vs40/vs41 */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	lxvd2x		vs9, o16, T1

	xvmaddadp	vs8, vs40, alpha_r
	xvmaddadp	vs9, vs41, alpha_r
#else
	xvmuldp		vs8, vs40, alpha_r
	xvmuldp		vs9, vs41, alpha_r
#endif

	stxvd2x		vs8, 0, T1
	stxvd2x		vs9, o16, T1

	addi		CO, CO, 32
.endm
2466
2467/*********************************************************************
2468* Macros for N=2, M=2                                                *
2469*********************************************************************/
2470
.macro LOAD2x2_1
	/*
	 * Load-only step for the N=2, M=2 macros: splat the doublewords at
	 * BO+0/o8 into vs24/vs25 and load one 16-byte vector from AO into
	 * vs0. Both BO and AO advance by 16 bytes.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16
.endm
2482
.macro KERNEL2x2_I1
	/*
	 * Fetch the next operands into the alternate registers (vs8 from
	 * AO, vs28/vs29 splatted from BO) and overwrite the accumulators:
	 * vs32 = vs0 * vs24, vs40 = vs0 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	addi		AO, AO, 16

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs40, vs0, vs25
.endm
2499
.macro KERNEL2x2_1
	/*
	 * Load the next operands into vs8 (from AO) and vs28/vs29 (splats
	 * from BO), then accumulate the current ones:
	 * vs32 += vs0 * vs24, vs40 += vs0 * vs25.
	 */
	lxvdsx		vs28, 0, BO
	lxvdsx		vs29, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs8, 0, AO
	addi		AO, AO, 16

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs40, vs0, vs25
.endm
2516
.macro KERNEL2x2_2
	/*
	 * Counterpart of KERNEL2x2_1 with the register sets swapped: reload
	 * vs0 (from AO) and vs24/vs25 (splats from BO), then
	 * vs32 += vs8 * vs28, vs40 += vs8 * vs29.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs40, vs8, vs29
.endm
2533
.macro KERNEL2x2_E2
	/*
	 * Arithmetic-only step: vs32 += vs8 * vs28 and vs40 += vs8 * vs29.
	 * No loads, no pointer updates.
	 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs40, vs8, vs29
.endm
2542
.macro KERNEL2x2_SUBI1
	/*
	 * Self-contained initialising step: load vs0 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then overwrite the
	 * accumulators: vs32 = vs0 * vs24, vs40 = vs0 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs40, vs0, vs25
.endm
2559
.macro KERNEL2x2_SUB1
	/*
	 * Self-contained accumulating step: load vs0 from AO and splat
	 * vs24/vs25 from BO, advance both pointers, then
	 * vs32 += vs0 * vs24 and vs40 += vs0 * vs25.
	 */
	lxvdsx		vs24, 0, BO
	lxvdsx		vs25, o8, BO
	addi		BO, BO, 16

	lxvd2x		vs0, 0, AO
	addi		AO, AO, 16

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs40, vs0, vs25
.endm
2576
.macro SAVE2x2
	/*
	 * Store the accumulators vs32 and then vs40 through T1, which
	 * starts at CO and is stepped by LDC between the two stores.
	 * Without TRMMKERNEL the 16 bytes at the destination are loaded
	 * first and accumulator * alpha_r is added to them; with TRMMKERNEL
	 * only the product is written. CO finally advances by 16 bytes.
	 * vs0 and vs8 are used as scratch.
	 */
	mr		T1, CO

	/* vs32 */
#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	xvmaddadp	vs0, vs32, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
#endif
	stxvd2x		vs0, 0, T1
	add		T1, T1, LDC

	/* vs40 */
#ifndef TRMMKERNEL
	lxvd2x		vs8, 0, T1
	xvmaddadp	vs8, vs40, alpha_r
#else
	xvmuldp		vs8, vs40, alpha_r
#endif
	stxvd2x		vs8, 0, T1

	addi		CO, CO, 16
.endm
2610
2611/*********************************************************************
2612* Macros for N=2, M=1                                                *
2613*********************************************************************/
2614
.macro LOAD2x1_1
	/*
	 * Load-only step for the N=2, M=1 macros (scalar forms): the
	 * doublewords at BO+0/o8 go to vs24/vs25 and one doubleword from AO
	 * goes to vs0. BO advances by 16 bytes, AO by 8 bytes.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	addi		BO, BO, 16

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8
.endm
2626
.macro KERNEL2x1_I1
	/*
	 * Fetch the next scalar operands into the alternate registers (vs8
	 * from AO, vs28/vs29 from BO) and overwrite the accumulators:
	 * vs32 = vs0 * vs24, vs40 = vs0 * vs25.
	 */
	lxsdx		vs28, 0, BO
	lxsdx		vs29, o8, BO
	addi		BO, BO, 16

	lxsdx		vs8, 0, AO
	addi		AO, AO, 8

	xsmuldp		vs32, vs0, vs24
	xsmuldp		vs40, vs0, vs25
.endm
2643
.macro KERNEL2x1_1
	/*
	 * Load the next scalar operands into vs8 (from AO) and vs28/vs29
	 * (from BO), then accumulate the current ones:
	 * vs32 += vs0 * vs24, vs40 += vs0 * vs25.
	 */
	lxsdx		vs28, 0, BO
	lxsdx		vs29, o8, BO
	addi		BO, BO, 16

	lxsdx		vs8, 0, AO
	addi		AO, AO, 8

	xsmaddadp	vs32, vs0, vs24
	xsmaddadp	vs40, vs0, vs25
.endm
2660
.macro KERNEL2x1_2
	/*
	 * Counterpart of KERNEL2x1_1 with the register sets swapped: reload
	 * vs0 (from AO) and vs24/vs25 (from BO), then
	 * vs32 += vs8 * vs28, vs40 += vs8 * vs29.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	addi		BO, BO, 16

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8

	xsmaddadp	vs32, vs8, vs28
	xsmaddadp	vs40, vs8, vs29
.endm
2677
.macro KERNEL2x1_E2
	/*
	 * Arithmetic-only scalar step: vs32 += vs8 * vs28 and
	 * vs40 += vs8 * vs29. No loads, no pointer updates.
	 */
	xsmaddadp	vs32, vs8, vs28
	xsmaddadp	vs40, vs8, vs29
.endm
2686
.macro KERNEL2x1_SUBI1
	/*
	 * Self-contained initialising scalar step: load vs0 from AO and
	 * vs24/vs25 from BO, advance both pointers, then overwrite the
	 * accumulators: vs32 = vs0 * vs24, vs40 = vs0 * vs25.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	addi		BO, BO, 16

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8

	xsmuldp		vs32, vs0, vs24
	xsmuldp		vs40, vs0, vs25
.endm
2703
.macro KERNEL2x1_SUB1
	/*
	 * Self-contained accumulating scalar step: load vs0 from AO and
	 * vs24/vs25 from BO, advance both pointers, then
	 * vs32 += vs0 * vs24 and vs40 += vs0 * vs25.
	 */
	lxsdx		vs24, 0, BO
	lxsdx		vs25, o8, BO
	addi		BO, BO, 16

	lxsdx		vs0, 0, AO
	addi		AO, AO, 8

	xsmaddadp	vs32, vs0, vs24
	xsmaddadp	vs40, vs0, vs25
.endm
2720
.macro SAVE2x1
	/*
	 * Scalar store of the accumulators vs32 and then vs40 through T1,
	 * which starts at CO and is stepped by LDC between the two stores.
	 * Without TRMMKERNEL the doubleword at the destination is loaded
	 * and accumulator * alpha_r is added to it; with TRMMKERNEL only
	 * the product is written. CO finally advances by 8 bytes.
	 * vs0 and vs8 are used as scratch.
	 */
	mr		T1, CO

	/* vs32 */
#ifndef TRMMKERNEL
	lxsdx		vs0, 0, T1
	xsmaddadp	vs0, vs32, alpha_r
#else
	xsmuldp		vs0, vs32, alpha_r
#endif
	stxsdx		vs0, 0, T1
	add		T1, T1, LDC

	/* vs40 */
#ifndef TRMMKERNEL
	lxsdx		vs8, 0, T1
	xsmaddadp	vs8, vs40, alpha_r
#else
	xsmuldp		vs8, vs40, alpha_r
#endif
	stxsdx		vs8, 0, T1

	addi		CO, CO, 8
.endm
2754
2755/*********************************************************************
2756* Macros for N=1, M=16                                               *
2757*********************************************************************/
2758
.macro LOAD1x16_1
	/*
	 * Load-only step for the N=1, M=16 macros: splat the doubleword at
	 * BO into vs24 and load eight 16-byte vectors from AO into vs0-vs7.
	 * BO advances by 8 bytes; AO advances by 64 bytes after each group
	 * of four vectors (128 bytes in total).
	 */
	lxvdsx		vs24, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64
.endm
2779
.macro KERNEL1x16_I1
	/*
	 * Fetch the next operands into the alternate registers (vs8-vs15
	 * from AO, vs28 splatted from BO) and overwrite the accumulators:
	 * vs32-vs39 = vs0-vs7 * vs24.
	 */
	lxvdsx		vs28, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs12, 0, AO
	lxvd2x		vs13, o16, AO
	lxvd2x		vs14, o32, AO
	lxvd2x		vs15, o48, AO
	addi		AO, AO, 64

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
	xvmuldp		vs36, vs4, vs24
	xvmuldp		vs37, vs5, vs24
	xvmuldp		vs38, vs6, vs24
	xvmuldp		vs39, vs7, vs24
.endm
2810
.macro KERNEL1x16_1
	/*
	 * Load the next operands into vs8-vs15 (from AO) and vs28 (splat
	 * from BO), then accumulate the current ones:
	 * vs32-vs39 += vs0-vs7 * vs24.
	 */
	lxvdsx		vs28, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs12, 0, AO
	lxvd2x		vs13, o16, AO
	lxvd2x		vs14, o32, AO
	lxvd2x		vs15, o48, AO
	addi		AO, AO, 64

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
	xvmaddadp	vs36, vs4, vs24
	xvmaddadp	vs37, vs5, vs24
	xvmaddadp	vs38, vs6, vs24
	xvmaddadp	vs39, vs7, vs24
.endm
2841
.macro KERNEL1x16_2
	/*
	 * Counterpart of KERNEL1x16_1 with the register sets swapped:
	 * reload vs0-vs7 (from AO) and vs24 (splat from BO), then
	 * vs32-vs39 += vs8-vs15 * vs28.
	 */
	lxvdsx		vs24, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64

	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28
	xvmaddadp	vs36, vs12, vs28
	xvmaddadp	vs37, vs13, vs28
	xvmaddadp	vs38, vs14, vs28
	xvmaddadp	vs39, vs15, vs28
.endm
2872
.macro KERNEL1x16_E2
	/*
	 * Arithmetic-only step: vs32-vs39 += vs8-vs15 * vs28.
	 * No loads, no pointer updates.
	 */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28
	xvmaddadp	vs36, vs12, vs28
	xvmaddadp	vs37, vs13, vs28
	xvmaddadp	vs38, vs14, vs28
	xvmaddadp	vs39, vs15, vs28
.endm
2886
.macro KERNEL1x16_SUBI1
	/*
	 * Self-contained initialising step: load vs0-vs7 from AO and splat
	 * vs24 from BO, advance both pointers, then overwrite the
	 * accumulators: vs32-vs39 = vs0-vs7 * vs24.
	 */
	lxvdsx		vs24, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
	xvmuldp		vs36, vs4, vs24
	xvmuldp		vs37, vs5, vs24
	xvmuldp		vs38, vs6, vs24
	xvmuldp		vs39, vs7, vs24
.endm
2917
.macro KERNEL1x16_SUB1
	/*
	 * Self-contained accumulating step: load vs0-vs7 from AO and splat
	 * vs24 from BO, advance both pointers, then
	 * vs32-vs39 += vs0-vs7 * vs24.
	 */
	lxvdsx		vs24, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64

	lxvd2x		vs4, 0, AO
	lxvd2x		vs5, o16, AO
	lxvd2x		vs6, o32, AO
	lxvd2x		vs7, o48, AO
	addi		AO, AO, 64

	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24
	xvmaddadp	vs36, vs4, vs24
	xvmaddadp	vs37, vs5, vs24
	xvmaddadp	vs38, vs6, vs24
	xvmaddadp	vs39, vs7, vs24
.endm
2948
.macro SAVE1x16
	/*
	 * Store the accumulators vs32-vs39 through the pointer pair
	 * T1 (= CO) and T2 (= T1 + 64). Without TRMMKERNEL the destination
	 * data is loaded first and accumulator * alpha_r is added to it;
	 * with TRMMKERNEL only the product is written. CO finally advances
	 * by 128 bytes. vs0-vs7 are used as scratch.
	 */
	mr		T1, CO
	addi		T2, T1, 64

#ifndef TRMMKERNEL
	lxvd2x		vs0, 0, T1
	lxvd2x		vs1, o16, T1
	lxvd2x		vs2, o32, T1
	lxvd2x		vs3, o48, T1

	lxvd2x		vs4, 0, T2
	lxvd2x		vs5, o16, T2
	lxvd2x		vs6, o32, T2
	lxvd2x		vs7, o48, T2

	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
	xvmaddadp	vs2, vs34, alpha_r
	xvmaddadp	vs3, vs35, alpha_r
	xvmaddadp	vs4, vs36, alpha_r
	xvmaddadp	vs5, vs37, alpha_r
	xvmaddadp	vs6, vs38, alpha_r
	xvmaddadp	vs7, vs39, alpha_r
#else
	xvmuldp		vs0, vs32, alpha_r
	xvmuldp		vs1, vs33, alpha_r
	xvmuldp		vs2, vs34, alpha_r
	xvmuldp		vs3, vs35, alpha_r
	xvmuldp		vs4, vs36, alpha_r
	xvmuldp		vs5, vs37, alpha_r
	xvmuldp		vs6, vs38, alpha_r
	xvmuldp		vs7, vs39, alpha_r
#endif

	stxvd2x		vs0, 0, T1
	stxvd2x		vs1, o16, T1
	stxvd2x		vs2, o32, T1
	stxvd2x		vs3, o48, T1

	stxvd2x		vs4, 0, T2
	stxvd2x		vs5, o16, T2
	stxvd2x		vs6, o32, T2
	stxvd2x		vs7, o48, T2

	addi		CO, CO, 128
.endm
2999
3000/*********************************************************************
* Macros for N=1, M=8                                                *
3002*********************************************************************/
3003
.macro LOAD1x8_1
	/*
	 * Load-only step for the 1x8 macros: splat the doubleword at BO
	 * into vs24 and load four 16-byte vectors from AO into vs0-vs3.
	 * BO advances by 8 bytes, AO by 64 bytes.
	 */
	lxvdsx		vs24, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs0, 0, AO
	lxvd2x		vs1, o16, AO
	lxvd2x		vs2, o32, AO
	lxvd2x		vs3, o48, AO
	addi		AO, AO, 64
.endm
3017
.macro KERNEL1x8_I1
	/*
	 * Fetch the next operands into the alternate registers (vs8-vs11
	 * from AO, vs28 splatted from BO) and overwrite the accumulators:
	 * vs32-vs35 = vs0-vs3 * vs24.
	 */
	lxvdsx		vs28, 0, BO
	addi		BO, BO, 8

	lxvd2x		vs8, 0, AO
	lxvd2x		vs9, o16, AO
	lxvd2x		vs10, o32, AO
	lxvd2x		vs11, o48, AO
	addi		AO, AO, 64

	xvmuldp		vs32, vs0, vs24
	xvmuldp		vs33, vs1, vs24
	xvmuldp		vs34, vs2, vs24
	xvmuldp		vs35, vs3, vs24
.endm
3037
/*
 * KERNEL1x8_1: first half of the alternating 1x8 step.
 * Loads the alternate operand set (A into vs8-vs11, B splat into vs28),
 * advances AO by 64 and BO by 8 bytes, and accumulates the product of
 * vs0-vs3 with vs24 into vs32-vs35.
 */
.macro KERNEL1x8_1

	/* fetch the alternate A/B set */
	lxvd2x	vs8, 0, AO
	lxvd2x	vs9, o16, AO
	lxvd2x	vs10, o32, AO
	lxvd2x	vs11, o48, AO

	lxvdsx	vs28, 0, BO

	addi	AO, AO, 64
	addi	BO, BO, 8

	/* acc += A * B using the vs0-vs3 / vs24 set */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24

.endm
3057
/*
 * KERNEL1x8_2: second half of the alternating 1x8 step.
 * Loads the primary operand set (A into vs0-vs3, B splat into vs24),
 * advances AO by 64 and BO by 8 bytes, and accumulates the product of
 * vs8-vs11 with vs28 into vs32-vs35.
 */
.macro KERNEL1x8_2

	/* fetch the primary A/B set */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO
	lxvd2x	vs2, o32, AO
	lxvd2x	vs3, o48, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 64
	addi	BO, BO, 8

	/* acc += A * B using the vs8-vs11 / vs28 set */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28

.endm
3077
/*
 * KERNEL1x8_E2: closing step of the 1x8 kernel.
 * Accumulates the product of vs8-vs11 with vs28 into vs32-vs35.
 * Issues no loads and leaves AO and BO untouched.
 */
.macro KERNEL1x8_E2

	/* acc += A * B using the vs8-vs11 / vs28 set */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28
	xvmaddadp	vs34, vs10, vs28
	xvmaddadp	vs35, vs11, vs28

.endm
3087
/*
 * KERNEL1x8_SUBI1: self-contained initialising step of the 1x8 kernel.
 * Loads A into vs0-vs3 and splats B into vs24, advances AO by 64 and
 * BO by 8 bytes, then overwrites vs32-vs35 with the product of the
 * values just loaded.
 */
.macro KERNEL1x8_SUBI1

	/* fetch A and B */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO
	lxvd2x	vs2, o32, AO
	lxvd2x	vs3, o48, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 64
	addi	BO, BO, 8

	/* acc = A * B */
	xvmuldp	vs32, vs0, vs24
	xvmuldp	vs33, vs1, vs24
	xvmuldp	vs34, vs2, vs24
	xvmuldp	vs35, vs3, vs24

.endm
3107
/*
 * KERNEL1x8_SUB1: self-contained accumulating step of the 1x8 kernel.
 * Loads A into vs0-vs3 and splats B into vs24, advances AO by 64 and
 * BO by 8 bytes, then adds the product of the values just loaded to
 * vs32-vs35.
 */
.macro KERNEL1x8_SUB1

	/* fetch A and B */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO
	lxvd2x	vs2, o32, AO
	lxvd2x	vs3, o48, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 64
	addi	BO, BO, 8

	/* acc += A * B */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24
	xvmaddadp	vs34, vs2, vs24
	xvmaddadp	vs35, vs3, vs24

.endm
3127
/*
 * SAVE1x8: store the four accumulators vs32-vs35 to the 64 bytes
 * starting at CO, scaled by alpha_r, then advance CO by 64 bytes.
 *   TRMMKERNEL undefined: C = C + acc * alpha_r
 *   TRMMKERNEL defined  : C =     acc * alpha_r
 * Clobbers T1 and vs0-vs3.
 */
.macro SAVE1x8

	mr	T1, CO

#ifndef TRMMKERNEL
	/* read the current contents of C */
	lxvd2x	vs0, 0, T1
	lxvd2x	vs1, o16, T1
	lxvd2x	vs2, o32, T1
	lxvd2x	vs3, o48, T1

	/* vsN = acc * alpha_r + vsN */
	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
	xvmaddadp	vs2, vs34, alpha_r
	xvmaddadp	vs3, vs35, alpha_r
#else
	/* C is not read: vsN = acc * alpha_r */
	xvmuldp	vs0, vs32, alpha_r
	xvmuldp	vs1, vs33, alpha_r
	xvmuldp	vs2, vs34, alpha_r
	xvmuldp	vs3, vs35, alpha_r
#endif

	/* write the results back */
	stxvd2x	vs0, 0, T1
	stxvd2x	vs1, o16, T1
	stxvd2x	vs2, o32, T1
	stxvd2x	vs3, o48, T1

	addi	CO, CO, 64

.endm
3159
3160/*********************************************************************
3161* Macros for N=1, M=4                                                *
3162*********************************************************************/
3163
/*
 * LOAD1x4_1: preload one operand set for the 1x4 kernel.
 * Reads two 16-byte vectors from AO into vs0-vs1 and splats one
 * doubleword from BO into vs24, then advances AO by 32 bytes and
 * BO by 8 bytes. Performs no arithmetic.
 */
.macro LOAD1x4_1

	/* A operands */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO

	/* B operand, replicated into both doubleword lanes */
	lxvdsx	vs24, 0, BO

	addi	AO, AO, 32
	addi	BO, BO, 8

.endm
3175
/*
 * KERNEL1x4_I1: initialising step of the 1x4 kernel.
 * Loads the alternate operand set (A into vs8-vs9, B splat into vs28),
 * advances AO by 32 and BO by 8 bytes, then overwrites the accumulators
 * vs32-vs33 with the product of the set already held in vs0-vs1 / vs24.
 */
.macro KERNEL1x4_I1

	/* fetch the alternate A/B set */
	lxvd2x	vs8, 0, AO
	lxvd2x	vs9, o16, AO

	lxvdsx	vs28, 0, BO

	addi	AO, AO, 32
	addi	BO, BO, 8

	/* acc = A * B (plain multiply, previous accumulator contents ignored) */
	xvmuldp	vs32, vs0, vs24
	xvmuldp	vs33, vs1, vs24

.endm
3191
/*
 * KERNEL1x4_1: first half of the alternating 1x4 step.
 * Loads the alternate operand set (A into vs8-vs9, B splat into vs28),
 * advances AO by 32 and BO by 8 bytes, and accumulates the product of
 * vs0-vs1 with vs24 into vs32-vs33.
 */
.macro KERNEL1x4_1

	/* fetch the alternate A/B set */
	lxvd2x	vs8, 0, AO
	lxvd2x	vs9, o16, AO

	lxvdsx	vs28, 0, BO

	addi	AO, AO, 32
	addi	BO, BO, 8

	/* acc += A * B using the vs0-vs1 / vs24 set */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24

.endm
3207
/*
 * KERNEL1x4_2: second half of the alternating 1x4 step.
 * Loads the primary operand set (A into vs0-vs1, B splat into vs24),
 * advances AO by 32 and BO by 8 bytes, and accumulates the product of
 * vs8-vs9 with vs28 into vs32-vs33.
 */
.macro KERNEL1x4_2

	/* fetch the primary A/B set */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 32
	addi	BO, BO, 8

	/* acc += A * B using the vs8-vs9 / vs28 set */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28

.endm
3223
/*
 * KERNEL1x4_E2: closing step of the 1x4 kernel.
 * Accumulates the product of vs8-vs9 with vs28 into vs32-vs33.
 * Issues no loads and leaves AO and BO untouched.
 */
.macro KERNEL1x4_E2

	/* acc += A * B using the vs8-vs9 / vs28 set */
	xvmaddadp	vs32, vs8, vs28
	xvmaddadp	vs33, vs9, vs28

.endm
3231
/*
 * KERNEL1x4_SUBI1: self-contained initialising step of the 1x4 kernel.
 * Loads A into vs0-vs1 and splats B into vs24, advances AO by 32 and
 * BO by 8 bytes, then overwrites vs32-vs33 with the product of the
 * values just loaded.
 */
.macro KERNEL1x4_SUBI1

	/* fetch A and B */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 32
	addi	BO, BO, 8

	/* acc = A * B */
	xvmuldp	vs32, vs0, vs24
	xvmuldp	vs33, vs1, vs24

.endm
3247
/*
 * KERNEL1x4_SUB1: self-contained accumulating step of the 1x4 kernel.
 * Loads A into vs0-vs1 and splats B into vs24, advances AO by 32 and
 * BO by 8 bytes, then adds the product of the values just loaded to
 * vs32-vs33.
 */
.macro KERNEL1x4_SUB1

	/* fetch A and B */
	lxvd2x	vs0, 0, AO
	lxvd2x	vs1, o16, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 32
	addi	BO, BO, 8

	/* acc += A * B */
	xvmaddadp	vs32, vs0, vs24
	xvmaddadp	vs33, vs1, vs24

.endm
3263
/*
 * SAVE1x4: store the two accumulators vs32-vs33 to the 32 bytes
 * starting at CO, scaled by alpha_r, then advance CO by 32 bytes.
 *   TRMMKERNEL undefined: C = C + acc * alpha_r
 *   TRMMKERNEL defined  : C =     acc * alpha_r
 * Clobbers T1 and vs0-vs1.
 */
.macro SAVE1x4

	mr	T1, CO

#ifndef TRMMKERNEL
	/* read the current contents of C */
	lxvd2x	vs0, 0, T1
	lxvd2x	vs1, o16, T1

	/* vsN = acc * alpha_r + vsN */
	xvmaddadp	vs0, vs32, alpha_r
	xvmaddadp	vs1, vs33, alpha_r
#else
	/* C is not read: vsN = acc * alpha_r */
	xvmuldp	vs0, vs32, alpha_r
	xvmuldp	vs1, vs33, alpha_r
#endif

	/* write the results back */
	stxvd2x	vs0, 0, T1
	stxvd2x	vs1, o16, T1

	addi	CO, CO, 32

.endm
3287
3288/*********************************************************************
3289* Macros for N=1, M=2                                                *
3290*********************************************************************/
3291
/*
 * LOAD1x2_1: preload one operand set for the 1x2 kernel.
 * Reads one 16-byte vector from AO into vs0 and splats one doubleword
 * from BO into vs24, then advances AO by 16 bytes and BO by 8 bytes.
 * Performs no arithmetic.
 */
.macro LOAD1x2_1

	lxvd2x	vs0, 0, AO		/* A operand */

	lxvdsx	vs24, 0, BO		/* B operand, replicated into both lanes */

	addi	AO, AO, 16
	addi	BO, BO, 8

.endm
3302
/*
 * KERNEL1x2_I1: initialising step of the 1x2 kernel.
 * Loads the alternate operand set (A into vs8, B splat into vs28),
 * advances AO by 16 and BO by 8 bytes, then overwrites the accumulator
 * vs32 with the product of the set already held in vs0 / vs24.
 */
.macro KERNEL1x2_I1

	/* fetch the alternate A/B set */
	lxvd2x	vs8, 0, AO

	lxvdsx	vs28, 0, BO

	addi	AO, AO, 16
	addi	BO, BO, 8

	/* acc = A * B (plain multiply, previous accumulator contents ignored) */
	xvmuldp	vs32, vs0, vs24

.endm
3316
/*
 * KERNEL1x2_1: first half of the alternating 1x2 step.
 * Loads the alternate operand set (A into vs8, B splat into vs28),
 * advances AO by 16 and BO by 8 bytes, and accumulates the product of
 * vs0 with vs24 into vs32.
 */
.macro KERNEL1x2_1

	/* fetch the alternate A/B set */
	lxvd2x	vs8, 0, AO

	lxvdsx	vs28, 0, BO

	addi	AO, AO, 16
	addi	BO, BO, 8

	/* acc += A * B using the vs0 / vs24 set */
	xvmaddadp	vs32, vs0, vs24

.endm
3330
/*
 * KERNEL1x2_2: second half of the alternating 1x2 step.
 * Loads the primary operand set (A into vs0, B splat into vs24),
 * advances AO by 16 and BO by 8 bytes, and accumulates the product of
 * vs8 with vs28 into vs32.
 */
.macro KERNEL1x2_2

	/* fetch the primary A/B set */
	lxvd2x	vs0, 0, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 16
	addi	BO, BO, 8

	/* acc += A * B using the vs8 / vs28 set */
	xvmaddadp	vs32, vs8, vs28

.endm
3344
/*
 * KERNEL1x2_E2: closing step of the 1x2 kernel.
 * Accumulates the product of vs8 with vs28 into vs32.
 * Issues no loads and leaves AO and BO untouched.
 */
.macro KERNEL1x2_E2

	/* acc += A * B using the vs8 / vs28 set */
	xvmaddadp	vs32, vs8, vs28

.endm
3351
/*
 * KERNEL1x2_SUBI1: self-contained initialising step of the 1x2 kernel.
 * Loads A into vs0 and splats B into vs24, advances AO by 16 and BO by
 * 8 bytes, then overwrites vs32 with the product of the values just
 * loaded.
 */
.macro KERNEL1x2_SUBI1

	/* fetch A and B */
	lxvd2x	vs0, 0, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 16
	addi	BO, BO, 8

	/* acc = A * B */
	xvmuldp	vs32, vs0, vs24

.endm
3365
/*
 * KERNEL1x2_SUB1: self-contained accumulating step of the 1x2 kernel.
 * Loads A into vs0 and splats B into vs24, advances AO by 16 and BO by
 * 8 bytes, then adds the product of the values just loaded to vs32.
 */
.macro KERNEL1x2_SUB1

	/* fetch A and B */
	lxvd2x	vs0, 0, AO

	lxvdsx	vs24, 0, BO

	addi	AO, AO, 16
	addi	BO, BO, 8

	/* acc += A * B */
	xvmaddadp	vs32, vs0, vs24

.endm
3379
/*
 * SAVE1x2: store the accumulator vs32 to the 16 bytes starting at CO,
 * scaled by alpha_r, then advance CO by 16 bytes.
 *   TRMMKERNEL undefined: C = C + acc * alpha_r
 *   TRMMKERNEL defined  : C =     acc * alpha_r
 * Clobbers T1 and vs0.
 */
.macro SAVE1x2

	mr	T1, CO

#ifndef TRMMKERNEL
	lxvd2x	vs0, 0, T1			/* current contents of C       */
	xvmaddadp	vs0, vs32, alpha_r	/* vs0 = acc * alpha_r + vs0 */
#else
	xvmuldp	vs0, vs32, alpha_r		/* vs0 = acc * alpha_r        */
#endif

	stxvd2x	vs0, 0, T1

	addi	CO, CO, 16

.endm
3399
3400/*********************************************************************
3401* Macros for N=1, M=1                                                *
3402*********************************************************************/
3403
/*
 * LOAD1x1_1: preload one operand pair for the scalar 1x1 kernel.
 * Reads one doubleword from AO into vs0 and one from BO into vs24,
 * then advances both pointers by 8 bytes. Performs no arithmetic.
 */
.macro LOAD1x1_1

	lxsdx	vs0, 0, AO		/* A operand */

	lxsdx	vs24, 0, BO		/* B operand */

	addi	AO, AO, 8
	addi	BO, BO, 8

.endm
3414
/*
 * KERNEL1x1_I1: initialising step of the scalar 1x1 kernel.
 * Loads the alternate operand pair (A into vs8, B into vs28), advances
 * AO and BO by 8 bytes each, then overwrites the accumulator vs32 with
 * the scalar product of the pair already held in vs0 / vs24.
 */
.macro KERNEL1x1_I1

	/* fetch the alternate A/B pair */
	lxsdx	vs8, 0, AO

	lxsdx	vs28, 0, BO

	addi	AO, AO, 8
	addi	BO, BO, 8

	/* acc = A * B (plain multiply, previous accumulator contents ignored) */
	xsmuldp	vs32, vs0, vs24

.endm
3428
/*
 * KERNEL1x1_1: first half of the alternating scalar 1x1 step.
 * Loads the alternate operand pair (A into vs8, B into vs28), advances
 * AO and BO by 8 bytes each, and accumulates the scalar product of vs0
 * with vs24 into vs32.
 */
.macro KERNEL1x1_1

	/* fetch the alternate A/B pair */
	lxsdx	vs8, 0, AO

	lxsdx	vs28, 0, BO

	addi	AO, AO, 8
	addi	BO, BO, 8

	/* acc += A * B using the vs0 / vs24 pair */
	xsmaddadp	vs32, vs0, vs24

.endm
3442
/*
 * KERNEL1x1_2: second half of the alternating scalar 1x1 step.
 * Loads the primary operand pair (A into vs0, B into vs24), advances
 * AO and BO by 8 bytes each, and accumulates the scalar product of vs8
 * with vs28 into vs32.
 */
.macro KERNEL1x1_2

	/* fetch the primary A/B pair */
	lxsdx	vs0, 0, AO

	lxsdx	vs24, 0, BO

	addi	AO, AO, 8
	addi	BO, BO, 8

	/* acc += A * B using the vs8 / vs28 pair */
	xsmaddadp	vs32, vs8, vs28

.endm
3456
/*
 * KERNEL1x1_E2: closing step of the scalar 1x1 kernel.
 * Accumulates the scalar product of vs8 with vs28 into vs32.
 * Issues no loads and leaves AO and BO untouched.
 */
.macro KERNEL1x1_E2

	/* acc += A * B using the vs8 / vs28 pair */
	xsmaddadp	vs32, vs8, vs28

.endm
3463
/*
 * KERNEL1x1_SUBI1: self-contained initialising step of the scalar 1x1
 * kernel. Loads A into vs0 and B into vs24, advances AO and BO by
 * 8 bytes each, then overwrites vs32 with their scalar product.
 */
.macro KERNEL1x1_SUBI1

	/* fetch A and B */
	lxsdx	vs0, 0, AO

	lxsdx	vs24, 0, BO

	addi	AO, AO, 8
	addi	BO, BO, 8

	/* acc = A * B */
	xsmuldp	vs32, vs0, vs24

.endm
3477
/*
 * KERNEL1x1_SUB1: self-contained accumulating step of the scalar 1x1
 * kernel. Loads A into vs0 and B into vs24, advances AO and BO by
 * 8 bytes each, then adds their scalar product to vs32.
 */
.macro KERNEL1x1_SUB1

	/* fetch A and B */
	lxsdx	vs0, 0, AO

	lxsdx	vs24, 0, BO

	addi	AO, AO, 8
	addi	BO, BO, 8

	/* acc += A * B */
	xsmaddadp	vs32, vs0, vs24

.endm
3491
/*
 * SAVE1x1: store the scalar accumulator vs32 to the doubleword at CO,
 * scaled by alpha_r, then advance CO by 8 bytes.
 *   TRMMKERNEL undefined: C = C + acc * alpha_r
 *   TRMMKERNEL defined  : C =     acc * alpha_r
 * Clobbers T1 and vs0.
 */
.macro SAVE1x1

	mr	T1, CO

#ifndef TRMMKERNEL
	lxsdx	vs0, 0, T1			/* current contents of C       */
	xsmaddadp	vs0, vs32, alpha_r	/* vs0 = acc * alpha_r + vs0 */
#else
	xsmuldp	vs0, vs32, alpha_r		/* vs0 = acc * alpha_r        */
#endif

	stxsdx	vs0, 0, T1

	addi	CO, CO, 8

.endm
3511
3512
3513
3514
/****************************TRMM POINTER REFRESH MACROS*************************/
3516
/*
 * SHIFT_REG REG1, REG2, SHIFT_VAL
 *
 * Converts an element count into a byte offset for 8-byte elements:
 *     REG1 = REG2 * SHIFT_VAL * 8
 * implemented as a single left shift by log2(SHIFT_VAL) + 3.
 *
 *   REG1      destination register
 *   REG2      source register (the count to scale)
 *   SHIFT_VAL assembly-time constant; must be 1, 2, 4, 8 or 16
 *
 * Any other SHIFT_VAL stops the build with an error. Previously such a
 * value expanded to nothing, leaving REG1 with its old contents for the
 * caller to consume.
 *
 * NOTE(review): slwi is a 32-bit (word) shift, so only the low 32 bits of
 * REG2 take part in the result; confirm callers never pass negative or
 * larger-than-32-bit counts.
 */
.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
	.if \SHIFT_VAL==16
		slwi	\REG1, \REG2, 7		/* * 128 bytes */
	.elseif \SHIFT_VAL==8
		slwi	\REG1, \REG2, 6		/* *  64 bytes */
	.elseif \SHIFT_VAL==4
		slwi	\REG1, \REG2, 5		/* *  32 bytes */
	.elseif \SHIFT_VAL==2
		slwi	\REG1, \REG2, 4		/* *  16 bytes */
	.elseif \SHIFT_VAL==1
		slwi	\REG1, \REG2, 3		/* *   8 bytes */
	.else
		.error "SHIFT_REG: SHIFT_VAL must be 1, 2, 4, 8 or 16"
	.endif
.endm
3530
3531/*
3532//#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
3533// 		ptrbb = bb;
3534// #else
3535// 		ptrba += off*16;
3536// 		ptrbb = bb + off*2;
3537// #endif
3538*/
/*
 * REFRESH_POINTERS PTR_A, PTR_B, OFF_VAL, B_VAL, C_A, C_B
 *
 * Repositions the A and B pointers before a TRMM tile is computed.
 *
 *   LEFT and TRANSA both defined, or both undefined:
 *       PTR_B = B_VAL                       (PTR_A is left unchanged)
 *   otherwise:
 *       PTR_B = B_VAL + OFF_VAL * C_B * 8
 *       PTR_A = PTR_A + OFF_VAL * C_A * 8
 *
 * C_A and C_B are element counts accepted by SHIFT_REG.
 * The second variant clobbers T4 and T2.
 */
.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if defined(LEFT) == defined(TRANSA)
	/* ptrbb = bb */
	mr	\PTR_B, \B_VAL
#else
	/* ptrba += off * C_A ; ptrbb = bb + off * C_B   (in elements) */
	SHIFT_REG T4, \OFF_VAL, \C_B		/* T4 = byte offset into B */
	SHIFT_REG T2, \OFF_VAL, \C_A		/* T2 = byte offset into A */
	add	\PTR_B, \B_VAL, T4
	add	\PTR_A, \PTR_A, T2
#endif
.endm
3555
3556
3557/*
3558// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3559// 		temp = bk-off;
3560// #elif defined(LEFT)
3561// 		temp = off+16;	// number of values in A
3562// #else
3563// 		temp = off+2;	// number of values in B
3564// #endif
3565*/
/*
 * REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
 *
 * Computes the inner-loop trip count for a TRMM tile into TEMP_BK.
 *
 *   exactly one of LEFT / TRANSA defined:  TEMP_BK = BK_VAL  - OFF_VAL
 *   both defined (LEFT is set):            TEMP_BK = OFF_VAL + INCR_A
 *   neither defined:                       TEMP_BK = OFF_VAL + INCR_B
 *
 * INCR_A and INCR_B are immediates (number of values in A and in B).
 */
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if defined(LEFT) != defined(TRANSA)
	/* temp = bk - off */
	sub	\TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
	/* temp = off + INCR_A   (number of values in A) */
	addi	\TEMP_BK, \OFF_VAL, \INCR_A
#else
	/* temp = off + INCR_B   (number of values in B) */
	addi	\TEMP_BK, \OFF_VAL, \INCR_B
#endif
.endm
3580/*
3581// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
3582// 		temp = bk - off;
3583// #ifdef LEFT
3584// 		temp -= 16; // number of values in A
3585// #else
3586// 		temp -= 2; // number of values in B
3587// #endif
3588// 		ptrba += temp*16;
3589// 		ptrbb += temp*2;
3590// #endif
3591
3592// #ifdef LEFT
3593// 		off += 16; // number of values in A
3594// #endif
3595*/
3596
3597
/*
 * REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL, PTR_B, PTR_A, C_A, C_B
 *
 * Pointer and offset bookkeeping after a TRMM tile has been stored.
 *
 *   LEFT and TRANSA both defined, or both undefined:
 *       TEMP_BK = BK_VAL - OFF_VAL - (C_A if LEFT is defined, else C_B)
 *       PTR_A  += TEMP_BK * C_A * 8
 *       PTR_B  += TEMP_BK * C_B * 8
 *       (clobbers T4 and T2)
 *   LEFT defined (independent of the above):
 *       OFF_VAL += C_A
 *
 * C_A and C_B are the tile element counts for A and B; they must be
 * values accepted by SHIFT_REG and usable as addi immediates.
 */
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B

#if defined(LEFT) == defined(TRANSA)
	/* temp = bk - off */
	sub	\TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
	/* temp -= C_A   (number of values in A) */
	addi	\TEMP_BK, \TEMP_BK, -\C_A
#else
	/* temp -= C_B   (number of values in B) */
	addi	\TEMP_BK, \TEMP_BK, -\C_B
#endif
	/* ptrba += temp * C_A ; ptrbb += temp * C_B   (in elements) */
	SHIFT_REG T4, \TEMP_BK, \C_A
	SHIFT_REG T2, \TEMP_BK, \C_B
	add	\PTR_A, \PTR_A, T4
	add	\PTR_B, \PTR_B, T2
#endif

#ifdef LEFT
	/* off += C_A   (number of values in A) */
	addi	\OFF_VAL, \OFF_VAL, \C_A
#endif
.endm