1/***************************************************************************
2Copyright (c) 2013-2016, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*	 LAPACK-TEST		: OK
34**************************************************************************************/
35
// Sign selection for the final reduction in the SAVE macros.
// Each accumulator pair holds the partial products realA*realB,
// imagA*realB, realA*imagB and imagA*imagB. Which cross terms are
// added and which are subtracted depends on whether A and/or B is
// conjugated, so the SAVE code uses these aliases instead of fixed
// add/sub instructions:
//   XSFADD_R1/R2 combine realA*realB and imagA*imagB into the real part
//   XSFADD_I1/I2 combine the two cross products into the imaginary part
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)

	// no conjugation: real = rr - ii, imag = sum of cross terms
	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xssubdp
	#define	XSFADD_I1	xsadddp
	#define	XSFADD_I2	xsadddp

#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)

	// conj(A): real = rr + ii, one cross term negated
	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xsadddp
	#define	XSFADD_I1	xssubdp
	#define	XSFADD_I2	xsadddp

#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)

	// conj(B): real = rr + ii, the other cross term negated
	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xsadddp
	#define	XSFADD_I1	xsadddp
	#define	XSFADD_I2	xssubdp

#else		// CC || CR || RC || RR

	// conj(A)*conj(B): real = rr - ii, both cross terms negated
	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xssubdp
	#define	XSFADD_I1	xssubdp
	#define	XSFADD_I2	xssubdp

#endif
65
66/**********************************************************************************************
67* Macros for N=2 and M=8
68**********************************************************************************************/
69
#if defined(_AIX)
define(`LOAD2x8_1', `
#else
.macro LOAD2x8_1
#endif

// Prologue of the software-pipelined N=2, M=8 loop: preload the first
// 8 complex double elements of A into vs0-vs7 and the current B values
// into vs16-vs19 (per the line comments: vs16/vs18 real, vs17/vs19 imag).
// Advances AO by 128 bytes (2 x 64) and BO by 64 bytes. No arithmetic.

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64


#if defined(_AIX)
')
#else
.endm
#endif
103
#if defined(_AIX)
define(`KERNEL2x8_I1', `
#else
.macro KERNEL2x8_I1
#endif

// First pipelined iteration: initialize the 32 accumulators vs32-vs63
// with xvmuldp (no accumulate) from the A/B values preloaded by
// LOAD2x8_1 (vs0-vs7 and vs16-vs19), while prefetching the next 8
// elements of A into vs8-vs15 and the next B values into vs20-vs23.
// vs32-vs47 accumulate against B column pair vs16/vs17, vs48-vs63
// against vs18/vs19. Advances AO by 128 bytes and BO by 64 bytes.

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmuldp		vs48,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs50,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs52,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs54,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmuldp		vs56,	vs4,	vs18		// real*real, imag*real
	xvmuldp		vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmuldp		vs58,	vs5,	vs18		// real*real, imag*real
	xvmuldp		vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmuldp		vs60,	vs6,	vs18		// real*real, imag*real
	xvmuldp		vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmuldp		vs62,	vs7,	vs18		// real*real, imag*real
	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
171
#if defined(_AIX)
define(`KERNEL2x8_1', `
#else
.macro KERNEL2x8_1
#endif

// Even pipelined iteration: accumulate the current operands vs0-vs7
// and vs16-vs19 into vs32-vs63 with xvmaddadp, while prefetching the
// next A values into vs8-vs15 and the next B values into vs20-vs23
// (consumed by KERNEL2x8_2 or KERNEL2x8_E2). Advances AO by 128 bytes
// and BO by 64 bytes.

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmaddadp	vs48,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs50,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs52,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs54,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmaddadp	vs56,	vs4,	vs18		// real*real, imag*real
	xvmaddadp	vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmaddadp	vs58,	vs5,	vs18		// real*real, imag*real
	xvmaddadp	vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmaddadp	vs60,	vs6,	vs18		// real*real, imag*real
	xvmaddadp	vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmaddadp	vs62,	vs7,	vs18		// real*real, imag*real
	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
239
#if defined(_AIX)
define(`KERNEL2x8_2', `
#else
.macro KERNEL2x8_2
#endif

// Odd pipelined iteration, the mirror of KERNEL2x8_1: accumulate the
// previously prefetched operands vs8-vs15 and vs20-vs23 into vs32-vs63,
// while reloading vs0-vs7 and vs16-vs19 for the next even iteration.
// Advances AO by 128 bytes and BO by 64 bytes.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag

	xvmaddadp	vs48,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs49,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs50,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs51,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs52,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs53,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs54,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs55,	vs11,	vs23		// real*imag, imag*imag
	xvmaddadp	vs56,	vs12,	vs22		// real*real, imag*real
	xvmaddadp	vs57,	vs12,	vs23		// real*imag, imag*imag
	xvmaddadp	vs58,	vs13,	vs22		// real*real, imag*real
	xvmaddadp	vs59,	vs13,	vs23		// real*imag, imag*imag
	xvmaddadp	vs60,	vs14,	vs22		// real*real, imag*real
	xvmaddadp	vs61,	vs14,	vs23		// real*imag, imag*imag
	xvmaddadp	vs62,	vs15,	vs22		// real*real, imag*real
	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
307
#if defined(_AIX)
define(`KERNEL2x8_E2', `
#else
.macro KERNEL2x8_E2
#endif

// Pipeline drain: final accumulation of the last prefetched operands
// vs8-vs15 and vs20-vs23 into vs32-vs63. No loads are issued and
// AO/BO are left unchanged, ending the pipelined loop.

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag

	xvmaddadp	vs48,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs49,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs50,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs51,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs52,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs53,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs54,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs55,	vs11,	vs23		// real*imag, imag*imag
	xvmaddadp	vs56,	vs12,	vs22		// real*real, imag*real
	xvmaddadp	vs57,	vs12,	vs23		// real*imag, imag*imag
	xvmaddadp	vs58,	vs13,	vs22		// real*real, imag*real
	xvmaddadp	vs59,	vs13,	vs23		// real*imag, imag*imag
	xvmaddadp	vs60,	vs14,	vs22		// real*real, imag*real
	xvmaddadp	vs61,	vs14,	vs23		// real*imag, imag*imag
	xvmaddadp	vs62,	vs15,	vs22		// real*real, imag*real
	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
355
#if defined(_AIX)
define(`KERNEL2x8_SUBI1', `
#else
.macro KERNEL2x8_SUBI1
#endif

// Stand-alone first iteration (no software pipelining, used for the
// K remainder): load A into vs0-vs7 and B into vs16-vs19, then
// initialize the accumulators vs32-vs63 with xvmuldp. Advances AO by
// 128 bytes and BO by 64 bytes.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmuldp		vs48,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs50,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs52,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs54,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmuldp		vs56,	vs4,	vs18		// real*real, imag*real
	xvmuldp		vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmuldp		vs58,	vs5,	vs18		// real*real, imag*real
	xvmuldp		vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmuldp		vs60,	vs6,	vs18		// real*real, imag*real
	xvmuldp		vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmuldp		vs62,	vs7,	vs18		// real*real, imag*real
	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
423
#if defined(_AIX)
define(`KERNEL2x8_SUB1', `
#else
.macro KERNEL2x8_SUB1
#endif

// Stand-alone iteration (K remainder path): load A into vs0-vs7 and B
// into vs16-vs19 and accumulate into vs32-vs63 with xvmaddadp.
// Advances AO by 128 bytes and BO by 64 bytes.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmaddadp	vs48,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs50,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs52,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs54,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmaddadp	vs56,	vs4,	vs18		// real*real, imag*real
	xvmaddadp	vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmaddadp	vs58,	vs5,	vs18		// real*real, imag*real
	xvmaddadp	vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmaddadp	vs60,	vs6,	vs18		// real*real, imag*real
	xvmaddadp	vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmaddadp	vs62,	vs7,	vs18		// real*real, imag*real
	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
491
492#if defined(_AIX)
493define(`SAVE2x8', `
494#else
495.macro SAVE2x8
496#endif
497
498
499	mr		T1,	CO
500	addi		T2,	T1,	64
501
502#ifndef TRMMKERNEL
503
504	lxvd2x		vs16,	o0,	T1
505	lxvd2x		vs17,	o16,	T1
506	lxvd2x		vs18,	o32,	T1
507	lxvd2x		vs19,	o48,	T1
508	lxvd2x		vs20,	o0,	T2
509	lxvd2x		vs21,	o16,	T2
510	lxvd2x		vs22,	o32,	T2
511	lxvd2x		vs23,	o48,	T2
512
513#endif
514
515
516	xxlxor		vs0,	vs0,	vs0
517	xxlxor		vs1,	vs1,	vs1
518	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
519
520	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
521	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
522
523	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
524	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
525
526	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
527	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
528
529	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
530	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
531	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
532	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
533
534	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
535	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
536	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part
537
538
539
540	xxlxor		vs0,	vs0,	vs0
541	xxlxor		vs1,	vs1,	vs1
542	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
543
544	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
545	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
546
547	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
548	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
549
550	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
551	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
552
553	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
554	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
555	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
556	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
557
558	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
559	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
560	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part
561
562
563
564	xxlxor		vs0,	vs0,	vs0
565	xxlxor		vs1,	vs1,	vs1
566	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
567
568	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
569	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
570
571	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
572	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
573
574	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
575	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
576
577	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
578	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
579	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
580	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
581
582	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
583	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
584	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part
585
586
587
588	xxlxor		vs0,	vs0,	vs0
589	xxlxor		vs1,	vs1,	vs1
590	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
591
592	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
593	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
594
595	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
596	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
597
598	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
599	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
600
601	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
602	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
603	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
604	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
605
606	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
607	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
608	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part
609
610
611
612	xxlxor		vs0,	vs0,	vs0
613	xxlxor		vs1,	vs1,	vs1
614	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
615
616	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
617	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
618
619	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
620	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
621
622	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
623	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
624
625	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
626	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
627	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
628	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
629
630	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
631	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
632	xxpermdi	vs12,	vs2,	vs3,	0	// merge real and imag part
633
634
635
636	xxlxor		vs0,	vs0,	vs0
637	xxlxor		vs1,	vs1,	vs1
638	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
639
640	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
641	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
642
643	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
644	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
645
646	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
647	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
648
649	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
650	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
651	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
652	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
653
654	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
655	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
656	xxpermdi	vs13,	vs2,	vs3,	0	// merge real and imag part
657
658
659
660	xxlxor		vs0,	vs0,	vs0
661	xxlxor		vs1,	vs1,	vs1
662	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
663
664	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
665	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
666
667	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
668	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
669
670	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
671	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
672
673	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
674	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
675	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
676	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
677
678	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
679	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
680	xxpermdi	vs14,	vs2,	vs3,	0	// merge real and imag part
681
682
683
684	xxlxor		vs0,	vs0,	vs0
685	xxlxor		vs1,	vs1,	vs1
686	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
687
688	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
689	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
690
691	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
692	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
693
694	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
695	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
696
697	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
698	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
699	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
700	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
701
702	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
703	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
704	xxpermdi	vs15,	vs2,	vs3,	0	// merge real and imag part
705
706
707#ifndef TRMMKERNEL
708
709	xvadddp		vs8,	vs8,	vs16
710	xvadddp		vs9,	vs9,	vs17
711	xvadddp		vs10,	vs10,	vs18
712	xvadddp		vs11,	vs11,	vs19
713	xvadddp		vs12,	vs12,	vs20
714	xvadddp		vs13,	vs13,	vs21
715	xvadddp		vs14,	vs14,	vs22
716	xvadddp		vs15,	vs15,	vs23
717
718#endif
719
720	stxvd2x		vs8,	o0,	T1
721	stxvd2x		vs9,	o16,	T1
722	stxvd2x		vs10,	o32,	T1
723	stxvd2x		vs11,	o48,	T1
724	stxvd2x		vs12,	o0,	T2
725	stxvd2x		vs13,	o16,	T2
726	stxvd2x		vs14,	o32,	T2
727	stxvd2x		vs15,	o48,	T2
728
729	add		T1,	T1,	LDC
730	add		T2,	T2,	LDC
731
732#ifndef TRMMKERNEL
733
734	lxvd2x		vs16,	o0,	T1
735	lxvd2x		vs17,	o16,	T1
736	lxvd2x		vs18,	o32,	T1
737	lxvd2x		vs19,	o48,	T1
738	lxvd2x		vs20,	o0,	T2
739	lxvd2x		vs21,	o16,	T2
740	lxvd2x		vs22,	o32,	T2
741	lxvd2x		vs23,	o48,	T2
742
743#endif
744
745
746	xxlxor		vs0,	vs0,	vs0
747	xxlxor		vs1,	vs1,	vs1
748	XXSWAPD(vs49,vs49)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
749
750	XSFADD_R1	vs0,	vs0,	vs48		// realA*realB
751	XSFADD_R2	vs0,	vs0,	vs49		// imagA*imagB
752
753	XXSWAPD(vs48,vs48)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
754	XXSWAPD(vs49,vs49)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
755
756	XSFADD_I1	vs1,	vs1,	vs48		// realA*imagB
757	XSFADD_I2	vs1,	vs1,	vs49		// imagA*realB
758
759	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
760	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
761	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
762	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
763
764	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
765	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
766	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part
767
768
769
770	xxlxor		vs0,	vs0,	vs0
771	xxlxor		vs1,	vs1,	vs1
772	XXSWAPD(vs51,vs51)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
773
774	XSFADD_R1	vs0,	vs0,	vs50		// realA*realB
775	XSFADD_R2	vs0,	vs0,	vs51		// imagA*imagB
776
777	XXSWAPD(vs50,vs50)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
778	XXSWAPD(vs51,vs51)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
779
780	XSFADD_I1	vs1,	vs1,	vs50		// realA*imagB
781	XSFADD_I2	vs1,	vs1,	vs51		// imagA*realB
782
783	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
784	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
785	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
786	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
787
788	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
789	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
790	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part
791
792
793
794	xxlxor		vs0,	vs0,	vs0
795	xxlxor		vs1,	vs1,	vs1
796	XXSWAPD(vs53,vs53)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
797
798	XSFADD_R1	vs0,	vs0,	vs52		// realA*realB
799	XSFADD_R2	vs0,	vs0,	vs53		// imagA*imagB
800
801	XXSWAPD(vs52,vs52)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
802	XXSWAPD(vs53,vs53)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
803
804	XSFADD_I1	vs1,	vs1,	vs52		// realA*imagB
805	XSFADD_I2	vs1,	vs1,	vs53		// imagA*realB
806
807	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
808	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
809	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
810	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
811
812	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
813	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
814	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part
815
816
817
818	xxlxor		vs0,	vs0,	vs0
819	xxlxor		vs1,	vs1,	vs1
820	XXSWAPD(vs55,vs55)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
821
822	XSFADD_R1	vs0,	vs0,	vs54		// realA*realB
823	XSFADD_R2	vs0,	vs0,	vs55		// imagA*imagB
824
825	XXSWAPD(vs54,vs54)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
826	XXSWAPD(vs55,vs55)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
827
828	XSFADD_I1	vs1,	vs1,	vs54		// realA*imagB
829	XSFADD_I2	vs1,	vs1,	vs55		// imagA*realB
830
831	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
832	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
833	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
834	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
835
836	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
837	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
838	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part
839
840
841
842	xxlxor		vs0,	vs0,	vs0
843	xxlxor		vs1,	vs1,	vs1
844	XXSWAPD(vs57,vs57)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
845
846	XSFADD_R1	vs0,	vs0,	vs56		// realA*realB
847	XSFADD_R2	vs0,	vs0,	vs57		// imagA*imagB
848
849	XXSWAPD(vs56,vs56)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
850	XXSWAPD(vs57,vs57)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
851
852	XSFADD_I1	vs1,	vs1,	vs56		// realA*imagB
853	XSFADD_I2	vs1,	vs1,	vs57		// imagA*realB
854
855	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
856	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
857	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
858	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
859
860	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
861	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
862	xxpermdi	vs12,	vs2,	vs3,	0	// merge real and imag part
863
864
865
866	xxlxor		vs0,	vs0,	vs0
867	xxlxor		vs1,	vs1,	vs1
868	XXSWAPD(vs59,vs59)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
869
870	XSFADD_R1	vs0,	vs0,	vs58		// realA*realB
871	XSFADD_R2	vs0,	vs0,	vs59		// imagA*imagB
872
873	XXSWAPD(vs58,vs58)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
874	XXSWAPD(vs59,vs59)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
875
876	XSFADD_I1	vs1,	vs1,	vs58		// realA*imagB
877	XSFADD_I2	vs1,	vs1,	vs59		// imagA*realB
878
879	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
880	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
881	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
882	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
883
884	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
885	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
886	xxpermdi	vs13,	vs2,	vs3,	0	// merge real and imag part
887
888
889
890	xxlxor		vs0,	vs0,	vs0
891	xxlxor		vs1,	vs1,	vs1
892	XXSWAPD(vs61,vs61)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
893
894	XSFADD_R1	vs0,	vs0,	vs60		// realA*realB
895	XSFADD_R2	vs0,	vs0,	vs61		// imagA*imagB
896
897	XXSWAPD(vs60,vs60)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
898	XXSWAPD(vs61,vs61)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
899
900	XSFADD_I1	vs1,	vs1,	vs60		// realA*imagB
901	XSFADD_I2	vs1,	vs1,	vs61		// imagA*realB
902
903	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
904	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
905	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
906	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
907
908	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
909	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
910	xxpermdi	vs14,	vs2,	vs3,	0	// merge real and imag part
911
912
913
914	xxlxor		vs0,	vs0,	vs0
915	xxlxor		vs1,	vs1,	vs1
916	XXSWAPD(vs63,vs63)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
917
918	XSFADD_R1	vs0,	vs0,	vs62		// realA*realB
919	XSFADD_R2	vs0,	vs0,	vs63		// imagA*imagB
920
921	XXSWAPD(vs62,vs62)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
922	XXSWAPD(vs63,vs63)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
923
924	XSFADD_I1	vs1,	vs1,	vs62		// realA*imagB
925	XSFADD_I2	vs1,	vs1,	vs63		// imagA*realB
926
927	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
928	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
929	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
930	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r
931
932	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
933	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
934	xxpermdi	vs15,	vs2,	vs3,	0	// merge real and imag part
935
936
937#ifndef TRMMKERNEL
938
939	xvadddp		vs8,	vs8,	vs16
940	xvadddp		vs9,	vs9,	vs17
941	xvadddp		vs10,	vs10,	vs18
942	xvadddp		vs11,	vs11,	vs19
943	xvadddp		vs12,	vs12,	vs20
944	xvadddp		vs13,	vs13,	vs21
945	xvadddp		vs14,	vs14,	vs22
946	xvadddp		vs15,	vs15,	vs23
947
948#endif
949
950	stxvd2x		vs8,	o0,	T1
951	stxvd2x		vs9,	o16,	T1
952	stxvd2x		vs10,	o32,	T1
953	stxvd2x		vs11,	o48,	T1
954	stxvd2x		vs12,	o0,	T2
955	stxvd2x		vs13,	o16,	T2
956	stxvd2x		vs14,	o32,	T2
957	stxvd2x		vs15,	o48,	T2
958
959	add		T1,	T1,	LDC
960	add		T2,	T2,	LDC
961	addi		CO,	CO,	128
962
963#if defined(_AIX)
964')
965#else
966.endm
967#endif
968
969
970/**********************************************************************************************
971* Macros for N=2 and M=4
972**********************************************************************************************/
973
// LOAD2x4_1
// Preload step for the N=2 M=4 complex double kernel. Fetches 64 bytes of B
// into vs16..vs19 (real and imag parts of two B columns) and 64 bytes of A
// into vs0..vs3 (four complex elements, one real/imag pair per register),
// then advances BO and AO past the consumed data. Runs once before the
// pipelined kernel loop so the first compute step has data ready.
#if defined(_AIX)
define(`LOAD2x4_1', `
#else
.macro LOAD2x4_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64		// consumed 4 vectors of B

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64		// consumed 4 complex doubles of A


#if defined(_AIX)
')
#else
.endm
#endif
1000
// KERNEL2x4_I1
// First iteration of the software pipelined 2x4 loop. Initializes the
// sixteen accumulators vs32..vs47 with plain multiplies from the tile
// already resident in vs0..vs3 / vs16..vs19, while prefetching the next A
// tile into vs8..vs11 and the next B tile into vs20..vs23 for the following
// pipeline step. vs32..vs39 accumulate against B column 0, vs40..vs47
// against B column 1.
#if defined(_AIX)
define(`KERNEL2x4_I1', `
#else
.macro KERNEL2x4_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	// column 0 accumulators: start fresh with multiplies
	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag

	// column 1 accumulators
	xvmuldp		vs40,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs42,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs44,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs46,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1045
// KERNEL2x4_1
// Even pipeline step of the 2x4 loop. Multiply-accumulates into vs32..vs47
// from the current tile in vs0..vs3 / vs16..vs19 while prefetching the next
// tile into vs8..vs11 (A) and vs20..vs23 (B). The companion odd step then
// computes from vs8..vs11 / vs20..vs23, so loads and math overlap.
#if defined(_AIX)
define(`KERNEL2x4_1', `
#else
.macro KERNEL2x4_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	// column 0 accumulators
	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag

	// column 1 accumulators
	xvmaddadp	vs40,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs42,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs44,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs46,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1090
// KERNEL2x4_2
// Odd pipeline step of the 2x4 loop, mirror image of the even step:
// multiply-accumulates into vs32..vs47 from the tile in vs8..vs11 /
// vs20..vs23 while prefetching the following tile into vs0..vs3 (A) and
// vs16..vs19 (B).
#if defined(_AIX)
define(`KERNEL2x4_2', `
#else
.macro KERNEL2x4_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	// column 0 accumulators
	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag

	// column 1 accumulators
	xvmaddadp	vs40,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs41,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs42,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs43,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs44,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs45,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs46,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1135
// KERNEL2x4_E2
// Pipeline drain step: performs the final multiply-accumulate for the tile
// left in vs8..vs11 / vs20..vs23 without issuing any further loads, ending
// the software pipelined loop with AO and BO untouched.
#if defined(_AIX)
define(`KERNEL2x4_E2', `
#else
.macro KERNEL2x4_E2
#endif


	// column 0 accumulators
	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag

	// column 1 accumulators
	xvmaddadp	vs40,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs41,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs42,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs43,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs44,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs45,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs46,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1167
// KERNEL2x4_SUBI1
// Standalone (non pipelined) first iteration for the K remainder loop:
// loads one 2x4 tile into vs0..vs3 / vs16..vs19, advances AO and BO, and
// initializes the accumulators vs32..vs47 with plain multiplies.
#if defined(_AIX)
define(`KERNEL2x4_SUBI1', `
#else
.macro KERNEL2x4_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	// column 0 accumulators: start fresh with multiplies
	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag

	// column 1 accumulators
	xvmuldp		vs40,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs42,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs44,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs46,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1212
// KERNEL2x4_SUB1
// Standalone (non pipelined) iteration for the K remainder loop: loads one
// 2x4 tile into vs0..vs3 / vs16..vs19, advances AO and BO, and multiply-
// accumulates into the existing accumulators vs32..vs47.
#if defined(_AIX)
define(`KERNEL2x4_SUB1', `
#else
.macro KERNEL2x4_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	// column 0 accumulators
	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag

	// column 1 accumulators
	xvmaddadp	vs40,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs42,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs44,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs46,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1257
// SAVE2x4
// Write one 2x4 tile of complex double results to C (base pointer CO).
// Accumulators vs32..vs39 hold row 0 and vs40..vs47 hold row 1; each
// even/odd register pair carries the (realA*realB, imagA*realB) and
// (realA*imagB, imagA*imagB) partial products for one C element. Each
// stanza below reduces one pair into scalar real (vs0) and imag (vs1)
// parts using swap and scalar-add helper macros defined earlier in the
// file (their add/subtract choice encodes the conjugation variant),
// applies the complex alpha, and repacks the element with xxpermdi.
// Unless TRMMKERNEL is defined, the current C row (vs16..vs19) is added
// in before storing. T1 walks the two rows via LDC; CO advances 64 bytes.
#if defined(_AIX)
define(`SAVE2x4', `
#else
.macro SAVE2x4
#endif


	mr		T1,	CO

#ifndef TRMMKERNEL

	// load current row 0 of C so results can be added to it below
	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1

#endif


	// row 0, element 0: reduce vs32/vs33, scale by alpha -> vs8
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	// row 0, element 1: reduce vs34/vs35 -> vs9

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


	// row 0, element 2: reduce vs36/vs37 -> vs10

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part


	// row 0, element 3: reduce vs38/vs39 -> vs11

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// add previous contents of C (GEMM update); TRMM overwrites instead
	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19

#endif

	// store row 0 of the tile
	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1

	add		T1,	T1,	LDC		// advance to row 1 of C

#ifndef TRMMKERNEL

	// load current row 1 of C
	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1

#endif


	// row 1, element 0: reduce vs40/vs41 -> vs8
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB

	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	// row 1, element 1: reduce vs42/vs43 -> vs9

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB

	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


	// row 1, element 2: reduce vs44/vs45 -> vs10

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB

	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part


	// row 1, element 3: reduce vs46/vs47 -> vs11

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB

	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// add previous contents of C (GEMM update); TRMM overwrites instead
	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19

#endif

	// store row 1 of the tile
	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	64		// CO += 4 complex doubles

#if defined(_AIX)
')
#else
.endm
#endif
1515
1516
1517/**********************************************************************************************
1518* Macros for N=2 and M=2
1519**********************************************************************************************/
1520
// LOAD2x2_1
// Preload step for the N=2 M=2 kernel: fetch 64 bytes of B into vs16..vs19
// (real and imag parts of two B columns) and 32 bytes of A into vs0..vs1
// (two complex elements), then advance BO and AO past the consumed data.
#if defined(_AIX)
define(`LOAD2x2_1', `
#else
.macro LOAD2x2_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32


#if defined(_AIX)
')
#else
.endm
#endif
1545
// KERNEL2x2_I1
// First iteration of the pipelined 2x2 loop: initializes accumulators
// vs32..vs39 with plain multiplies from the tile in vs0..vs1 / vs16..vs19
// while prefetching the next A pair into vs8..vs9 and the next B tile into
// vs20..vs23. vs32..vs35 accumulate B column 0, vs36..vs39 column 1.
#if defined(_AIX)
define(`KERNEL2x2_I1', `
#else
.macro KERNEL2x2_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmuldp		vs36,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs38,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1580
// KERNEL2x2_1
// Even pipeline step of the 2x2 loop: multiply-accumulates into vs32..vs39
// from the current tile in vs0..vs1 / vs16..vs19 while prefetching the next
// tile into vs8..vs9 (A) and vs20..vs23 (B).
#if defined(_AIX)
define(`KERNEL2x2_1', `
#else
.macro KERNEL2x2_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmaddadp	vs36,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs38,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1615
// KERNEL2x2_2
// Odd pipeline step of the 2x2 loop, mirror of the even step: multiply-
// accumulates from vs8..vs9 / vs20..vs23 while prefetching the following
// tile into vs0..vs1 (A) and vs16..vs19 (B).
#if defined(_AIX)
define(`KERNEL2x2_2', `
#else
.macro KERNEL2x2_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag

	xvmaddadp	vs36,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs37,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs38,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1650
// KERNEL2x2_E2
// Pipeline drain step for the 2x2 loop: final multiply-accumulate from
// vs8..vs9 / vs20..vs23 with no further loads; AO and BO are untouched.
#if defined(_AIX)
define(`KERNEL2x2_E2', `
#else
.macro KERNEL2x2_E2
#endif


	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag

	xvmaddadp	vs36,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs37,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs38,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1674
// KERNEL2x2_SUBI1
// Standalone first iteration for the 2x2 K remainder loop: loads one tile
// into vs0..vs1 / vs16..vs19, advances AO and BO, and initializes the
// accumulators vs32..vs39 with plain multiplies.
#if defined(_AIX)
define(`KERNEL2x2_SUBI1', `
#else
.macro KERNEL2x2_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmuldp		vs36,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs38,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1709
// KERNEL2x2_SUB1
// Standalone iteration for the 2x2 K remainder loop: loads one tile into
// vs0..vs1 / vs16..vs19, advances AO and BO, and multiply-accumulates into
// the existing accumulators vs32..vs39.
#if defined(_AIX)
define(`KERNEL2x2_SUB1', `
#else
.macro KERNEL2x2_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmaddadp	vs36,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs38,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1744
// SAVE2x2
// Write one 2x2 tile of complex double results to C (base pointer CO).
// vs32..vs35 hold row 0 and vs36..vs39 row 1; each even/odd pair carries
// the (realA*realB, imagA*realB) and (realA*imagB, imagA*imagB) partial
// products for one C element. Each stanza reduces one pair to scalar real
// (vs0) and imag (vs1) parts via swap and scalar-add helper macros defined
// earlier in the file, applies the complex alpha, and repacks the element
// with xxpermdi. Unless TRMMKERNEL is defined the current C row (vs16,
// vs17) is added in before storing. T1 walks the rows via LDC; CO advances
// 32 bytes (2 complex doubles).
#if defined(_AIX)
define(`SAVE2x2', `
#else
.macro SAVE2x2
#endif


	mr		T1,	CO

#ifndef TRMMKERNEL

	// load current row 0 of C so results can be added to it below
	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1

#endif


	// row 0, element 0: reduce vs32/vs33, scale by alpha -> vs8
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	// row 0, element 1: reduce vs34/vs35 -> vs9

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// add previous contents of C (GEMM update); TRMM overwrites instead
	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17

#endif

	// store row 0 of the tile
	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1

	add		T1,	T1,	LDC		// advance to row 1 of C

#ifndef TRMMKERNEL

	// load current row 1 of C
	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1

#endif


	// row 1, element 0: reduce vs36/vs37 -> vs8
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	// row 1, element 1: reduce vs38/vs39 -> vs9

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// add previous contents of C (GEMM update); TRMM overwrites instead
	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17

#endif

	// store row 1 of the tile
	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	32		// CO += 2 complex doubles

#if defined(_AIX)
')
#else
.endm
#endif
1894
1895
1896/**********************************************************************************************
1897* Macros for N=2 and M=1
1898**********************************************************************************************/
1899
// LOAD2x1_1
// Preload step for the N=2 M=1 kernel: fetch 64 bytes of B into vs16..vs19
// (real and imag parts of two B columns) and one complex element of A into
// vs0 (16 bytes), then advance BO and AO past the consumed data.
#if defined(_AIX)
define(`LOAD2x1_1', `
#else
.macro LOAD2x1_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16


#if defined(_AIX)
')
#else
.endm
#endif
1923
#if defined(_AIX)
define(`KERNEL2x1_I1', `
#else
.macro KERNEL2x1_I1
#endif

// N=2, M=1: first pipelined kernel step. Prefetches the next operands
// into vs8 (A) and vs20-vs23 (B) while initializing the accumulators
// vs32-vs35 with plain multiplies of the values loaded by LOAD2x1_1
// (xvmuldp overwrites, so the accumulators need no prior zeroing).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmuldp		vs34,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1953
#if defined(_AIX)
define(`KERNEL2x1_1', `
#else
.macro KERNEL2x1_1
#endif

// N=2, M=1: even-phase pipelined step. Accumulates the current operands
// (vs0, vs16-vs19) into vs32-vs35 while loading the next operands into
// the alternate register set (vs8, vs20-vs23).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmaddadp	vs34,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
1983
#if defined(_AIX)
define(`KERNEL2x1_2', `
#else
.macro KERNEL2x1_2
#endif

// N=2, M=1: odd-phase pipelined step. Accumulates the operands prefetched
// by the even phase (vs8, vs20-vs23) while reloading the primary register
// set (vs0, vs16-vs19) for the next even phase.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag

	xvmaddadp	vs34,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2013
#if defined(_AIX)
define(`KERNEL2x1_E2', `
#else
.macro KERNEL2x1_E2
#endif

// N=2, M=1: pipeline drain. Final accumulation of the operands already
// sitting in vs8 and vs20-vs23; performs no loads and moves no pointers.

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag

	xvmaddadp	vs34,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2033
#if defined(_AIX)
define(`KERNEL2x1_SUBI1', `
#else
.macro KERNEL2x1_SUBI1
#endif

// N=2, M=1: non-pipelined first step for the K remainder loop.
// Loads one A element and both B parts, then initializes the
// accumulators vs32-vs35 with multiplies (no prior accumulator state).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmuldp		vs34,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2063
#if defined(_AIX)
define(`KERNEL2x1_SUB1', `
#else
.macro KERNEL2x1_SUB1
#endif

// N=2, M=1: non-pipelined step for the K remainder loop.
// Loads one A element and both B parts, then accumulates into vs32-vs35.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmaddadp	vs34,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2093
#if defined(_AIX)
define(`SAVE2x1', `
#else
.macro SAVE2x1
#endif

// N=2, M=1: reduce the accumulators and write the results back to C.
// For each of the two column results (vs32/vs33, then vs34/vs35):
//   1. horizontally reduce the partial products into a scalar real (vs0)
//      and scalar imag (vs1) using the XSFADD_R1/R2/I1/I2 helper macros
//      (defined elsewhere in this file - NOTE(review): presumably they
//      select add vs subtract for the conjugation variants - confirm),
//   2. scale by alpha_r/alpha_i to form the complex product alpha*acc,
//   3. merge real and imag back into one vector register,
//   4. for a GEMM build (not TRMMKERNEL) add the existing C element,
//   5. store and step T1 down by LDC to the next row of C.
// Finally CO advances by 16 bytes (one complex element).

	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1		// existing C value, first row

#endif


	xxlxor		vs0,	vs0,	vs0		// vs0 = 0 (real accumulator)
	xxlxor		vs1,	vs1,	vs1		// vs1 = 0 (imag accumulator)
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16		// C += alpha*acc (GEMM only)

#endif

	stxvd2x		vs8,	o0,	T1		// store first row result

	add		T1,	T1,	LDC		// next row of C

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1		// existing C value, second row

#endif


	xxlxor		vs0,	vs0,	vs0		// vs0 = 0 (real accumulator)
	xxlxor		vs1,	vs1,	vs1		// vs1 = 0 (imag accumulator)
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16		// C += alpha*acc (GEMM only)

#endif

	stxvd2x		vs8,	o0,	T1		// store second row result

	add		T1,	T1,	LDC
	addi		CO,	CO,	16		// CO += one complex element

#if defined(_AIX)
')
#else
.endm
#endif
2189
2190
2191/**********************************************************************************************
2192* Macros for N=1 and M=8
2193**********************************************************************************************/
2194
#if defined(_AIX)
define(`LOAD1x8_1', `
#else
.macro LOAD1x8_1
#endif

// N=1, M=8: preamble of the software-pipelined K loop.
// vs16/vs17 = real-part and imag-part vectors of the single B value,
// vs0-vs7   = eight complex (real,imag) elements of A.

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// B pointer += 2 vectors (32 bytes)

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64		// A pointer += 8 complex doubles total


#if defined(_AIX)
')
#else
.endm
#endif
2226
#if defined(_AIX)
define(`KERNEL1x8_I1', `
#else
.macro KERNEL1x8_I1
#endif

// N=1, M=8: first pipelined kernel step. Prefetches the next operands
// into vs8-vs15 (A) and vs20/vs21 (B) while initializing the sixteen
// accumulators vs32-vs47 with plain multiplies of the values loaded by
// LOAD1x8_1 (xvmuldp overwrites, so no accumulator zeroing is needed).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2275
#if defined(_AIX)
define(`KERNEL1x8_1', `
#else
.macro KERNEL1x8_1
#endif

// N=1, M=8: even-phase pipelined step. Accumulates the current operands
// (vs0-vs7, vs16/vs17) into vs32-vs47 while loading the next operands
// into the alternate register set (vs8-vs15, vs20/vs21).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2324
#if defined(_AIX)
define(`KERNEL1x8_2', `
#else
.macro KERNEL1x8_2
#endif

// N=1, M=8: odd-phase pipelined step. Accumulates the operands prefetched
// by the even phase (vs8-vs15, vs20/vs21) while reloading the primary
// register set (vs0-vs7, vs16/vs17) for the next even phase.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2373
#if defined(_AIX)
define(`KERNEL1x8_E2', `
#else
.macro KERNEL1x8_E2
#endif

// N=1, M=8: pipeline drain. Final accumulation of the operands already
// sitting in vs8-vs15 and vs20/vs21; performs no loads and moves no
// pointers.

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2404
#if defined(_AIX)
define(`KERNEL1x8_SUBI1', `
#else
.macro KERNEL1x8_SUBI1
#endif

// N=1, M=8: non-pipelined first step for the K remainder loop.
// Loads eight A elements and the B parts, then initializes the
// accumulators vs32-vs47 with multiplies (no prior accumulator state).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2453
#if defined(_AIX)
define(`KERNEL1x8_SUB1', `
#else
.macro KERNEL1x8_SUB1
#endif

// N=1, M=8: non-pipelined step for the K remainder loop.
// Loads eight A elements and the B parts, then accumulates into vs32-vs47.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2502
#if defined(_AIX)
define(`SAVE1x8', `
#else
.macro SAVE1x8
#endif

// N=1, M=8: reduce the sixteen accumulators into eight complex results
// vs8-vs15, scale each by alpha, optionally add the existing C values
// (GEMM builds only, not TRMMKERNEL), and store one row of eight complex
// elements at CO. T1 addresses the first 64 bytes, T2 = T1 + 64 the
// second 64 bytes. Each of the eight stanzas below is identical except
// for its accumulator pair and destination register:
//   reduce via XSFADD_R1/R2/I1/I2 (helper macros defined elsewhere in
//   this file - NOTE(review): presumably sign-selecting adds for the
//   conjugation variants - confirm), then scale by alpha_r/alpha_i and
//   merge real/imag with xxpermdi. Finally CO advances by 128 bytes.

	mr		T1,	CO
	addi		T2,	T1,	64		// T2 -> second half of the row

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1		// existing C values (GEMM only)
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1
	lxvd2x		vs20,	o0,	T2
	lxvd2x		vs21,	o16,	T2
	lxvd2x		vs22,	o32,	T2
	lxvd2x		vs23,	o48,	T2

#endif

// result 1 of 8: reduce vs32/vs33 -> vs8

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part

// result 2 of 8: reduce vs34/vs35 -> vs9

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part

// result 3 of 8: reduce vs36/vs37 -> vs10

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part

// result 4 of 8: reduce vs38/vs39 -> vs11

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part

// result 5 of 8: reduce vs40/vs41 -> vs12

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB

	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs12,	vs2,	vs3,	0	// merge real and imag part

// result 6 of 8: reduce vs42/vs43 -> vs13

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB

	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs13,	vs2,	vs3,	0	// merge real and imag part

// result 7 of 8: reduce vs44/vs45 -> vs14

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB

	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs14,	vs2,	vs3,	0	// merge real and imag part

// result 8 of 8: reduce vs46/vs47 -> vs15

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB

	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs15,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16		// C += alpha*acc (GEMM only)
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19
	xvadddp		vs12,	vs12,	vs20
	xvadddp		vs13,	vs13,	vs21
	xvadddp		vs14,	vs14,	vs22
	xvadddp		vs15,	vs15,	vs23

#endif

	stxvd2x		vs8,	o0,	T1		// store first half of the row
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1
	stxvd2x		vs12,	o0,	T2		// store second half of the row
	stxvd2x		vs13,	o16,	T2
	stxvd2x		vs14,	o32,	T2
	stxvd2x		vs15,	o48,	T2

	add		T1,	T1,	LDC
	add		T2,	T2,	LDC
	addi		CO,	CO,	128		// CO += 8 complex elements

#if defined(_AIX)
')
#else
.endm
#endif
2749
2750
2751/**********************************************************************************************
2752* Macros for N=1 and M=4
2753**********************************************************************************************/
2754
#if defined(_AIX)
define(`LOAD1x4_1', `
#else
.macro LOAD1x4_1
#endif

// N=1, M=4: preamble of the software-pipelined K loop.
// vs16/vs17 = real-part and imag-part vectors of the single B value,
// vs0-vs3   = four complex (real,imag) elements of A.

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// B pointer += 2 vectors (32 bytes)

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64		// A pointer += 4 complex doubles (64 bytes)


#if defined(_AIX)
')
#else
.endm
#endif
2779
#if defined(_AIX)
define(`KERNEL1x4_I1', `
#else
.macro KERNEL1x4_I1
#endif

// N=1, M=4: first pipelined kernel step. Prefetches the next operands
// into vs8-vs11 (A) and vs20/vs21 (B) while initializing the accumulators
// vs32-vs39 with plain multiplies of the values loaded by LOAD1x4_1
// (xvmuldp overwrites, so no accumulator zeroing is needed).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2813
#if defined(_AIX)
define(`KERNEL1x4_1', `
#else
.macro KERNEL1x4_1
#endif

// N=1, M=4: even-phase pipelined step. Accumulates the current operands
// (vs0-vs3, vs16/vs17) into vs32-vs39 while loading the next operands
// into the alternate register set (vs8-vs11, vs20/vs21).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2847
#if defined(_AIX)
define(`KERNEL1x4_2', `
#else
.macro KERNEL1x4_2
#endif

// N=1, M=4: odd-phase pipelined step. Accumulates the operands prefetched
// by the even phase (vs8-vs11, vs20/vs21) while reloading the primary
// register set (vs0-vs3, vs16/vs17) for the next even phase.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2881
#if defined(_AIX)
define(`KERNEL1x4_E2', `
#else
.macro KERNEL1x4_E2
#endif

// N=1, M=4: pipeline drain. Final accumulation of the operands already
// sitting in vs8-vs11 and vs20/vs21; performs no loads and moves no
// pointers.

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2904
#if defined(_AIX)
define(`KERNEL1x4_SUBI1', `
#else
.macro KERNEL1x4_SUBI1
#endif

// N=1, M=4: non-pipelined first step for the K remainder loop.
// Loads four A elements and the B parts, then initializes the
// accumulators vs32-vs39 with multiplies (no prior accumulator state).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2938
#if defined(_AIX)
define(`KERNEL1x4_SUB1', `
#else
.macro KERNEL1x4_SUB1
#endif

// N=1, M=4: non-pipelined step for the K remainder loop.
// Loads four A elements and the B parts, then accumulates into vs32-vs39.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
2972
#if defined(_AIX)
define(`SAVE1x4', `
#else
.macro SAVE1x4
#endif

// Reduce the four accumulator pairs (vs32/33 .. vs38/39) to complex
// scalars, scale by alpha = (alpha_r, alpha_i), add the existing C
// values unless building the TRMM kernel, and store 4 complex doubles
// back to C.
// NOTE(review): XSFADD_R1/R2 and XSFADD_I1/I2 are helper macros defined
// elsewhere in this file; presumably they add or subtract depending on
// the conjugation variant being compiled - confirm there.

	mr		T1,	CO			// T1 = current C column pointer

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1		// preload existing C values
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1

#endif

	// ---- element 0: reduce vs32/vs33, result merged into vs8 ----
	xxlxor		vs0,	vs0,	vs0		// vs0 = 0 (real accumulator)
	xxlxor		vs1,	vs1,	vs1		// vs1 = 0 (imag accumulator)
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	// ---- element 1: reduce vs34/vs35, result merged into vs9 ----
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


	// ---- element 2: reduce vs36/vs37, result merged into vs10 ----
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part


	// ---- element 3: reduce vs38/vs39, result merged into vs11 ----
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// GEMM path: add previous C contents (TRMM overwrites instead)
	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19

#endif

	stxvd2x		vs8,	o0,	T1		// store 4 complex results to C
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1

	add		T1,	T1,	LDC		// advance row pointer by leading dimension
	addi		CO,	CO,	64		// advance CO past 4 complex doubles

#if defined(_AIX)
')
#else
.endm
#endif
3109
3110
3111/**********************************************************************************************
3112* Macros for N=1 and M=2
3113**********************************************************************************************/
3114
#if defined(_AIX)
define(`LOAD1x2_1', `
#else
.macro LOAD1x2_1
#endif

// Preload for the software-pipelined N=1 / M=2 kernel: fetch the first
// B element (splatted real part in vs16, imag part in vs17) and two
// complex doubles of A (vs0, vs1) before the main loop starts.

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32		// advance A by 2 complex doubles


#if defined(_AIX)
')
#else
.endm
#endif
3137
#if defined(_AIX)
define(`KERNEL1x2_I1', `
#else
.macro KERNEL1x2_I1
#endif

// First pipelined 1x2 iteration: initialize accumulators vs32-vs35 with
// plain multiplies on the data preloaded by LOAD1x2_1 (vs0/vs1, vs16/vs17)
// while prefetching the next A into vs8/vs9 and the next B into vs20/vs21.

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32		// advance A by 2 complex doubles

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3165
#if defined(_AIX)
define(`KERNEL1x2_1', `
#else
.macro KERNEL1x2_1
#endif

// Even pipelined 1x2 iteration: accumulate the previously loaded
// vs0/vs1 * vs16/vs17 into vs32-vs35 while prefetching the next A into
// vs8/vs9 and the next B into vs20/vs21 (consumed by KERNEL1x2_2).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32		// advance A by 2 complex doubles

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3193
#if defined(_AIX)
define(`KERNEL1x2_2', `
#else
.macro KERNEL1x2_2
#endif

// Odd pipelined 1x2 iteration: accumulate the prefetched vs8/vs9 *
// vs20/vs21 into vs32-vs35 while reloading vs0/vs1 and vs16/vs17 for
// the next KERNEL1x2_1 (register sets alternate each iteration).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32		// advance A by 2 complex doubles

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3221
#if defined(_AIX)
define(`KERNEL1x2_E2', `
#else
.macro KERNEL1x2_E2
#endif

// Pipeline drain for the 1x2 loop: accumulate the last prefetched
// vs8/vs9 * vs20/vs21; no further loads are issued.

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3240
#if defined(_AIX)
define(`KERNEL1x2_SUBI1', `
#else
.macro KERNEL1x2_SUBI1
#endif

// First iteration of the N=1 / M=2 remainder loop: load A and B and
// initialize accumulators vs32-vs35 with plain multiplies (no prior
// accumulator contents).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32		// advance A by 2 complex doubles

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3268
#if defined(_AIX)
define(`KERNEL1x2_SUB1', `
#else
.macro KERNEL1x2_SUB1
#endif

// One k-iteration of the N=1 / M=2 remainder loop: load A and B and
// accumulate into vs32-vs35 with fused multiply-add.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32		// advance A by 2 complex doubles

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3296
#if defined(_AIX)
define(`SAVE1x2', `
#else
.macro SAVE1x2
#endif

// Reduce the two accumulator pairs (vs32/33, vs34/35) to complex
// scalars, scale by alpha = (alpha_r, alpha_i), add the existing C
// values unless building the TRMM kernel, and store 2 complex doubles
// back to C.
// NOTE(review): XSFADD_R1/R2 and XSFADD_I1/I2 are helper macros defined
// elsewhere in this file; presumably they add or subtract depending on
// the conjugation variant being compiled - confirm there.

	mr		T1,	CO			// T1 = current C column pointer

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1		// preload existing C values
	lxvd2x		vs17,	o16,	T1

#endif

	// ---- element 0: reduce vs32/vs33, result merged into vs8 ----
	xxlxor		vs0,	vs0,	vs0		// vs0 = 0 (real accumulator)
	xxlxor		vs1,	vs1,	vs1		// vs1 = 0 (imag accumulator)
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	// ---- element 1: reduce vs34/vs35, result merged into vs9 ----
	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// GEMM path: add previous C contents (TRMM overwrites instead)
	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17

#endif

	stxvd2x		vs8,	o0,	T1		// store 2 complex results to C
	stxvd2x		vs9,	o16,	T1

	add		T1,	T1,	LDC		// advance row pointer by leading dimension
	addi		CO,	CO,	32		// advance CO past 2 complex doubles

#if defined(_AIX)
')
#else
.endm
#endif
3379
3380
3381/**********************************************************************************************
3382* Macros for N=1 and M=1
3383**********************************************************************************************/
3384
#if defined(_AIX)
define(`LOAD1x1_1', `
#else
.macro LOAD1x1_1
#endif

// Preload for the software-pipelined N=1 / M=1 kernel: fetch the first
// B element (splatted real part in vs16, imag part in vs17) and one
// complex double of A (vs0) before the main loop starts.

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16		// advance A by 1 complex double


#if defined(_AIX)
')
#else
.endm
#endif
3406
#if defined(_AIX)
define(`KERNEL1x1_I1', `
#else
.macro KERNEL1x1_I1
#endif

// First pipelined 1x1 iteration: initialize accumulators vs32/vs33 with
// plain multiplies on the data preloaded by LOAD1x1_1 (vs0, vs16/vs17)
// while prefetching the next A into vs8 and the next B into vs20/vs21.

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16		// advance A by 1 complex double

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3431
#if defined(_AIX)
define(`KERNEL1x1_1', `
#else
.macro KERNEL1x1_1
#endif

// Even pipelined 1x1 iteration: accumulate the previously loaded
// vs0 * vs16/vs17 into vs32/vs33 while prefetching the next A into vs8
// and the next B into vs20/vs21 (consumed by KERNEL1x1_2).

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16		// advance A by 1 complex double

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3456
#if defined(_AIX)
define(`KERNEL1x1_2', `
#else
.macro KERNEL1x1_2
#endif

// Odd pipelined 1x1 iteration: accumulate the prefetched vs8 * vs20/vs21
// into vs32/vs33 while reloading vs0 and vs16/vs17 for the next
// KERNEL1x1_1 (register sets alternate each iteration).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16		// advance A by 1 complex double

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3481
#if defined(_AIX)
define(`KERNEL1x1_E2', `
#else
.macro KERNEL1x1_E2
#endif

// Pipeline drain for the 1x1 loop: accumulate the last prefetched
// vs8 * vs20/vs21; no further loads are issued.

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3498
#if defined(_AIX)
define(`KERNEL1x1_SUBI1', `
#else
.macro KERNEL1x1_SUBI1
#endif

// First iteration of the N=1 / M=1 remainder loop: load A and B and
// initialize accumulators vs32/vs33 with plain multiplies (no prior
// accumulator contents).

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16		// advance A by 1 complex double

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3523
#if defined(_AIX)
define(`KERNEL1x1_SUB1', `
#else
.macro KERNEL1x1_SUB1
#endif

// One k-iteration of the N=1 / M=1 remainder loop: load A and B and
// accumulate into vs32/vs33 with fused multiply-add.

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16		// advance A by 1 complex double

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32		// advance B past splatted real+imag pair

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
3548
#if defined(_AIX)
define(`SAVE1x1', `
#else
.macro SAVE1x1
#endif

// Reduce the accumulator pair vs32/vs33 to one complex scalar, scale by
// alpha = (alpha_r, alpha_i), add the existing C value unless building
// the TRMM kernel, and store 1 complex double back to C.
// NOTE(review): XSFADD_R1/R2 and XSFADD_I1/I2 are helper macros defined
// elsewhere in this file; presumably they add or subtract depending on
// the conjugation variant being compiled - confirm there.

	mr		T1,	CO			// T1 = current C column pointer

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1		// preload existing C value

#endif

	xxlxor		vs0,	vs0,	vs0		// vs0 = 0 (real accumulator)
	xxlxor		vs1,	vs1,	vs1		// vs1 = 0 (imag accumulator)
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	// GEMM path: add previous C contents (TRMM overwrites instead)
	xvadddp		vs8,	vs8,	vs16

#endif

	stxvd2x		vs8,	o0,	T1		// store 1 complex result to C

	add		T1,	T1,	LDC		// advance row pointer by leading dimension
	addi		CO,	CO,	16		// advance CO past 1 complex double

#if defined(_AIX)
')
#else
.endm
#endif
3604
3605
3606
#if defined(_AIX)
define(`ZCOPYB_1x1', `
#else
.macro ZCOPYB_1x1
#endif

// Copy one complex double of B into the packed buffer BBO, duplicating
// the real part (vs4) and the imag part (vs5) across both vector lanes
// via lxvdsx, in the layout the kernels above expect for vs16/vs17.

        lxvdsx          vs4,    o0,     BO              // b0_r
        lxvdsx          vs5,    o8,     BO              // b0_i
        addi            BO,     BO,     16              // advance B by 1 complex double
        stxvd2x         vs4,    o0,     BBO             // store splatted real part
        stxvd2x         vs5,    o16,    BBO             // store splatted imag part
        addi            BBO,    BBO,    32              // advance packed-buffer pointer

#if defined(_AIX)
')
#else
.endm
#endif
3625
3626
#if defined(_AIX)
define(`ZCOPYB_8x1', `
#else
.macro ZCOPYB_8x1
#endif

// Copy eight complex doubles of B into the packed buffer BBO. Each
// source vector (one complex number) is split by XXSPLTD into a vector
// with the first doubleword duplicated (index 0) and one with the
// second doubleword duplicated (index 1), giving the splatted
// real/imag pairs consumed by the compute kernels.

        lxvd2x          vs32,   o0,     BO              // load b0..b3
        lxvd2x          vs33,  o16,     BO
        lxvd2x          vs34,  o32,     BO
        lxvd2x          vs35,  o48,     BO
        addi            BO,     BO,     64

        lxvd2x          vs36,   o0,     BO              // load b4..b7
        lxvd2x          vs37,  o16,     BO
        lxvd2x          vs38,  o32,     BO
        lxvd2x          vs39,  o48,     BO
        addi            BO,     BO,     64

	// split b0..b3 into duplicated-doubleword pairs
	XXSPLTD(vs40,vs32,0)
	XXSPLTD(vs41,vs32,1)
	XXSPLTD(vs42,vs33,0)
	XXSPLTD(vs43,vs33,1)
	XXSPLTD(vs44,vs34,0)
	XXSPLTD(vs45,vs34,1)
	XXSPLTD(vs46,vs35,0)
	XXSPLTD(vs47,vs35,1)

	// split b4..b7 into duplicated-doubleword pairs
	XXSPLTD(vs48,vs36,0)
	XXSPLTD(vs49,vs36,1)
	XXSPLTD(vs50,vs37,0)
	XXSPLTD(vs51,vs37,1)
	XXSPLTD(vs52,vs38,0)
	XXSPLTD(vs53,vs38,1)
	XXSPLTD(vs54,vs39,0)
	XXSPLTD(vs55,vs39,1)

        stxvd2x         vs40,    o0,     BBO            // store 16 splatted vectors
        stxvd2x         vs41,   o16,     BBO
        stxvd2x         vs42,   o32,     BBO
        stxvd2x         vs43,   o48,     BBO
        addi            BBO,    BBO,    64

        stxvd2x         vs44,    o0,     BBO
        stxvd2x         vs45,   o16,     BBO
        stxvd2x         vs46,   o32,     BBO
        stxvd2x         vs47,   o48,     BBO
        addi            BBO,    BBO,    64

        stxvd2x         vs48,    o0,     BBO
        stxvd2x         vs49,   o16,     BBO
        stxvd2x         vs50,   o32,     BBO
        stxvd2x         vs51,   o48,     BBO
        addi            BBO,    BBO,    64

        stxvd2x         vs52,    o0,     BBO
        stxvd2x         vs53,   o16,     BBO
        stxvd2x         vs54,   o32,     BBO
        stxvd2x         vs55,   o48,     BBO
        addi            BBO,    BBO,    64

#if defined(_AIX)
')
#else
.endm
#endif
3692
3693
3694