/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Complex AXPY kernel.  For each of N complex elements:
 *
 *     y[i] += alpha * x[i]          (CONJ not defined)
 *     y[i] += alpha * conj(x[i])    (CONJ defined)
 *
 * Each element is a (real, imaginary) pair of SIZE-byte values.  alpha
 * arrives in f1 (real part) and f2 (imaginary part).  N, X, INCX, Y and
 * INCY arrive in the general-purpose registers named below, or are read
 * from the caller's frame where the target ABI has run out of argument
 * registers.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), SP, SIZE, ZBASE_SHIFT and the
 * upper-case floating-point mnemonics (LFDUX, LFDU, STFDUX, STFDU, FMADD,
 * FNMSUB) come from common.h and are not visible here.  The arithmetic
 * notes below assume FMADD/FNMSUB expand to the PowerPC fmadd/fnmsub
 * family and that (1 << ZBASE_SHIFT) == 2 * SIZE -- confirm in common.h.
 *
 * Register roles:
 *   N      element count
 *   X/Y    walking load pointers for x and y
 *   YY     walking store pointer for y (trails Y)
 *   INCX/INCY  byte strides, adjusted as described at the pointer setup
 *   PRE    byte offset used by the PPCG4 cache-touch hints
 */

/* Register map: Linux. */
#ifdef linux
#ifndef __64BIT__
#define N	r3
#define X	r6
#define INCX	r7
#define Y	r8
#define INCY	r9
#define	YY	r4
#define PRE	r5
#else
/*
 * 64-bit: X and INCX are in r8/r9 and INCY is fetched from the stack at
 * 112 + STACKSIZE(SP), exactly as in the AIX/Apple 64-bit map below, so Y
 * is the last register-passed argument and lives in r10.  It was bound to
 * r5 before, which is an unused integer argument slot, not the y pointer.
 */
#define N	r3
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r4
#define	YY	r6
#define PRE	r7
#endif
#endif

/* Register map: AIX and Darwin. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
/* 32-bit double precision: INCX, Y and INCY are loaded from the stack. */
#define N	r3
#define X	r10
#define INCX	r4
#define Y	r5
#define INCY	r6
#define	YY	r7
#define PRE	r8
#else
/* 64-bit, or 32-bit single precision: only INCY is loaded from the stack. */
#define N	r3
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r4
#define	YY	r5
#define PRE	r6
#endif
#endif

/* alpha is parked in non-volatile FPRs so f0-f13 are free for data. */
#define ALPHA_R	f24
#define ALPHA_I	f25

/*
 * ADD1 finishes the real part, ADD2 the imaginary part:
 *   non-CONJ:  re -= alpha_i * x_i     im += alpha_r * x_i
 *   CONJ:      re += alpha_i * x_i     im -= alpha_r * x_i
 */
#ifndef CONJ
#define ADD1	FNMSUB
#define ADD2	FMADD
#else
#define ADD1	FMADD
#define ADD2	FNMSUB
#endif

/* Twelve 8-byte save slots for f14-f25. */
#define STACKSIZE 96

	PROLOGUE
	PROFCODE

	/* Reserve the save area and spill f14-f25, all of which are used below. */
	subi	SP, SP, STACKSIZE

	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	/*
	 * Fetch the arguments that were passed in memory.  Offsets are relative
	 * to the caller's SP, hence the + STACKSIZE after the adjustment above.
	 */
#if defined(linux) && defined(__64BIT__)
	ld	INCY, 112 + STACKSIZE(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	INCY,   112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	INCX,    56 + STACKSIZE(SP)
	lwz	Y,       60 + STACKSIZE(SP)
	lwz	INCY,    64 + STACKSIZE(SP)
#else
	lwz	INCY,    56 + STACKSIZE(SP)
#endif
#endif
#endif

	/* Capture alpha before f1/f2 are reused as data registers. */
	fmr	ALPHA_R, f1
	slwi	INCX, INCX, ZBASE_SHIFT
	fmr	ALPHA_I, f2
	slwi	INCY, INCY, ZBASE_SHIFT

	/*
	 * Every element is accessed as an indexed update-form access (pointer
	 * += INC) followed by a displacement update-form access (pointer +=
	 * SIZE).  Taking SIZE off the scaled increment makes the pair advance
	 * by exactly the scaled increment.
	 */
	subi	INCX, INCX, SIZE
	subi	INCY, INCY, SIZE

	li	PRE, 2 * 16 * SIZE

	/* Nothing to do for N <= 0. */
	cmpwi	cr0, N, 0
	ble-	LL(999)

	/*
	 * Back the pointers up by one adjusted increment so the first
	 * update-form access lands on the first element.  YY starts at the
	 * same position as Y and is advanced only by the stores.
	 */
	sub	X, X, INCX
	sub	Y, Y, INCY
	mr	YY, Y

	/* CTR = N / 8; skip the unrolled path when there is no full group. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	ble-	LL(150)
	.align 4

	/*
	 * Pipeline fill: load four x elements (f0-f7) and four y elements
	 * (f8-f15) ahead of the arithmetic.
	 */
	LFDUX	f0,    X, INCX
	LFDU	f1,    1 * SIZE(X)
	LFDUX	f2,    X, INCX
	LFDU	f3,    1 * SIZE(X)

	LFDUX	f8,    Y, INCY
	LFDU	f9,    1 * SIZE(Y)
	LFDUX	f10,   Y, INCY
	LFDU	f11,   1 * SIZE(Y)

	LFDUX	f4,    X, INCX
	LFDU	f5,    1 * SIZE(X)
	LFDUX	f6,    X, INCX
	LFDU	f7,    1 * SIZE(X)

	LFDUX	f12,   Y, INCY
	LFDU	f13,   1 * SIZE(Y)
	LFDUX	f14,   Y, INCY
	LFDU	f15,   1 * SIZE(Y)
	/* A single group of eight goes straight to the drain code. */
	bdz	LL(120)
	.align 4

	/*
	 * Steady state: each pass computes and stores eight elements while
	 * loading the next eight.  Results accumulate in f16-f23; each input
	 * register is reloaded right after its last use.
	 */
LL(110):
	/* Elements 0-1: start with alpha * x_r + y. */
	FMADD	f16,  ALPHA_R, f0, f8
	LFDUX	f8,    Y, INCY
	FMADD	f17,  ALPHA_I, f0, f9
	LFDU	f9,    1 * SIZE(Y)
	FMADD	f18,  ALPHA_R, f2, f10
	LFDUX	f10,   Y, INCY
	FMADD	f19,  ALPHA_I, f2, f11
	LFDU	f11,   1 * SIZE(Y)
#ifdef PPCG4
	dcbt	X, PRE
#endif

	/* Elements 0-1: fold in the x_i terms. */
	ADD1	f16,  ALPHA_I, f1, f16
	LFDUX	f0,    X, INCX
	ADD2	f17,  ALPHA_R, f1, f17
	LFDU	f1,    1 * SIZE(X)
	ADD1	f18,  ALPHA_I, f3, f18
	LFDUX	f2,    X, INCX
	ADD2	f19,  ALPHA_R, f3, f19
	LFDU	f3,    1 * SIZE(X)
#ifdef PPCG4
	dcbtst	Y, PRE
#endif

	/* Elements 2-3. */
	FMADD	f20,  ALPHA_R, f4, f12
	LFDUX	f12,   Y, INCY
	FMADD	f21,  ALPHA_I, f4, f13
	LFDU	f13,   1 * SIZE(Y)
	FMADD	f22,  ALPHA_R, f6, f14
	LFDUX	f14,   Y, INCY
	FMADD	f23,  ALPHA_I, f6, f15
	LFDU	f15,   1 * SIZE(Y)
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X, PRE
#endif

	ADD1	f20,  ALPHA_I, f5, f20
	LFDUX	f4,    X, INCX
	ADD2	f21,  ALPHA_R, f5, f21
	LFDU	f5,    1 * SIZE(X)
	ADD1	f22,  ALPHA_I, f7, f22
	LFDUX	f6,    X, INCX
	ADD2	f23,  ALPHA_R, f7, f23
	LFDU	f7,    1 * SIZE(X)
#if defined(PPCG4) && defined(DOUBLE)
	dcbtst	Y, PRE
#endif

	/* Store elements 0-1 so that f16-f19 can be reused. */
	STFDUX	f16,   YY, INCY
	STFDU	f17,   1 * SIZE(YY)
	STFDUX	f18,   YY, INCY
	STFDU	f19,   1 * SIZE(YY)

	/* Elements 4-5. */
	FMADD	f16,  ALPHA_R, f0, f8
	LFDUX	f8,    Y, INCY
	FMADD	f17,  ALPHA_I, f0, f9
	LFDU	f9,    1 * SIZE(Y)
	FMADD	f18,  ALPHA_R, f2, f10
	LFDUX	f10,   Y, INCY
	FMADD	f19,  ALPHA_I, f2, f11
	LFDU	f11,   1 * SIZE(Y)
#ifdef PPCG4
	dcbt	X, PRE
#endif

	ADD1	f16,  ALPHA_I, f1, f16
	LFDUX	f0,    X, INCX
	ADD2	f17,  ALPHA_R, f1, f17
	LFDU	f1,    1 * SIZE(X)
	ADD1	f18,  ALPHA_I, f3, f18
	LFDUX	f2,    X, INCX
	ADD2	f19,  ALPHA_R, f3, f19
	LFDU	f3,    1 * SIZE(X)
#ifdef PPCG4
	dcbtst	Y, PRE
#endif

	/* Store elements 2-3 so that f20-f23 can be reused. */
	STFDUX	f20,   YY, INCY
	STFDU	f21,   1 * SIZE(YY)
	STFDUX	f22,   YY, INCY
	STFDU	f23,   1 * SIZE(YY)

	/* Elements 6-7. */
	FMADD	f20,  ALPHA_R, f4, f12
	LFDUX	f12,   Y, INCY
	FMADD	f21,  ALPHA_I, f4, f13
	LFDU	f13,   1 * SIZE(Y)
	FMADD	f22,  ALPHA_R, f6, f14
	LFDUX	f14,   Y, INCY
	FMADD	f23,  ALPHA_I, f6, f15
	LFDU	f15,   1 * SIZE(Y)
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X, PRE
#endif

	ADD1	f20,  ALPHA_I, f5, f20
	LFDUX	f4,    X, INCX
	ADD2	f21,  ALPHA_R, f5, f21
	LFDU	f5,    1 * SIZE(X)
	ADD1	f22,  ALPHA_I, f7, f22
	LFDUX	f6,    X, INCX
	ADD2	f23,  ALPHA_R, f7, f23
	LFDU	f7,    1 * SIZE(X)
#if defined(PPCG4) && defined(DOUBLE)
	dcbtst	Y, PRE
#endif

	/* Store elements 4-7. */
	STFDUX	f16,   YY, INCY
	STFDU	f17,   1 * SIZE(YY)
	STFDUX	f18,   YY, INCY
	STFDU	f19,   1 * SIZE(YY)

	STFDUX	f20,   YY, INCY
	STFDU	f21,   1 * SIZE(YY)
	STFDUX	f22,   YY, INCY
	STFDU	f23,   1 * SIZE(YY)
	bdnz	LL(110)
	.align 4

	/*
	 * Drain: the last group of eight.  The first four elements are already
	 * in registers; only the remaining four are loaded here, so nothing is
	 * read past the end of the unrolled range.
	 */
LL(120):
	FMADD	f16,  ALPHA_R, f0, f8
	LFDUX	f8,    Y, INCY
	FMADD	f17,  ALPHA_I, f0, f9
	LFDU	f9,    1 * SIZE(Y)
	FMADD	f18,  ALPHA_R, f2, f10
	LFDUX	f10,   Y, INCY
	FMADD	f19,  ALPHA_I, f2, f11
	LFDU	f11,   1 * SIZE(Y)

	ADD1	f16,  ALPHA_I, f1, f16
	LFDUX	f0,    X, INCX
	ADD2	f17,  ALPHA_R, f1, f17
	LFDU	f1,    1 * SIZE(X)
	ADD1	f18,  ALPHA_I, f3, f18
	LFDUX	f2,    X, INCX
	ADD2	f19,  ALPHA_R, f3, f19
	LFDU	f3,    1 * SIZE(X)

	FMADD	f20,  ALPHA_R, f4, f12
	LFDUX	f12,   Y, INCY
	FMADD	f21,  ALPHA_I, f4, f13
	LFDU	f13,   1 * SIZE(Y)
	FMADD	f22,  ALPHA_R, f6, f14
	LFDUX	f14,   Y, INCY
	FMADD	f23,  ALPHA_I, f6, f15
	LFDU	f15,   1 * SIZE(Y)

	ADD1	f20,  ALPHA_I, f5, f20
	LFDUX	f4,    X, INCX
	ADD2	f21,  ALPHA_R, f5, f21
	LFDU	f5,    1 * SIZE(X)
	ADD1	f22,  ALPHA_I, f7, f22
	LFDUX	f6,    X, INCX
	ADD2	f23,  ALPHA_R, f7, f23
	LFDU	f7,    1 * SIZE(X)

	/* Each store is issued before its source register is overwritten. */
	STFDUX	f16,   YY, INCY
	FMADD	f16,  ALPHA_R, f0, f8
	STFDU	f17,   1 * SIZE(YY)
	FMADD	f17,  ALPHA_I, f0, f9
	STFDUX	f18,   YY, INCY
	FMADD	f18,  ALPHA_R, f2, f10
	STFDU	f19,   1 * SIZE(YY)
	FMADD	f19,  ALPHA_I, f2, f11

	ADD1	f16,  ALPHA_I, f1, f16
	ADD2	f17,  ALPHA_R, f1, f17
	ADD1	f18,  ALPHA_I, f3, f18
	ADD2	f19,  ALPHA_R, f3, f19

	STFDUX	f20,   YY, INCY
	FMADD	f20,  ALPHA_R, f4, f12
	STFDU	f21,   1 * SIZE(YY)
	FMADD	f21,  ALPHA_I, f4, f13
	STFDUX	f22,   YY, INCY
	FMADD	f22,  ALPHA_R, f6, f14
	STFDU	f23,   1 * SIZE(YY)
	FMADD	f23,  ALPHA_I, f6, f15

	ADD1	f20,  ALPHA_I, f5, f20
	STFDUX	f16,   YY, INCY
	ADD2	f21,  ALPHA_R, f5, f21
	STFDU	f17,   1 * SIZE(YY)
	ADD1	f22,  ALPHA_I, f7, f22
	STFDUX	f18,   YY, INCY
	ADD2	f23,  ALPHA_R, f7, f23
	STFDU	f19,   1 * SIZE(YY)

	STFDUX	f20,   YY, INCY
	STFDU	f21,   1 * SIZE(YY)
	STFDUX	f22,   YY, INCY
	STFDU	f23,   1 * SIZE(YY)
	.align 4

	/* Remainder: CTR = N mod 8; leave when it is zero. */
LL(150):
	andi.	r0,  N, 7
	mtspr	CTR, r0
	ble	LL(999)
	.align 4

	/* One element per pass: load x and y, update, store through YY. */
LL(160):
	LFDUX	f0,   X, INCX
	LFDU	f1,   1 * SIZE(X)
	LFDUX	f8,   Y, INCY
	LFDU	f9,   1 * SIZE(Y)

	FMADD	f16,  ALPHA_R, f0, f8
	FMADD	f17,  ALPHA_I, f0, f9

	ADD1	f16,  ALPHA_I, f1, f16
	ADD2	f17,  ALPHA_R, f1, f17

	STFDUX	f16,  YY, INCY
	STFDU	f17,  1 * SIZE(YY)
	bdnz	LL(160)
	.align 4

	/* Common exit: restore f14-f25, release the save area and return. */
LL(999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	addi	SP, SP, STACKSIZE
	li	r0, 0
	blr
	EPILOGUE
