/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Euclidean norm of a strided vector:
 *
 *     result = sqrt(x[0]^2 + x[1]^2 + ... + x[N-1]^2)
 *
 * Inputs (register names as #defined below):
 *   N    (r3)  element count.  With F_INTERFACE it arrives as a pointer
 *              and is fetched through LDINT.
 *   X    (r4)  address of the first element.
 *   INCX (r5)  stride in elements (a pointer under F_INTERFACE); it is
 *              shifted left by BASE_SHIFT before use.
 *
 * Result:
 *   f1         the norm, rounded to single precision.
 *              0.0 when N <= 0 or the scaled INCX <= 0.
 *
 * Method: squares are accumulated with fmadd (the double-precision form)
 * into sixteen independent partial sums f0-f15, sixteen elements per
 * iteration, with the loads for the next iteration issued between the
 * fmadds of the current one.  The partial sums are then added pairwise
 * and the square root is taken.  There is no scaling pass; the routine
 * relies on the accumulators being wider than the data.
 *
 * f14-f31 are saved on entry and restored on exit; r0, r8, r10 and CTR
 * are used as scratch.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL, LFD, LFDUX, LDINT, L1_PREFETCH, SIZE,
 * BASE_SHIFT and SP are not defined in this file; presumably they come
 * from common.h.  NOTE(review): the lfs/frsp usage below assumes LFD and
 * LFDUX expand to single-precision loads in this build - confirm.
 */

#define ASSEMBLER
#include "common.h"

#define N	r3
#define X	r4
#define INCX	r5

#define PREA	r8

/* 4-byte scratch word in the local frame, just above the FPR save area. */
#define FZERO	144(SP)

/* 18 FPR save slots of 8 bytes (0..143), the scratch word, and padding. */
#define STACKSIZE 160

	PROLOGUE
	PROFCODE

	/* Carve out the local frame and prepare an integer zero. */
	addi	SP, SP, -STACKSIZE
	li	r10,   0

	/* Save f14-f31; all of them are overwritten below. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* f1 = 0.0, moved from r10 through memory.  It is both the default */
	/* return value and the seed for every accumulator.                 */
	stw	r10,  FZERO

	lfs	f1,   FZERO

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Element stride -> byte stride. */
	slwi	INCX, INCX, BASE_SHIFT

	/* Prefetch distance handed to L1_PREFETCH in the unit-stride loop. */
	li	PREA, 4 * 16 * SIZE

	/* Nothing to do for an empty vector or a non-positive stride: */
	/* leave with f1 == 0.0.                                       */
	cmpwi	cr0, N, 0
	ble-	LL(9999)
	cmpwi	cr0, INCX, 0
	ble-	LL(9999)

	/* Clear the sixteen partial sums f0-f15 (f1 is already zero). */
	fmr	f0,  f1
	fmr	f2,  f1
	fmr	f3,  f1
	fmr	f4,  f1
	fmr	f5,  f1
	fmr	f6,  f1
	fmr	f7,  f1
	fmr	f8,  f1
	fmr	f9,  f1
	fmr	f10, f1
	fmr	f11, f1
	fmr	f12, f1
	fmr	f13, f1
	fmr	f14, f1
	fmr	f15, f1

	/* Byte stride equal to SIZE takes the path below; any other */
	/* stride goes to LL(1000).                                  */
	cmpwi	cr0, INCX, SIZE
	bne-	cr0, LL(1000)

	/* ---------------- unit stride ---------------- */

	/* CTR = N / 16 full groups; skip to the remainder if there are none. */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(150)

	/* Pipeline prologue: load the first group of sixteen into f16-f31. */
	LFD	f16,  0 * SIZE(X)
	LFD	f17,  1 * SIZE(X)
	LFD	f18,  2 * SIZE(X)
	LFD	f19,  3 * SIZE(X)
	LFD	f20,  4 * SIZE(X)
	LFD	f21,  5 * SIZE(X)
	LFD	f22,  6 * SIZE(X)
	LFD	f23,  7 * SIZE(X)

	LFD	f24,  8 * SIZE(X)
	LFD	f25,  9 * SIZE(X)
	LFD	f26, 10 * SIZE(X)
	LFD	f27, 11 * SIZE(X)
	LFD	f28, 12 * SIZE(X)
	LFD	f29, 13 * SIZE(X)
	LFD	f30, 14 * SIZE(X)
	LFD	f31, 15 * SIZE(X)

	/* Exactly one group: nothing further to load, just drain it. */
	bdz	LL(120)
	.align 4

	/* Steady state: square-accumulate the group already in f16-f31   */
	/* while loading the following group (offsets 16..31 from X) into */
	/* the same registers, four at a time, then step X by one group.  */
LL(110):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	LFD	f16, 16 * SIZE(X)
	LFD	f17, 17 * SIZE(X)
	LFD	f18, 18 * SIZE(X)
	LFD	f19, 19 * SIZE(X)

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	LFD	f20, 20 * SIZE(X)
	LFD	f21, 21 * SIZE(X)
	LFD	f22, 22 * SIZE(X)
	LFD	f23, 23 * SIZE(X)

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	LFD	f24, 24 * SIZE(X)
	LFD	f25, 25 * SIZE(X)
	LFD	f26, 26 * SIZE(X)
	LFD	f27, 27 * SIZE(X)

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15

	LFD	f28, 28 * SIZE(X)
	LFD	f29, 29 * SIZE(X)
	LFD	f30, 30 * SIZE(X)
	LFD	f31, 31 * SIZE(X)

	/* The prefetch is issued before the pointer bump, except on POWER6 */
	/* where it is issued after it.                                     */
#ifndef POWER6
	L1_PREFETCH	X, PREA
#endif
	addi	X, X, 16 * SIZE
#ifdef POWER6
	L1_PREFETCH	X, PREA
#endif

	bdnz	LL(110)
	.align 4

	/* Pipeline epilogue: accumulate the last loaded group and step X */
	/* past it so the remainder loop starts at the right element.     */
LL(120):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3
	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7
	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11
	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	addi	X, X,  16 * SIZE
	.align 4

	/* Remainder: the low four bits of N, one element per iteration, */
	/* all folded into f0.                                           */
LL(150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq-	cr0, LL(170)
	.align 4

LL(160):
	LFD	f16,  0 * SIZE(X)
	addi	X, X,  1 * SIZE
	fmadd	f0,  f16, f16, f0
	bdnz	LL(160)
	.align 4

	/* Pairwise reduction of the sixteen partial sums into f0. */
LL(170):
	fadd   f0,  f0,  f1
	fadd   f2,  f2,  f3
	fadd   f4,  f4,  f5
	fadd   f6,  f6,  f7

	fadd   f8,  f8,  f9
	fadd   f10, f10, f11
	fadd   f12, f12, f13
	fadd   f14, f14, f15

	fadd   f0,  f0,  f2
	fadd   f4,  f4,  f6
	fadd   f8,  f8,  f10
	fadd   f12, f12, f14

	fadd   f0,  f0,  f4
	fadd   f8,  f8,  f12

	fadd   f0,  f0,  f8

	/* The sum was built in double precision and is in general not    */
	/* representable in single format, which the single-precision     */
	/* arithmetic instructions (fsqrts included) require of their     */
	/* operands.  Take the root in double precision and round the     */
	/* result to single explicitly.                                   */
	fsqrt	f1, f0
	frsp	f1, f1
	b	LL(9999)
	.align 4

	/* ---------------- general stride ---------------- */

	/* Every load below goes through LFDUX with INCX as the index, so X */
	/* is first backed up by one stride.  NOTE(review): this assumes    */
	/* LFDUX is a load-with-update that advances X by INCX - confirm    */
	/* against common.h.                                                */
LL(1000):
	sub	X, X, INCX

	/* CTR = N / 16 full groups; skip to the remainder if there are none. */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(1150)

	/* Pipeline prologue: first group of sixteen into f16-f31. */
	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX
	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdz	LL(1120)
	.align 4

	/* Steady state: same interleaving as the unit-stride loop, with */
	/* the pointer advanced by the loads themselves.                 */
LL(1110):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdnz	LL(1110)
	.align 4

	/* Pipeline epilogue: accumulate the last loaded group. */
LL(1120):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	.align 4

	/* Remainder: the low four bits of N, one element per iteration, */
	/* all folded into f0.                                           */
LL(1150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq-	cr0, LL(1170)
	.align 4

LL(1160):
	LFDUX	f16, X, INCX
	fmadd	f0,  f16, f16, f0
	bdnz	LL(1160)
	.align 4

	/* Pairwise reduction of the sixteen partial sums into f0. */
LL(1170):
	fadd   f0,  f0,  f1
	fadd   f2,  f2,  f3
	fadd   f4,  f4,  f5
	fadd   f6,  f6,  f7

	fadd   f8,  f8,  f9
	fadd   f10, f10, f11
	fadd   f12, f12, f13
	fadd   f14, f14, f15

	fadd   f0,  f0,  f2
	fadd   f4,  f4,  f6
	fadd   f8,  f8,  f10
	fadd   f12, f12, f14

	fadd   f0,  f0,  f4
	fadd   f8,  f8,  f12

	fadd   f0,  f0,  f8

	/* Double-precision root, then an explicit round to single; the */
	/* double-precision sum is not a valid fsqrts operand.          */
	fsqrt	f1, f0
	frsp	f1, f1
	.align 4

	/* Common exit: restore f14-f31, release the frame, return f1. */
LL(9999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
