1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Integer registers that are read as inputs by the kernel body:          */
/* element count, address of the first element, and stride.               */
42#define N	r3
43#define X	r4
44#define INCX	r5
45
/* Scratch integer registers: PREA is the offset handed to dcbt in the    */
/* contiguous main loop; INCXM1 is set to INCX - SIZE for the strided path. */
46#define PREA	r8
47#define	INCXM1	r9
48
/* f1 is loaded with 0.0 on entry and is also where the final result is left. */
49#define FZERO	f1
50
/* Bytes reserved below the incoming SP: offsets 0-136 hold the saved     */
/* f14-f31, offset 144 holds the zero word used to build 0.0.             */
51#define STACKSIZE 160
52
/*
 * Kernel body: scans N two-component (real, imaginary) elements starting
 * at X and leaves in f1 the smallest value of |re| + |im| that was found.
 *
 * Inputs (register aliases are the #defines at the top of the file):
 *   N    - element count; loaded through the pointer when F_INTERFACE is set
 *   X    - address of the first element
 *   INCX - stride; loaded through the pointer when F_INTERFACE is set
 * Result:
 *   f1   - min(|re| + |im|), or 0.0 when N <= 0 or INCX <= 0
 *          (presumably the floating-point return register - confirm
 *          against the target ABI)
 *
 * Selection idiom used throughout:
 *     fsub d, acc, cand ; fsel acc, d, cand, acc
 * fsel picks cand when d >= 0 (cand <= acc) and acc otherwise, so the
 * pair computes acc = min(acc, cand) without a branch.
 *
 * f0-f3 are four independent running minima, merged at LL(999).
 * f8-f15 hold absolute values, f4-f7 / f20-f23 hold |re| + |im| sums,
 * f16-f19 hold the differences fed to fsel, f24-f31 hold raw loads.
 * f14-f31 are saved on entry and restored on exit.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFD, LFDX, LFDUX, LDINT, SIZE and
 * ZBASE_SHIFT come from common.h and are not visible in this file.
 */
53	PROLOGUE
54	PROFCODE
55
/* Open a STACKSIZE-byte frame; r0 = 0 is stored below to build 0.0. */
56	addi	SP, SP, -STACKSIZE
57	li	r0,   0
58
/* Save f14-f31 at frame offsets 0..136. */
59	stfd	f14,    0(SP)
60	stfd	f15,    8(SP)
61	stfd	f16,   16(SP)
62	stfd	f17,   24(SP)
63
64	stfd	f18,   32(SP)
65	stfd	f19,   40(SP)
66	stfd	f20,   48(SP)
67	stfd	f21,   56(SP)
68
69	stfd	f22,   64(SP)
70	stfd	f23,   72(SP)
71	stfd	f24,   80(SP)
72	stfd	f25,   88(SP)
73
74	stfd	f26,   96(SP)
75	stfd	f27,  104(SP)
76	stfd	f28,  112(SP)
77	stfd	f29,  120(SP)
78
79	stfd	f30,  128(SP)
80	stfd	f31,  136(SP)
81
/* Store an all-zero word and load it back as a single-precision value: */
/* FZERO (f1) = 0.0, which is the result if an early exit is taken.     */
82	stw	r0,   144(SP)
83	lfs	FZERO,144(SP)
84
/* With F_INTERFACE, N and INCX hold addresses; replace them with the   */
/* values stored there (LDINT is defined in common.h).                  */
85#ifdef F_INTERFACE
86	LDINT	N,    0(N)
87	LDINT	INCX, 0(INCX)
88#endif
89
/* Scale INCX by ZBASE_SHIFT (presumably element stride -> byte stride; */
/* confirm in common.h).  INCXM1 = INCX - SIZE is the index used to     */
/* reach the first component of an element in the strided path.         */
90	slwi	INCX, INCX, ZBASE_SHIFT
91	subi	INCXM1, INCX, SIZE
92
/* Offset passed to dcbt (cache touch hint) in the contiguous loop. */
93	li	PREA, 10 * 16 * SIZE
94
/* N <= 0 or INCX <= 0: nothing to scan, exit with f1 = 0.0. */
95	cmpwi	cr0, N, 0
96	ble-	LL(9999)
97	cmpwi	cr0, INCX, 0
98	ble-	LL(9999)
99
/* Seed with the first element: f1 = |re| + |im|; X moves to element 1. */
100	LFD	f1, 0 * SIZE(X)
101	LFD	f2, 1 * SIZE(X)
102	add	X, X, INCX
103
104	fabs	f1, f1
105	fabs	f2, f2
106	fadd	f1, f1, f2
107
/* Replicate the seed into the other three running minima. */
108	fmr	f0, f1
109	fmr	f2, f1
110	fmr	f3, f1
111
/* N now counts the elements still to be examined. */
112	subi	N, N, 1
113
/* A byte stride other than 2 * SIZE (contiguous) takes the strided path. */
114	cmpwi	cr0, INCX, 2 * SIZE
115	bne-	cr0, LL(100)
116
/* Contiguous path.  CTR = N >> 3 blocks of 8 elements; none -> tail. */
117	srawi.	r0, N, 3
118	mtspr	CTR, r0
119	beq-	cr0, LL(50)
120	.align 4
121
/* Pipeline prologue: load the first 4 elements of the block (8 values) */
/* and take their absolute values into f8-f15.                          */
122	LFD	f24,   0 * SIZE(X)
123	LFD	f25,   1 * SIZE(X)
124
125	fabs	f8,  f24
126	LFD	f26,   2 * SIZE(X)
127	fabs	f9,  f25
128	LFD	f27,   3 * SIZE(X)
129	fabs	f10, f26
130	LFD	f28,   4 * SIZE(X)
131	fabs	f11, f27
132	LFD	f29,   5 * SIZE(X)
133	fabs	f12, f28
134	LFD	f30,   6 * SIZE(X)
135	fabs	f13, f29
136	LFD	f31,   7 * SIZE(X)
137	fabs	f14, f30
138	nop
139	fabs	f15, f31
/* bdz decrements CTR; if this was the only block, go drain it at LL(20). */
140	bdz	LL(20)
141	.align 4
142
/* Steady state, 8 elements per iteration: fold the 4 elements already  */
/* in f8-f15, load and fold elements 4-7, and preload elements 0-3 of   */
/* the next block (offsets 16-23) before X advances by 16 * SIZE.       */
143LL(10):
/* f4-f7 = |re| + |im| of elements 0-3; start loading elements 4-7. */
144	fadd	f4,  f8,  f9
145	dcbt	X, PREA
146	fadd	f5,  f10, f11
147	nop
148	fadd	f6,  f12, f13
149	LFD	f24,   8 * SIZE(X)
150	fadd	f7,  f14, f15
151	LFD	f25,   9 * SIZE(X)
152
153	fabs	f8,  f24
154	LFD	f26,  10 * SIZE(X)
155	fabs	f9,  f25
156	LFD	f27,  11 * SIZE(X)
157	fabs	f10, f26
158	fabs	f11, f27
159
/* Differences acc - cand for elements 0-3. */
160	fsub	f16, f0,  f4
161	fsub	f17, f1,  f5
162	fsub	f18, f2,  f6
163	LFD	f28,  12 * SIZE(X)
164	fsub	f19, f3,  f7
165	LFD	f29,  13 * SIZE(X)
166
167	fabs	f12, f28
168	LFD	f30,  14 * SIZE(X)
169	fabs	f13, f29
170	LFD	f31,  15 * SIZE(X)
171	fabs	f14, f30
172	fabs	f15, f31
173
/* f0-f3 = min(f0-f3, f4-f7). */
174	fsel	f0,  f16, f4, f0
175	fsel	f1,  f17, f5, f1
176	fsel	f2,  f18, f6, f2
177	fsel	f3,  f19, f7, f3
178
/* f20-f23 = |re| + |im| of elements 4-7; begin preloading next block. */
179	fadd	f20, f8,  f9
180	fadd	f21, f10, f11
181	fadd	f22, f12, f13
182	LFD	f24,  16 * SIZE(X)
183	fadd	f23, f14, f15
184	LFD	f25,  17 * SIZE(X)
185
186	fabs	f8,  f24
187	LFD	f26,  18 * SIZE(X)
188	fabs	f9,  f25
189	LFD	f27,  19 * SIZE(X)
190	fabs	f10, f26
191	fabs	f11, f27
192
/* Differences acc - cand for elements 4-7. */
193	fsub	f16, f0,  f20
194	fsub	f17, f1,  f21
195	fsub	f18, f2,  f22
196	LFD	f28,  20 * SIZE(X)
197	fsub	f19, f3,  f23
198	LFD	f29,  21 * SIZE(X)
199
200	fabs	f12, f28
201	LFD	f30,  22 * SIZE(X)
202	fabs	f13, f29
203	LFD	f31,  23 * SIZE(X)
204	fabs	f14, f30
/* Advance to the next block; f8-f15 already hold its first 4 elements. */
205	addi	X, X, 16 * SIZE
206	fabs	f15, f31
207
/* f0-f3 = min(f0-f3, f20-f23). */
208	fsel	f0,  f16, f20, f0
209	fsel	f1,  f17, f21, f1
210	fsel	f2,  f18, f22, f2
211	fsel	f3,  f19, f23, f3
212
213	bdnz	LL(10)
214	.align 4
215
/* Pipeline drain: finish the last block of 8.  Same work as LL(10) but */
/* nothing beyond offset 15 * SIZE is loaded.                           */
216LL(20):
217	fadd	f4,  f8,  f9
218	fadd	f5,  f10, f11
219	fadd	f6,  f12, f13
220	LFD	f24,   8 * SIZE(X)
221	fadd	f7,  f14, f15
222	LFD	f25,   9 * SIZE(X)
223
224	fabs	f8,  f24
225	LFD	f26,  10 * SIZE(X)
226	fabs	f9,  f25
227	LFD	f27,  11 * SIZE(X)
228	fabs	f10, f26
229	fabs	f11, f27
230
231	fsub	f16, f0,  f4
232	fsub	f17, f1,  f5
233	fsub	f18, f2,  f6
234	LFD	f28,  12 * SIZE(X)
235	fsub	f19, f3,  f7
236	LFD	f29,  13 * SIZE(X)
237
238	fabs	f12, f28
239	LFD	f30,  14 * SIZE(X)
240	fabs	f13, f29
241	LFD	f31,  15 * SIZE(X)
242	fabs	f14, f30
243	fabs	f15, f31
244
245	fsel	f0,  f16, f4, f0
246	fsel	f1,  f17, f5, f1
247	fsel	f2,  f18, f6, f2
248	fsel	f3,  f19, f7, f3
249
250	fadd	f20, f8,  f9
251	fadd	f21, f10, f11
252	fadd	f22, f12, f13
253	fadd	f23, f14, f15
254
255	fsub	f16, f0,  f20
256	fsub	f17, f1,  f21
257	fsub	f18, f2,  f22
258	fsub	f19, f3,  f23
259
260	fsel	f0,  f16, f20, f0
261	fsel	f1,  f17, f21, f1
262	fsel	f2,  f18, f22, f2
263	fsel	f3,  f19, f23, f3
/* Step X past the block just finished so the tail starts after it. */
264	addi	X, X, 16 * SIZE
265
266	.align 4
267
/* Contiguous tail: the remaining N & 7 elements, one per iteration. */
268LL(50):
269	andi.	r0,  N, 7
270	mtspr	CTR, r0
271	beq	LL(999)
272	.align 4
273
/* Only f1 is updated here: f1 = min(f1, |re| + |im|). */
274LL(60):
275	LFD	f8,  0 * SIZE(X)
276	LFD	f9,  1 * SIZE(X)
277	addi	X, X,  2 * SIZE
278
279	fabs	f8, f8
280	fabs	f9, f9
281	fadd	f8, f8, f9
282	fsub	f16, f1, f8
283	fsel	f1, f16, f8, f1
284	bdnz	LL(60)
285	b	LL(999)
286	.align 4
287
/* Strided path.  X is biased by -INCXM1 so every element is read as a  */
/* pair: LFDX at X + INCXM1 (first component), then LFDUX at X + INCX   */
/* (second component).  LFDUX is presumably the update form that also   */
/* leaves X advanced by INCX - confirm the macro in common.h.           */
288LL(100):
289	sub	X, X, INCXM1
290
/* CTR = N >> 3 blocks of 8 elements; none -> strided tail. */
291	srawi.	r0, N, 3
292	mtspr	CTR,  r0
293	beq-	LL(150)
294
/* Pipeline prologue: load elements 0-3 of the block ... */
295	LFDX	f24,   X, INCXM1
296	LFDUX	f25,   X, INCX
297	LFDX	f26,   X, INCXM1
298	LFDUX	f27,   X, INCX
299	LFDX	f28,   X, INCXM1
300	LFDUX	f29,   X, INCX
301	LFDX	f30,   X, INCXM1
302	LFDUX	f31,   X, INCX
303
/* ... take their absolute values into f8-f15 ... */
304	fabs	f8,  f24
305	fabs	f9,  f25
306	fabs	f10, f26
307	fabs	f11, f27
308	fabs	f12, f28
309	fabs	f13, f29
310	fabs	f14, f30
311	fabs	f15, f31
312
/* ... and load elements 4-7 raw into f24-f31. */
313	LFDX	f24,   X, INCXM1
314	LFDUX	f25,   X, INCX
315	LFDX	f26,   X, INCXM1
316	LFDUX	f27,   X, INCX
317	LFDX	f28,   X, INCXM1
318	LFDUX	f29,   X, INCX
319	LFDX	f30,   X, INCXM1
320	LFDUX	f31,   X, INCX
321
/* bdz decrements CTR; if this was the only block, drain it at LL(120). */
322	bdz	LL(120)
323	.align 4
324
/* Steady state, 8 elements per iteration.  On entry f8-f15 hold the    */
/* absolute values of elements 0-3 and f24-f31 the raw elements 4-7;    */
/* the loop re-establishes the same state for the next block.           */
325LL(110):
/* f4-f7 = |re| + |im| of elements 0-3. */
326	fadd	f4,  f8,  f9
327	fadd	f5,  f10, f11
328	fadd	f6,  f12, f13
329	fadd	f7,  f14, f15
330
/* Absolute values of elements 4-7, interleaved with the loads of the   */
/* next block's elements 0-3.                                           */
331	fabs	f8,  f24
332	fabs	f9,  f25
333	fabs	f10, f26
334	fabs	f11, f27
335
336	LFDX	f24,   X, INCXM1
337	LFDUX	f25,   X, INCX
338	LFDX	f26,   X, INCXM1
339	LFDUX	f27,   X, INCX
340
341	fabs	f12, f28
342	fabs	f13, f29
343	fabs	f14, f30
344	fabs	f15, f31
345
346	LFDX	f28,   X, INCXM1
347	LFDUX	f29,   X, INCX
348	LFDX	f30,   X, INCXM1
349	LFDUX	f31,   X, INCX
350
/* Differences acc - cand for elements 0-3. */
351	fsub	f16, f0,  f4
352	fsub	f17, f1,  f5
353	fsub	f18, f2,  f6
354	fsub	f19, f3,  f7
355
/* f20-f23 = |re| + |im| of elements 4-7. */
356	fadd	f20, f8,  f9
357	fadd	f21, f10, f11
358	fadd	f22, f12, f13
359	fadd	f23, f14, f15
360
/* Absolute values of the next block's elements 0-3, interleaved with   */
/* the loads of its elements 4-7.                                       */
361	fabs	f8,  f24
362	fabs	f9,  f25
363	fabs	f10, f26
364	fabs	f11, f27
365
366	LFDX	f24,   X, INCXM1
367	LFDUX	f25,   X, INCX
368	LFDX	f26,   X, INCXM1
369	LFDUX	f27,   X, INCX
370
/* f0-f3 = min(f0-f3, f4-f7). */
371	fsel	f0,  f16, f4, f0
372	fsel	f1,  f17, f5, f1
373	fsel	f2,  f18, f6, f2
374	fsel	f3,  f19, f7, f3
375
376	fabs	f12, f28
377	fabs	f13, f29
378	fabs	f14, f30
379	fabs	f15, f31
380
381	LFDX	f28,   X, INCXM1
382	LFDUX	f29,   X, INCX
383	LFDX	f30,   X, INCXM1
384	LFDUX	f31,   X, INCX
385
/* Differences acc - cand for elements 4-7, then f0-f3 = min(f0-f3, f20-f23). */
386	fsub	f16, f0,  f20
387	fsub	f17, f1,  f21
388	fsub	f18, f2,  f22
389	fsub	f19, f3,  f23
390
391	fsel	f0,  f16, f20, f0
392	fsel	f1,  f17, f21, f1
393	fsel	f2,  f18, f22, f2
394	fsel	f3,  f19, f23, f3
395	bdnz	LL(110)
396	.align 4
397
/* Pipeline drain: fold the 8 elements already held in registers        */
/* (f8-f15 as absolute values, f24-f31 raw) without loading any more.   */
398LL(120):
399	fadd	f4,  f8,  f9
400	fadd	f5,  f10, f11
401	fadd	f6,  f12, f13
402	fadd	f7,  f14, f15
403
404	fabs	f8,  f24
405	fabs	f9,  f25
406	fabs	f10, f26
407	fabs	f11, f27
408
409	fabs	f12, f28
410	fabs	f13, f29
411	fabs	f14, f30
412	fabs	f15, f31
413
414	fsub	f16, f0,  f4
415	fsub	f17, f1,  f5
416	fsub	f18, f2,  f6
417	fsub	f19, f3,  f7
418
419	fadd	f20, f8,  f9
420	fadd	f21, f10, f11
421	fadd	f22, f12, f13
422	fadd	f23, f14, f15
423
424	fsel	f0,  f16, f4, f0
425	fsel	f1,  f17, f5, f1
426	fsel	f2,  f18, f6, f2
427	fsel	f3,  f19, f7, f3
428
429	fsub	f16, f0,  f20
430	fsub	f17, f1,  f21
431	fsub	f18, f2,  f22
432	fsub	f19, f3,  f23
433
434	fsel	f0,  f16, f20, f0
435	fsel	f1,  f17, f21, f1
436	fsel	f2,  f18, f22, f2
437	fsel	f3,  f19, f23, f3
438	.align 4
439
/* Strided tail: the remaining N & 7 elements, one per iteration. */
440LL(150):
441	andi.	r0,  N, 7
442	mtspr	CTR, r0
443	beq	LL(999)
444	.align 4
445
/* Only f1 is updated here: f1 = min(f1, |re| + |im|).  Falls through   */
/* to LL(999) when CTR reaches zero.                                    */
446LL(160):
447	LFDX	f8,    X, INCXM1
448	LFDUX	f9,    X, INCX
449
450	fabs	f8, f8
451	fabs	f9, f9
452	fadd	f8, f8, f9
453	fsub	f16, f1, f8
454	fsel	f1, f16, f8, f1
455	bdnz	LL(160)
456	.align 4
457
/* Merge the four running minima:                                       */
/*   f0 = min(f0, f1), f2 = min(f2, f3), then f1 = min(f0, f2).         */
458LL(999):
459	fsub	f8,  f0,  f1
460	fsub	f9,  f2,  f3
461
462	fsel	f0,  f8,  f1, f0
463	fsel	f2,  f9,  f3, f2
464	fsub	f8,  f0,  f2
465	fsel	f1,  f8,  f2, f0
466	.align 4
467
/* Common exit (also the early-exit target, where f1 is still 0.0):     */
/* restore f14-f31, release the frame and return.                       */
468LL(9999):
469	lfd	f14,    0(SP)
470	lfd	f15,    8(SP)
471	lfd	f16,   16(SP)
472	lfd	f17,   24(SP)
473
474	lfd	f18,   32(SP)
475	lfd	f19,   40(SP)
476	lfd	f20,   48(SP)
477	lfd	f21,   56(SP)
478
479	lfd	f22,   64(SP)
480	lfd	f23,   72(SP)
481	lfd	f24,   80(SP)
482	lfd	f25,   88(SP)
483
484	lfd	f26,   96(SP)
485	lfd	f27,  104(SP)
486	lfd	f28,  112(SP)
487	lfd	f29,  120(SP)
488
489	lfd	f30,  128(SP)
490	lfd	f31,  136(SP)
491
492	addi	SP, SP, STACKSIZE
493	blr
494
495	EPILOGUE
496