/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Incoming arguments: element count, vector pointer, element stride.  */
#define N	r3
#define X	r4
#define INCX	r5

/* Copies of N and X taken before the first sweep so that the vector   */
/* can be walked a second time from its start.                         */
#define NN	r6
#define XX	r7

/* Byte offset handed to dcbt (prefetch distance), PPCG4 builds only.  */
#define PRE	r8

/* Stack frame layout (offsets from SP after the frame is allocated):  */
/*     0 .. 143   save area for f14 - f31 (18 doubles)                 */
/*   144          FZERO  4-byte single-precision constant slot         */
/*   148          FONE   4-byte single-precision constant slot         */
/*   152 .. 159   FMAX   8-byte slot, written as words at +0 and +4    */
/*   160          C1     4-byte single-precision constant slot         */
/*   164          C2     4-byte single-precision constant slot         */
/* FMAX occupies eight bytes, so C1 and C2 must start at 160; placing  */
/* C1 at 156 would make it share storage with the low word of FMAX.    */
#define FZERO	144(SP)
#define FONE	148(SP)
#define FMAX	152(SP)
#define C1	160(SP)
#define C2	164(SP)

#define STACKSIZE 168

/*
 * Scaled Euclidean norm of a strided vector.
 *
 * In:   N    (r3)  number of elements
 *       X    (r4)  address of the first element
 *       INCX (r5)  stride in elements (converted to bytes below)
 *       When F_INTERFACE is defined, N and INCX hold addresses and the
 *       values are fetched through them.
 * Out:  f1   result
 *
 * Method:
 *   sweep 1:  m = max |x[i]|                     (LL(10) .. LL(99))
 *   sweep 2:  s = sum (x[i] * (1/m))^2           (LL(110) .. LL(170))
 *   finish :  result = m * sqrt(s), where the square root is built
 *             from frsqrte plus Newton-Raphson refinement.
 *
 * Early exits (all through LL(999)):
 *   N <= 0 or INCX <= 0  ->  0
 *   N == 1               ->  |x[0]|
 *   m == 0               ->  0
 *
 * Uses f0 - f31, r0, r10, r11, CTR and cr0 in addition to the named
 * registers.  f14 - f31 are saved on entry and restored on exit.
 */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE

	/* Bit patterns of the single-precision constants.  r6 and r7 are   */
	/* free here: they only become NN / XX after the stores below.      */
	li	r10,   0		/* 0x00000000 = 0.0f */
	lis	r11,   0x3f80		/* 0x3f800000 = 1.0f */
	lis	r6,    0x3f00		/* 0x3f000000 = 0.5f */
	lis	r7,    0x4040		/* 0x40400000 = 3.0f */

	/* Save f14 - f31; every one of them is overwritten below. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Move the constants through memory so that lfs can pick them up. */
	stw	r10,  FZERO
	stw	r11,  FONE
	stw	r6,   C1
	stw	r7,   C2

	/* Result for the early exits on N <= 0 / INCX <= 0. */
	lfs	f1,   FZERO

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Stride in bytes; X is biased back by one stride because every    */
	/* LFDUX below is used as "advance by INCX, then load".             */
	/* NOTE(review): LFDUX comes from common.h - confirm it is the      */
	/* update-form indexed load.                                        */
	slwi	INCX, INCX, BASE_SHIFT
	sub	X, X, INCX

	li	PRE, 3 * 16 * SIZE

	cmpwi	cr0, N, 0
	ble-	LL(999)
	cmpwi	cr0, INCX, 0
	ble-	LL(999)

	/* Remember count and (biased) start address for the second sweep. */
	mr	NN, N
	mr	XX, X

	/* ---------------- sweep 1: largest magnitude ------------------- */
	/* Seed all eight running maxima f0 - f7 with |x[0]|.               */
	LFDUX	f1, X, INCX

	fabs	f0, f1
	fabs	f2, f1
	fabs	f3, f1
	fabs	f4, f1
	fabs	f5, f1
	fabs	f6, f1
	fabs	f7, f1
	fabs	f1, f1
	subi	N, N, 1

	/* Single element: f1 already holds |x[0]|. */
	cmpwi	cr0, N, 0
	ble-	LL(999)

	/* CTR = number of complete 16-element groups among the rest. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(50)

	/* Software pipeline: f24 - f31 hold raw loads, f8 - f15 hold their */
	/* magnitudes, f16 - f23 hold (max - candidate).  Sixteen elements  */
	/* are in flight before the loop body is entered.                   */
	LFDUX	f24,   X, INCX
	LFDUX	f25,   X, INCX
	LFDUX	f26,   X, INCX
	LFDUX	f27,   X, INCX
	LFDUX	f28,   X, INCX
	LFDUX	f29,   X, INCX
	LFDUX	f30,   X, INCX
	LFDUX	f31,   X, INCX

	fabs	f8,  f24
	LFDUX	f24,   X, INCX
	fabs	f9,  f25
	LFDUX	f25,   X, INCX
	fabs	f10, f26
	LFDUX	f26,   X, INCX
	fabs	f11, f27
	LFDUX	f27,   X, INCX

	fabs	f12, f28
	LFDUX	f28,   X, INCX
	fabs	f13, f29
	LFDUX	f29,   X, INCX
	fabs	f14, f30
	LFDUX	f30,   X, INCX
	fabs	f15, f31
	LFDUX	f31,   X, INCX
	bdz	LL(20)
	.align 4

/* Steady state: each trip retires 16 magnitudes and loads 16 more.    */
/* fsel keeps the running maximum when (max - candidate) >= 0 and      */
/* takes the candidate otherwise.                                      */
LL(10):
	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f8,  f24
	LFDUX	f24,   X, INCX
	fsel	f1,  f17, f1,  f9
	fabs	f9,  f25
	LFDUX	f25,   X, INCX
	fsel	f2,  f18, f2,  f10
	fabs	f10, f26
	LFDUX	f26,   X, INCX
	fsel	f3,  f19, f3,  f11
	fabs	f11, f27
	LFDUX	f27,   X, INCX

	fsel	f4,  f20, f4,  f12
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f12, f28
	LFDUX	f28,   X, INCX
	fsel	f5,  f21, f5,  f13
	fabs	f13, f29
	LFDUX	f29,   X, INCX
	fsel	f6,  f22, f6,  f14
	fabs	f14, f30
	LFDUX	f30,   X, INCX
	fsel	f7,  f23, f7,  f15
	fabs	f15, f31
	LFDUX	f31,   X, INCX

	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f8,  f24
	LFDUX	f24,   X, INCX
	fsel	f1,  f17, f1,  f9
	fabs	f9,  f25
	LFDUX	f25,   X, INCX
	fsel	f2,  f18, f2,  f10
	fabs	f10, f26
	LFDUX	f26,   X, INCX
	fsel	f3,  f19, f3,  f11
	fabs	f11, f27
	LFDUX	f27,   X, INCX

	fsel	f4,  f20, f4,  f12
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f12, f28
	LFDUX	f28,   X, INCX
	fsel	f5,  f21, f5,  f13
	fabs	f13, f29
	LFDUX	f29,   X, INCX
	fsel	f6,  f22, f6,  f14
	fabs	f14, f30
	LFDUX	f30,   X, INCX
	fsel	f7,  f23, f7,  f15
	fabs	f15, f31
	LFDUX	f31,   X, INCX
	bdnz	LL(10)
	.align 4

/* Drain: fold in the 16 elements still in flight, with no new loads. */
LL(20):
	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
	fabs	f8,  f24
	fsel	f1,  f17, f1,  f9
	fabs	f9,  f25
	fsel	f2,  f18, f2,  f10
	fabs	f10, f26
	fsel	f3,  f19, f3,  f11
	fabs	f11, f27

	fsel	f4,  f20, f4,  f12
	fabs	f12, f28
	fsel	f5,  f21, f5,  f13
	fabs	f13, f29
	fsel	f6,  f22, f6,  f14
	fabs	f14, f30
	fsel	f7,  f23, f7,  f15
	fabs	f15, f31

	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
	fsel	f1,  f17, f1,  f9
	fsel	f2,  f18, f2,  f10
	fsel	f3,  f19, f3,  f11
	fsel	f4,  f20, f4,  f12
	fsel	f5,  f21, f5,  f13
	fsel	f6,  f22, f6,  f14
	fsel	f7,  f23, f7,  f15
	.align 4

/* Remainder of sweep 1: the (N mod 16) leftover elements, one per     */
/* trip, folded into f1 only.                                          */
LL(50):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(99)
	.align 4

LL(60):
	LFDUX	f8,    X, INCX
	fabs	f8, f8
	fsub	f16, f1, f8
	fsel	f1, f16, f1, f8
	bdnz	LL(60)
	.align 4

/* Reduce the eight partial maxima pairwise; the overall maximum ends  */
/* up in f31.                                                          */
LL(99):
	fsub	f8,  f0,  f1
	fsub	f9,  f2,  f3
	fsub	f10, f4,  f5
	fsub	f11, f6,  f7

	fsel	f0,  f8,  f0,  f1
	fsel	f2,  f9,  f2,  f3
	fsel	f4,  f10, f4,  f5
	fsel	f6,  f11, f6,  f7

	fsub	f8,  f0,  f2
	fsub	f9,  f4,  f6
	fsel	f0,  f8,  f0,  f2
	fsel	f4,  f9,  f4,  f6

	fsub	f8,  f0,  f4
	fsel	f31, f8,  f0,  f4

	lfs	f1,  FZERO
	lfs	f0,  FONE

	/* Maximum is zero: leave with f1 = 0 and skip the division. */
	fcmpu	cr0, f1, f31
	beq-	cr0, LL(999)

	/* ---------------- sweep 2: scaled sum of squares --------------- */
	/* f30 = 1 / max is the scale factor applied to every element.      */
	fdiv	f30, f0, f31

	/* Clear the eight partial sums f0 - f7 (f1 is already zero). */
	fmr	f0, f1
	fmr	f2, f1
	fmr	f3, f1
	fmr	f4, f1
	fmr	f5, f1
	fmr	f6, f1
	fmr	f7, f1

	/* This sweep covers all NN elements, starting again from XX. */
	srawi.	r0, NN, 4
	mtspr	CTR, r0
	beq-	cr0, LL(150)

	/* Pipeline fill: f8 - f15 hold raw loads, f16 - f23 scaled values. */
	LFDUX	f8,   XX, INCX
	LFDUX	f9,   XX, INCX
	LFDUX	f10,  XX, INCX
	LFDUX	f11,  XX, INCX
	LFDUX	f12,  XX, INCX
	LFDUX	f13,  XX, INCX
	LFDUX	f14,  XX, INCX
	LFDUX	f15,  XX, INCX

	fmul	f16, f30, f8
	LFDUX	f8,  XX, INCX
	fmul	f17, f30, f9
	LFDUX	f9,  XX, INCX
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmul	f19, f30, f11
	LFDUX	f11, XX, INCX

	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmul	f21, f30, f13
	LFDUX	f13, XX, INCX
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmul	f23, f30, f15
	LFDUX	f15, XX, INCX
	bdz	LL(120)
	.align 4

/* Steady state: each trip accumulates 16 squares and loads 16 more. */
LL(110):
	fmadd	f0,  f16, f16, f0
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f16, f30, f8
	LFDUX	f8,   XX, INCX
	fmadd	f1,  f17, f17, f1
	fmul	f17, f30, f9
	LFDUX	f9,   XX, INCX
	fmadd	f2,  f18, f18, f2
	fmul	f18, f30, f10
	LFDUX	f10,  XX, INCX
	fmadd	f3,  f19, f19, f3
	fmul	f19, f30, f11
	LFDUX	f11,  XX, INCX

	fmadd	f4,  f20, f20, f4
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f20, f30, f12
	LFDUX	f12,  XX, INCX
	fmadd	f5,  f21, f21, f5
	fmul	f21, f30, f13
	LFDUX	f13,  XX, INCX
	fmadd	f6,  f22, f22, f6
	fmul	f22, f30, f14
	LFDUX	f14,  XX, INCX
	fmadd	f7,  f23, f23, f7
	fmul	f23, f30, f15
	LFDUX	f15,  XX, INCX

	fmadd	f0,  f16, f16, f0
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f16, f30, f8
	LFDUX	f8,   XX, INCX
	fmadd	f1,  f17, f17, f1
	fmul	f17, f30, f9
	LFDUX	f9,   XX, INCX
	fmadd	f2,  f18, f18, f2
	fmul	f18, f30, f10
	LFDUX	f10,  XX, INCX
	fmadd	f3,  f19, f19, f3
	fmul	f19, f30, f11
	LFDUX	f11,  XX, INCX

	fmadd	f4,  f20, f20, f4
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f20, f30, f12
	LFDUX	f12,  XX, INCX
	fmadd	f5,  f21, f21, f5
	fmul	f21, f30, f13
	LFDUX	f13,  XX, INCX
	fmadd	f6,  f22, f22, f6
	fmul	f22, f30, f14
	LFDUX	f14,  XX, INCX
	fmadd	f7,  f23, f23, f7
	fmul	f23, f30, f15
	LFDUX	f15,  XX, INCX
	bdnz	LL(110)
	.align 4

/* Drain: accumulate the 16 elements still in flight, no new loads. */
LL(120):
	fmadd	f0,  f16, f16, f0
	fmul	f16, f30, f8
	fmadd	f1,  f17, f17, f1
	fmul	f17, f30, f9
	fmadd	f2,  f18, f18, f2
	fmul	f18, f30, f10
	fmadd	f3,  f19, f19, f3
	fmul	f19, f30, f11

	fmadd	f4,  f20, f20, f4
	fmul	f20, f30, f12
	fmadd	f5,  f21, f21, f5
	fmul	f21, f30, f13
	fmadd	f6,  f22, f22, f6
	fmul	f22, f30, f14
	fmadd	f7,  f23, f23, f7
	fmul	f23, f30, f15

	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3
	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7
	.align 4

/* Remainder of sweep 2: the (NN mod 16) leftover elements, one per    */
/* trip, accumulated into f0 only.                                     */
LL(150):
	andi.	r0,  NN, 15
	mtspr	CTR, r0
	beq-	cr0, LL(170)
	.align 4

LL(160):
	LFDUX	f8,  XX, INCX

	fmul	f16, f30, f8
	fmadd	f0,  f16, f16, f0
	bdnz	LL(160)
	.align 4

/* Combine the eight partial sums into f1 = s, then take the square    */
/* root and undo the scaling.                                          */
LL(170):
	fadd   f0, f0, f1
	fadd   f2, f2, f3
	fadd   f4, f4, f5
	fadd   f6, f6, f7

	fadd   f0, f0, f2
	fadd   f4, f4, f6

	fadd   f1, f0, f4

	/* y = frsqrte(s) is a rough 1/sqrt(s); f8 = 0.5, f9 = 3.0. */
	frsqrte f0, f1
	lfs	f8, C1
	lfs	f9, C2

	/* Three Newton-Raphson steps: y <- 0.5 * y * (3 - s * y * y).      */
	/* The fadd in the first step forms f7 = 0.5 + 0.5 = 1.0, which is  */
	/* needed by the final correction.                                  */
	fmul	f2, f1, f0
	fadd	f7, f8, f8
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	fmul	f2, f1, f0
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	fmul	f2, f1, f0
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	/* r = s * y approximates sqrt(s); one correction step gives        */
	/* r + 0.5 * r * (1 - r * y).  Multiply by the maximum (f31) to     */
	/* undo the scaling of sweep 2.                                     */
	fmul	f5, f1, f0
	fmul	f2, f5, f8
	fnmsub	f3, f5, f0, f7
	fmadd	f1, f2, f3, f5
	fmul    f1,  f31, f1
	.align 4

/* Common exit: restore f14 - f31, release the frame; result is in f1. */
LL(999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	addi	SP, SP, STACKSIZE
	blr
	EPILOGUE