1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Incoming arguments.  With F_INTERFACE defined, N and INCX arrive as    */
/* pointers and are dereferenced by the routine before use.               */
42#define N	r3
43#define X	r4
44#define INCX	r5
45
/* Working registers.  NN and XX hold copies of N and X for the second    */
/* pass over the data, INC1 is the byte offset (SIZE) from the first to   */
/* the second scalar of an element, and PRE is the dcbt prefetch offset.  */
/* r6, r7 and r10 are also used as plain scratch while the routine builds */
/* its constants, before they take on these roles.                        */
46#define NN	r6
47#define XX	r7
48#define INC1	r9
49#define PRE	r10
50
/* Scratch slots in the local frame, placed above the 18 saved FPRs       */
/* (f14-f31 occupy 0..143(SP)).  Each slot is written with stw as a       */
/* 32-bit pattern; FZERO, FONE, C1 and C2 are read back with lfs.         */
/* FMAX is only ever written in this file, and the routine's store to     */
/* "4 + FMAX" resolves to 156(SP), i.e. the same word as C1.              */
51#define FZERO	144(SP)
52#define FONE	148(SP)
53#define FMAX	152(SP)
54#define C1	156(SP)
55#define C2	160(SP)
56
/* Bytes reserved on the stack by the routine (addi SP, SP, -STACKSIZE).  */
57#define STACKSIZE 168
58
/*
 * Scaled Euclidean norm of a strided vector whose elements are pairs of
 * scalars (first scalar at the element address, second at +SIZE).
 *
 * Inputs : N (r3) element count, X (r4) base address, INCX (r5) stride in
 *          elements.  With F_INTERFACE, N and INCX are pointers.
 * Output : f1.  0.0 is returned when N <= 0, when the byte stride is <= 0,
 *          or when the largest absolute component is 0.
 *
 * Method (two passes over the data):
 *   pass 1  m = largest absolute value of any component   LL(10) .. LL(99)
 *   pass 2  s = sum of squares of (component * (1/m))     LL(110) .. LL(170)
 *   result  m * sqrt(s), sqrt built from frsqrte plus Newton refinement
 *
 * f14-f31 are saved on entry and restored at LL(999).  Integer work uses
 * only r0, r3-r7 and r9-r12.
 *
 * NOTE(review): the exported symbol comes from PROLOGUE in common.h, which
 * is not visible here.  This looks like a BLAS complex nrm2 kernel; confirm
 * against the build.  LDINT, LFDUX, LFDX, SIZE and ZBASE_SHIFT also come
 * from common.h.
 */
59	PROLOGUE
60	PROFCODE
61
/* Allocate the frame and build 32-bit constant patterns in scratch GPRs:  */
/* r10 = 0, r11 = 0x3f800000 (1.0f), r12 = 0x5fe00000,                     */
/* r6 = 0x3f000000 (0.5f), r7 = 0x40400000 (3.0f).                         */
62	addi	SP, SP, -STACKSIZE
63	li	r10,   0
64	lis	r11,   0x3f80
65	lis	r12,   0x5fe0
66	lis	r6,    0x3f00
67	lis	r7,    0x4040
68
/* Save f14-f31; all of them are overwritten below.                        */
69	stfd	f14,    0(SP)
70	stfd	f15,    8(SP)
71	stfd	f16,   16(SP)
72	stfd	f17,   24(SP)
73
74	stfd	f18,   32(SP)
75	stfd	f19,   40(SP)
76	stfd	f20,   48(SP)
77	stfd	f21,   56(SP)
78
79	stfd	f22,   64(SP)
80	stfd	f23,   72(SP)
81	stfd	f24,   80(SP)
82	stfd	f25,   88(SP)
83
84	stfd	f26,   96(SP)
85	stfd	f27,  104(SP)
86	stfd	f28,  112(SP)
87	stfd	f29,  120(SP)
88
89	stfd	f30,  128(SP)
90	stfd	f31,  136(SP)
91
/* Spill the constants so they can be loaded into FPRs with lfs.           */
/* The "4 + FMAX" store hits 156(SP), the C1 slot, and is overwritten by   */
/* the C1 store that follows; FMAX is never read back in this routine.     */
92	stw	r10,  FZERO
93	stw	r11,  FONE
94	stw	r12,  FMAX
95	stw	r10,  4 + FMAX
96	stw	r6,   C1
97	stw	r7,   C2
98
/* f1 = 0.0: the value returned by the early exits to LL(999).             */
99	lfs	f1,   FZERO
100
/* With F_INTERFACE the scalar arguments are passed by reference.          */
101#ifdef F_INTERFACE
102	LDINT	N,    0(N)
103	LDINT	INCX, 0(INCX)
104#endif
105
/* Turn the element stride into a byte stride, then bias X back by one     */
/* stride so that the first LFDUX below addresses element 0 (this assumes  */
/* LFDUX is an update-form indexed load that writes the address to X).     */
/* INC1 = SIZE is the offset of the second scalar within an element.       */
106	slwi	INCX, INCX, ZBASE_SHIFT
107	sub	X, X, INCX
108	li	INC1, SIZE
109
/* Prefetch offset for the dcbt hints, which are only built under PPCG4.   */
110	li	PRE, 3 * 16 * SIZE
111
/* Nothing to do for N <= 0 or a non-positive byte stride: return f1 = 0.  */
112	cmpwi	cr0, N, 0
113	ble-	LL(999)
114	cmpwi	cr0, INCX, 0
115	ble-	LL(999)
116
/* Keep the count and the biased pointer for the second pass.              */
117	mr	NN, N
118	mr	XX, X
119
/* ---- Pass 1: largest absolute component ------------------------------- */
/* Load element 0 and seed all eight running maxima with it:               */
/* f0,f2,f4,f6 = |first scalar|, f1,f3,f5,f7 = |second scalar|.            */
120	LFDUX	f0, X, INCX
121	LFDX	f1, X, INC1
122
123	fabs	f2, f0
124	fabs	f3, f1
125	fabs	f4, f0
126	fabs	f5, f1
127	fabs	f6, f0
128	fabs	f7, f1
129	fabs	f0, f0
130	fabs	f1, f1
131
/* Element 0 is consumed; N now counts the remaining elements.             */
132	subi	N, N, 1
133
/* CTR = number of full groups of 8 elements; none -> straight to the tail.*/
134	srawi.	r0, N, 3
135	mtspr	CTR,  r0
136	beq-	LL(50)
137
/* Prime the pipeline: load 4 elements into f24-f31, then take their       */
/* absolute values into f8-f15 while loading the next 4 elements.          */
138	LFDUX	f24,   X, INCX
139	LFDX	f25,   X, INC1
140	LFDUX	f26,   X, INCX
141	LFDX	f27,   X, INC1
142	LFDUX	f28,   X, INCX
143	LFDX	f29,   X, INC1
144	LFDUX	f30,   X, INCX
145	LFDX	f31,   X, INC1
146
147	fabs	f8,  f24
148	LFDUX	f24,   X, INCX
149	fabs	f9,  f25
150	LFDX	f25,   X, INC1
151	fabs	f10, f26
152	LFDUX	f26,   X, INCX
153	fabs	f11, f27
154	LFDX	f27,   X, INC1
155
156	fabs	f12, f28
157	LFDUX	f28,   X, INCX
158	fabs	f13, f29
159	LFDX	f29,   X, INC1
160	fabs	f14, f30
161	LFDUX	f30,   X, INCX
162	fabs	f15, f31
163	LFDX	f31,   X, INC1
/* Exactly one group: everything is already in flight, go drain it.        */
164	bdz	LL(20)
165	.align 4
166
/* Steady state, 8 elements per trip (two halves of 4).  For each of the   */
/* eight lanes: d = acc - v; acc = (d >= 0) ? acc : v  (fsub + fsel),      */
/* then v is refreshed with fabs of the value loaded one half earlier      */
/* while the next load is issued.                                          */
167LL(10):
168	fsub	f16, f0,  f8
169	fsub	f17, f1,  f9
170	fsub	f18, f2,  f10
171	fsub	f19, f3,  f11
172	fsub	f20, f4,  f12
173	fsub	f21, f5,  f13
174	fsub	f22, f6,  f14
175	fsub	f23, f7,  f15
176
177	fsel	f0,  f16, f0,  f8
178#ifdef PPCG4
179	dcbt	X, PRE
180#endif
181	fabs	f8,  f24
182	LFDUX	f24,   X, INCX
183	fsel	f1,  f17, f1,  f9
184	fabs	f9,  f25
185	LFDX	f25,   X, INC1
186	fsel	f2,  f18, f2,  f10
187	fabs	f10, f26
188	LFDUX	f26,   X, INCX
189	fsel	f3,  f19, f3,  f11
190	fabs	f11, f27
191	LFDX	f27,   X, INC1
192
193	fsel	f4,  f20, f4,  f12
194#ifdef PPCG4
195	dcbt	X, PRE
196#endif
197	fabs	f12, f28
198	LFDUX	f28,   X, INCX
199	fsel	f5,  f21, f5,  f13
200	fabs	f13, f29
201	LFDX	f29,   X, INC1
202	fsel	f6,  f22, f6,  f14
203	fabs	f14, f30
204	LFDUX	f30,   X, INCX
205	fsel	f7,  f23, f7,  f15
206	fabs	f15, f31
207	LFDX	f31,   X, INC1
208
209	fsub	f16, f0,  f8
210	fsub	f17, f1,  f9
211	fsub	f18, f2,  f10
212	fsub	f19, f3,  f11
213	fsub	f20, f4,  f12
214	fsub	f21, f5,  f13
215	fsub	f22, f6,  f14
216	fsub	f23, f7,  f15
217
218	fsel	f0,  f16, f0,  f8
219#ifdef PPCG4
220	dcbt	X, PRE
221#endif
222	fabs	f8,  f24
223	LFDUX	f24,   X, INCX
224	fsel	f1,  f17, f1,  f9
225	fabs	f9,  f25
226	LFDX	f25,   X, INC1
227	fsel	f2,  f18, f2,  f10
228	fabs	f10, f26
229	LFDUX	f26,   X, INCX
230	fsel	f3,  f19, f3,  f11
231	fabs	f11, f27
232	LFDX	f27,   X, INC1
233
234	fsel	f4,  f20, f4,  f12
235#ifdef PPCG4
236	dcbt	X, PRE
237#endif
238	fabs	f12, f28
239	LFDUX	f28,   X, INCX
240	fsel	f5,  f21, f5,  f13
241	fabs	f13, f29
242	LFDX	f29,   X, INC1
243	fsel	f6,  f22, f6,  f14
244	fabs	f14, f30
245	LFDUX	f30,   X, INCX
246	fsel	f7,  f23, f7,  f15
247	fabs	f15, f31
248	LFDX	f31,   X, INC1
249	bdnz	LL(10)
250	.align 4
251
/* Drain: fold in the 8 elements still in flight (4 in f8-f15, 4 in        */
/* f24-f31) without issuing any further loads.                             */
252LL(20):
253	fsub	f16, f0,  f8
254	fsub	f17, f1,  f9
255	fsub	f18, f2,  f10
256	fsub	f19, f3,  f11
257	fsub	f20, f4,  f12
258	fsub	f21, f5,  f13
259	fsub	f22, f6,  f14
260	fsub	f23, f7,  f15
261
262	fsel	f0,  f16, f0,  f8
263	fabs	f8,  f24
264	fsel	f1,  f17, f1,  f9
265	fabs	f9,  f25
266	fsel	f2,  f18, f2,  f10
267	fabs	f10, f26
268	fsel	f3,  f19, f3,  f11
269	fabs	f11, f27
270
271	fsel	f4,  f20, f4,  f12
272	fabs	f12, f28
273	fsel	f5,  f21, f5,  f13
274	fabs	f13, f29
275	fsel	f6,  f22, f6,  f14
276	fabs	f14, f30
277	fsel	f7,  f23, f7,  f15
278	fabs	f15, f31
279
280	fsub	f16, f0,  f8
281	fsub	f17, f1,  f9
282	fsub	f18, f2,  f10
283	fsub	f19, f3,  f11
284	fsub	f20, f4,  f12
285	fsub	f21, f5,  f13
286	fsub	f22, f6,  f14
287	fsub	f23, f7,  f15
288
289	fsel	f0,  f16, f0,  f8
290	fsel	f1,  f17, f1,  f9
291	fsel	f2,  f18, f2,  f10
292	fsel	f3,  f19, f3,  f11
293	fsel	f4,  f20, f4,  f12
294	fsel	f5,  f21, f5,  f13
295	fsel	f6,  f22, f6,  f14
296	fsel	f7,  f23, f7,  f15
297	.align 4
298
/* Tail of pass 1: the remaining (N & 7) elements, one per trip, folded    */
/* into lanes f0/f1 only.                                                  */
299LL(50):
300	andi.	r0,  N, 7
301	mtspr	CTR, r0
302	beq	LL(99)
303	.align 4
304
305LL(60):
306	LFDUX	f8,    X, INCX
307	LFDX	f9,    X, INC1
308
309	fabs	f8, f8
310	fabs	f9, f9
311	fsub	f16, f0, f8
312	fsub	f17, f1, f9
313	fsel	f0, f16, f0, f8
314	fsel	f1, f17, f1, f9
315	bdnz	LL(60)
316	.align 4
317
/* Reduce the eight lane maxima pairwise (8 -> 4 -> 2 -> 1); the overall   */
/* maximum ends up in f31.                                                 */
318LL(99):
319	fsub	f8,  f0,  f1
320	fsub	f9,  f2,  f3
321	fsub	f10, f4,  f5
322	fsub	f11, f6,  f7
323
324	fsel	f0,  f8,  f0,  f1
325	fsel	f2,  f9,  f2,  f3
326	fsel	f4,  f10, f4,  f5
327	fsel	f6,  f11, f6,  f7
328
329	fsub	f8,  f0,  f2
330	fsub	f9,  f4,  f6
331	fsel	f0,  f8,  f0,  f2
332	fsel	f4,  f9,  f4,  f6
333
334	fsub	f8,  f0,  f4
335	fsel	f31, f8,  f0,  f4
336
/* f1 = 0.0, f0 = 1.0.  If the maximum equals 0 the result is 0 (already   */
/* in f1), and the division below is skipped.                              */
337	lfs	f1,  FZERO
338	lfs	f0,  FONE
339
340	fcmpu	cr0, f1, f31
341	beq-	cr0, LL(999)
342
/* ---- Pass 2: sum of squares of the scaled components ------------------ */
/* f30 = 1 / max is the scale factor applied to every component.           */
343	fdiv	f30, f0, f31
344
/* Clear the eight partial sums f0-f7 (f1 already holds 0.0).              */
345	fmr	f0, f1
346	fmr	f2, f1
347	fmr	f3, f1
348	fmr	f4, f1
349	fmr	f5, f1
350	fmr	f6, f1
351	fmr	f7, f1
352
/* CTR = full groups of 8 over all NN elements, walking again from XX.     */
353	srawi.	r0, NN, 3
354	mtspr	CTR, r0
355	beq-	cr0, LL(150)
356
/* Prime the pipeline: load 4 elements into f8-f15, then scale them into   */
/* f16-f23 while loading the next 4 elements.                              */
357	LFDUX	f8,   XX, INCX
358	LFDX	f9,   XX, INC1
359	LFDUX	f10,  XX, INCX
360	LFDX	f11,  XX, INC1
361	LFDUX	f12,  XX, INCX
362	LFDX	f13,  XX, INC1
363	LFDUX	f14,  XX, INCX
364	LFDX	f15,  XX, INC1
365
366	fmul	f16, f30, f8
367	LFDUX	f8,  XX, INCX
368	fmul	f17, f30, f9
369	LFDX	f9,  XX, INC1
370	fmul	f18, f30, f10
371	LFDUX	f10, XX, INCX
372	fmul	f19, f30, f11
373	LFDX	f11, XX, INC1
374
375	fmul	f20, f30, f12
376	LFDUX	f12, XX, INCX
377	fmul	f21, f30, f13
378	LFDX	f13, XX, INC1
379	fmul	f22, f30, f14
380	LFDUX	f14, XX, INCX
381	fmul	f23, f30, f15
382	LFDX	f15, XX, INC1
/* Exactly one group: go drain what is already in flight.                  */
383	bdz	LL(120)
384	.align 4
385
/* Steady state, 8 elements per trip: acc += t * t for the scaled value t  */
/* (fmadd), then t is recomputed from the previously loaded component      */
/* while the next load is issued.                                          */
386LL(110):
387	fmadd	f0,  f16, f16, f0
388#ifdef PPCG4
389	dcbt	XX, PRE
390#endif
391	fmul	f16, f30, f8
392	LFDUX	f8,   XX, INCX
393	fmadd	f1,  f17, f17, f1
394	fmul	f17, f30, f9
395	LFDX	f9,   XX, INC1
396	fmadd	f2,  f18, f18, f2
397	fmul	f18, f30, f10
398	LFDUX	f10,  XX, INCX
399	fmadd	f3,  f19, f19, f3
400	fmul	f19, f30, f11
401	LFDX	f11,  XX, INC1
402
403	fmadd	f4,  f20, f20, f4
404#ifdef PPCG4
405	dcbt	XX, PRE
406#endif
407	fmul	f20, f30, f12
408	LFDUX	f12,  XX, INCX
409	fmadd	f5,  f21, f21, f5
410	fmul	f21, f30, f13
411	LFDX	f13,  XX, INC1
412	fmadd	f6,  f22, f22, f6
413	fmul	f22, f30, f14
414	LFDUX	f14,  XX, INCX
415	fmadd	f7,  f23, f23, f7
416	fmul	f23, f30, f15
417	LFDX	f15,  XX, INC1
418
419	fmadd	f0,  f16, f16, f0
420#ifdef PPCG4
421	dcbt	XX, PRE
422#endif
423	fmul	f16, f30, f8
424	LFDUX	f8,   XX, INCX
425	fmadd	f1,  f17, f17, f1
426	fmul	f17, f30, f9
427	LFDX	f9,   XX, INC1
428	fmadd	f2,  f18, f18, f2
429	fmul	f18, f30, f10
430	LFDUX	f10,  XX, INCX
431	fmadd	f3,  f19, f19, f3
432	fmul	f19, f30, f11
433	LFDX	f11,  XX, INC1
434
435	fmadd	f4,  f20, f20, f4
436#ifdef PPCG4
437	dcbt	XX, PRE
438#endif
439	fmul	f20, f30, f12
440	LFDUX	f12,  XX, INCX
441	fmadd	f5,  f21, f21, f5
442	fmul	f21, f30, f13
443	LFDX	f13,  XX, INC1
444	fmadd	f6,  f22, f22, f6
445	fmul	f22, f30, f14
446	LFDUX	f14,  XX, INCX
447	fmadd	f7,  f23, f23, f7
448	fmul	f23, f30, f15
449	LFDX	f15,  XX, INC1
450	bdnz	LL(110)
451	.align 4
452
/* Drain: accumulate the 8 elements still in flight (4 already scaled in   */
/* f16-f23, 4 raw in f8-f15) without further loads.                        */
453LL(120):
454	fmadd	f0,  f16, f16, f0
455	fmul	f16, f30, f8
456	fmadd	f1,  f17, f17, f1
457	fmul	f17, f30, f9
458	fmadd	f2,  f18, f18, f2
459	fmul	f18, f30, f10
460	fmadd	f3,  f19, f19, f3
461	fmul	f19, f30, f11
462
463	fmadd	f4,  f20, f20, f4
464	fmul	f20, f30, f12
465	fmadd	f5,  f21, f21, f5
466	fmul	f21, f30, f13
467	fmadd	f6,  f22, f22, f6
468	fmul	f22, f30, f14
469	fmadd	f7,  f23, f23, f7
470	fmul	f23, f30, f15
471
472	fmadd	f0,  f16, f16, f0
473	fmadd	f1,  f17, f17, f1
474	fmadd	f2,  f18, f18, f2
475	fmadd	f3,  f19, f19, f3
476	fmadd	f4,  f20, f20, f4
477	fmadd	f5,  f21, f21, f5
478	fmadd	f6,  f22, f22, f6
479	fmadd	f7,  f23, f23, f7
480	.align 4
481
/* Tail of pass 2: the remaining (NN & 7) elements, one per trip, summed   */
/* into f0/f1 only.                                                        */
482LL(150):
483	andi.	r0,  NN, 7
484	mtspr	CTR, r0
485	beq-	cr0, LL(170)
486	.align 4
487
488LL(160):
489	LFDUX	f8,  XX, INCX
490	LFDX	f9,  XX, INC1
491
492	fmul	f16, f30, f8
493	fmul	f17, f30, f9
494	fmadd	f0,  f16, f16, f0
495	fmadd	f1,  f17, f17, f1
496	bdnz	LL(160)
497	.align 4
498
/* Add the eight partial sums pairwise; the total s ends up in f1.         */
499LL(170):
500	fadd   f0, f0, f1
501	fadd   f2, f2, f3
502	fadd   f4, f4, f5
503	fadd   f6, f6, f7
504
505	fadd   f0, f0, f2
506	fadd   f4, f4, f6
507
508	fadd   f1, f0, f4
509
/* sqrt(s) without a hardware square root:                                 */
/*   y  = frsqrte(s)                    initial estimate of 1/sqrt(s)      */
/*   f8 = 0.5, f9 = 3.0, f7 = f8 + f8 = 1.0                                */
/*   y  = (0.5 * y) * (3 - s * y * y)   Newton step, applied three times   */
/*   r  = s * y                         approximately sqrt(s)              */
/*   f1 = r + (0.5 * r) * (1 - r * y)   final correction of r              */
510	frsqrte f0, f1
511	lfs	f8, C1
512	lfs	f9, C2
513
514	fmul	f2, f1, f0
515	fadd	f7, f8, f8
516	fmul	f3, f0, f8
517	fnmsub	f4, f2, f0, f9
518	fmul	f0, f3, f4
519
520	fmul	f2, f1, f0
521	fmul	f3, f0, f8
522	fnmsub	f4, f2, f0, f9
523	fmul	f0, f3, f4
524
525	fmul	f2, f1, f0
526	fmul	f3, f0, f8
527	fnmsub	f4, f2, f0, f9
528	fmul	f0, f3, f4
529
530	fmul	f5, f1, f0
531	fmul	f2, f5, f8
532	fnmsub	f3, f5, f0, f7
533	fmadd	f1, f2, f3, f5
/* Undo the scaling: result = max * sqrt(s).                               */
534	fmul    f1,  f31, f1
535	.align 4
536
/* Common exit: restore f14-f31, release the frame, return with the        */
/* result in f1.                                                           */
537LL(999):
538	lfd	f14,    0(SP)
539	lfd	f15,    8(SP)
540	lfd	f16,   16(SP)
541	lfd	f17,   24(SP)
542
543	lfd	f18,   32(SP)
544	lfd	f19,   40(SP)
545	lfd	f20,   48(SP)
546	lfd	f21,   56(SP)
547
548	lfd	f22,   64(SP)
549	lfd	f23,   72(SP)
550	lfd	f24,   80(SP)
551	lfd	f25,   88(SP)
552
553	lfd	f26,   96(SP)
554	lfd	f27,  104(SP)
555	lfd	f28,  112(SP)
556	lfd	f29,  120(SP)
557
558	lfd	f30,  128(SP)
559	lfd	f31,  136(SP)
560
561	addi	SP, SP, STACKSIZE
562	blr
563
564	EPILOGUE
565