1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Argument registers: N is the element count, X the base address of the
   vector and INCX the element stride.  When F_INTERFACE is defined, N and
   INCX arrive as pointers and are dereferenced on entry. */
#define N	r3
#define X	r4
#define INCX	r5

/* Scratch registers: PRE is the byte distance used by the dcbt prefetch
   hints (emitted only when PPCG4 is defined); INC1 is the byte offset from
   the first to the second value of each element (loaded with SIZE). */
#define PRE	r8
#define INC1	r9

/* Single-precision constants, written to the frame as raw bit patterns with
   stw and read back with lfs: FZERO = 0.0, FONE = 1.0 (0x3f800000),
   C1 = 0.5 (0x3f000000), C2 = 3.0 (0x40400000).  FONE is stored but not
   read back anywhere in the code visible in this file. */
#define FZERO	144(SP)
#define FONE	148(SP)
#define C1	152(SP)
#define C2	156(SP)

/* Frame layout: 18 eight-byte save slots for f14-f31 (offsets 0-143)
   followed by the four 4-byte constants above (offsets 144-159). */
#define STACKSIZE 160
55
56	PROLOGUE
57	PROFCODE
58
59	addi	SP, SP, -STACKSIZE
60	li	r10,   0
61	lis	r11,   0x3f80
62	lis	r6,    0x3f00
63	lis	r7,    0x4040
64
65	stfd	f14,    0(SP)
66	stfd	f15,    8(SP)
67	stfd	f16,   16(SP)
68	stfd	f17,   24(SP)
69
70	stfd	f18,   32(SP)
71	stfd	f19,   40(SP)
72	stfd	f20,   48(SP)
73	stfd	f21,   56(SP)
74
75	stfd	f22,   64(SP)
76	stfd	f23,   72(SP)
77	stfd	f24,   80(SP)
78	stfd	f25,   88(SP)
79
80	stfd	f26,   96(SP)
81	stfd	f27,  104(SP)
82	stfd	f28,  112(SP)
83	stfd	f29,  120(SP)
84
85	stfd	f30,  128(SP)
86	stfd	f31,  136(SP)
87
88	stw	r10,  FZERO
89	stw	r11,  FONE
90	stw	r6,   C1
91	stw	r7,   C2
92
93	lfs	f1,   FZERO
94
95#ifdef F_INTERFACE
96	LDINT	N,    0(N)
97	LDINT	INCX, 0(INCX)
98#endif
99
100	slwi	INCX, INCX, ZBASE_SHIFT
101	li	INC1, SIZE
102	li	PRE, 3 * 16 * SIZE
103
104	cmpwi	cr0, N, 0
105	ble-	LL(999)
106	cmpwi	cr0, INCX, 0
107	ble-	LL(999)
108
109	fmr	f0,  f1
110	sub	X, X, INCX
111	fmr	f2,  f1
112	fmr	f3,  f1
113	fmr	f4,  f1
114	fmr	f5,  f1
115	fmr	f6,  f1
116	fmr	f7,  f1
117	fmr	f8,  f1
118	fmr	f9,  f1
119	fmr	f10, f1
120	fmr	f11, f1
121	fmr	f12, f1
122	fmr	f13, f1
123	fmr	f14, f1
124	fmr	f15, f1
125
126	srawi.	r0, N, 3
127	mtspr	CTR, r0
128	beq-	cr0, LL(1150)
129
	/* Software-pipeline prologue: preload the first batch of eight
	   elements into f16-f31.  Each element is a pair of values: the
	   update-form load steps X forward by INCX and reads the first
	   value, the indexed load reads the second one INC1 bytes further
	   on (presumably real and imaginary parts, given the ZBASE_SHIFT
	   stride scaling -- confirm against common.h). */
	LFDUX	f16, X, INCX
	LFDX	f17, X, INC1
	LFDUX	f18, X, INCX
	LFDX	f19, X, INC1
	LFDUX	f20, X, INCX
	LFDX	f21, X, INC1
	LFDUX	f22, X, INCX
	LFDX	f23, X, INC1

	LFDUX	f24, X, INCX
	LFDX	f25, X, INC1
	LFDUX	f26, X, INCX
	LFDX	f27, X, INC1
	LFDUX	f28, X, INCX
	LFDX	f29, X, INC1
	LFDUX	f30, X, INCX
	LFDX	f31, X, INC1
	/* Only one batch in total?  Skip the steady-state loop and just
	   drain what was preloaded. */
	bdz	LL(1120)
	.align 4

/* Steady state: square-accumulate the batch already held in f16-f31 into
   the sixteen independent partial sums f0-f15 while loading the next batch
   into the same registers.  Each fmadd consumes a register immediately
   before the following load overwrites it, so the instruction order here
   must not be changed. */
LL(1110):
	fmadd	f0,  f16, f16, f0
	LFDUX	f16, X, INCX
	fmadd	f1,  f17, f17, f1
	LFDX	f17, X, INC1
	fmadd	f2,  f18, f18, f2
	LFDUX	f18, X, INCX
	fmadd	f3,  f19, f19, f3
	LFDX	f19, X, INC1

#ifdef PPCG4
	/* Prefetch hint for the cache line PRE bytes beyond the current X. */
	dcbt	X, PRE
#endif

	fmadd	f4,  f20, f20, f4
	LFDUX	f20, X, INCX
	fmadd	f5,  f21, f21, f5
	LFDX	f21, X, INC1
	fmadd	f6,  f22, f22, f6
	LFDUX	f22, X, INCX
	fmadd	f7,  f23, f23, f7
	LFDX	f23, X, INC1

	fmadd	f8,  f24, f24, f8
	LFDUX	f24, X, INCX
	fmadd	f9,  f25, f25, f9
	LFDX	f25, X, INC1
	fmadd	f10, f26, f26, f10
	LFDUX	f26, X, INCX
	fmadd	f11, f27, f27, f11
	LFDX	f27, X, INC1

#ifdef PPCG4
	/* Second prefetch hint per trip, issued after X has advanced. */
	dcbt	X, PRE
#endif

	fmadd	f12, f28, f28, f12
	LFDUX	f28, X, INCX
	fmadd	f13, f29, f29, f13
	LFDX	f29, X, INC1
	fmadd	f14, f30, f30, f14
	LFDUX	f30, X, INCX
	fmadd	f15, f31, f31, f15
	LFDX	f31, X, INC1
	bdnz	LL(1110)
	.align 4

/* Pipeline drain: accumulate the final preloaded batch without issuing any
   further loads. */
LL(1120):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	.align 4
218
/* Remainder: the N mod 8 elements not covered by the unrolled loop. */
LL(1150):
	andi.	r0,  N, 7
	mtspr	CTR, r0
	/* No leftover elements: go combine the partial sums. */
	beq-	cr0, LL(1170)
	.align 4

/* One element per trip: load its two values and add their squares to the
   partial sums f0 and f1. */
LL(1160):
	LFDUX	f16,   X, INCX
	LFDX	f17,   X, INC1
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	bdnz	LL(1160)
	.align 4
232
/* Combine the sixteen partial sums and take the square root of the total.
   On exit f1 holds sqrt(sum of squares), or 0.0 when the sum is zero. */
LL(1170):
	/* Pairwise tree reduction of f0-f15; the total ends up in f1. */
	fadd   f0,  f0,  f1
	fadd   f2,  f2,  f3
	fadd   f4,  f4,  f5
	fadd   f6,  f6,  f7

	fadd   f8,  f8,  f9
	fadd   f10, f10, f11
	fadd   f12, f12, f13
	fadd   f14, f14, f15

	fadd   f0,  f0,  f2
	fadd   f4,  f4,  f6
	fadd   f8,  f8,  f10
	fadd   f12, f12, f14

	fadd   f0,  f0,  f4
	fadd   f8,  f8,  f12

	fadd   f1,  f0,  f8
	lfs    f4,   FZERO

	/* A zero total has no finite reciprocal square root; f1 already
	   holds the correct 0.0 result, so leave directly. */
	fcmpu	cr0, f1, f4
	beq	cr0, LL(999)

	/* Seed y ~= 1/sqrt(x) with x = f1.  The hardware estimate is only
	   guaranteed to roughly 1 part in 32, so it is refined with
	   Newton-Raphson steps  y <- (0.5 * y) * (3 - x * y * y). */
	frsqrte f0, f1
	lfs	f8, C1			/* f8 = 0.5 */
	lfs	f9, C2			/* f9 = 3.0 */

	/* Newton-Raphson step 1 (f7 = 0.5 + 0.5 = 1.0 is formed on the
	   side for the final correction). */
	fmul	f2, f1, f0
	fadd	f7, f8, f8
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9		/* f4 = 3 - x*y*y */
	fmul	f0, f3, f4

	/* Newton-Raphson step 2.  Each step about squares the relative
	   error; a single step followed by the correction below only
	   reaches roughly 2^-18, short of single precision. */
	fmul	f2, f1, f0
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	/* Newton-Raphson step 3: brings y far enough that the final
	   correction lands within double-precision rounding. */
	fmul	f2, f1, f0
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	/* Final correction: s = x*y, result = s + (0.5*s) * (1 - s*y). */
	fmul	f5, f1, f0
	fmul	f2, f5, f8
	fnmsub	f3, f5, f0, f7		/* f3 = 1 - s*y */
	fmadd	f1, f2, f3, f5
	.align 4
273
/* Common exit.  f1 holds the value computed above (0.0 on the early-exit
   paths) and is left untouched here -- presumably the ABI floating-point
   return register; confirm against the platform ABI. */
LL(999):
	/* Reload the FPRs parked on entry, mirroring the save order. */
	lfd	f31,  136(SP)
	lfd	f30,  128(SP)
	lfd	f29,  120(SP)
	lfd	f28,  112(SP)
	lfd	f27,  104(SP)
	lfd	f26,   96(SP)
	lfd	f25,   88(SP)
	lfd	f24,   80(SP)
	lfd	f23,   72(SP)
	lfd	f22,   64(SP)
	lfd	f21,   56(SP)
	lfd	f20,   48(SP)
	lfd	f19,   40(SP)
	lfd	f18,   32(SP)
	lfd	f17,   24(SP)
	lfd	f16,   16(SP)
	lfd	f15,    8(SP)
	lfd	f14,    0(SP)

	/* Release the local frame and return to the caller. */
	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
302