/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Register assignments for the absolute-sum kernel in this file.
 *
 * The kernel adds up |x[i]| for i = 0 .. N-1 and leaves the total in C1.
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), SIZE, BASE_SHIFT, LDINT and the
 * upper-case load macros (LFD, LFDX, LFDUX, LFSDUX, LFPDUX) are not
 * defined here; presumably they come from common.h.
 */
#define ASSEMBLER
#include "common.h"

/*
 * Arguments, as the code uses them:
 *   N    - number of elements (under F_INTERFACE: address of the count)
 *   X    - address of the first element
 *   INCX - element stride (under F_INTERFACE: address of the stride);
 *          shifted left by BASE_SHIFT at entry
 */
#define N	r3
#define X	r4
#define INCX	r5

/*
 * INCX2 = INCX + INCX, the step applied by every updating load.
 * X2    = second cursor, used only by the strided path (odd elements).
 */
#define INCX2	r6
#define X2	r7

/* Four independent partial sums, combined at LL(999). */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* Values loaded from X, kept in flight ahead of the arithmetic. */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/*
 * Absolute values waiting to be accumulated.
 * T3/T4 are f14/f15; the kernel saves them on entry and restores them
 * before returning.
 */
#define T1	f12
#define T2	f13
#define T3	f14
#define T4	f15

/*
 * Entry and setup.
 *
 *   - saves f14/f15 (T3/T4) below SP,
 *   - builds a 16-byte block of zeros on the stack and loads it to clear
 *     the accumulators,
 *   - converts INCX with BASE_SHIFT and derives INCX2 = 2 * INCX,
 *   - leaves through LL(999) with a zero result when N <= 0 or INCX <= 0,
 *   - sends INCX != SIZE to the strided path at LL(100),
 *   - for INCX == SIZE, consumes one leading element when X is not a
 *     multiple of 2 * SIZE, so that the paired loads at LL(05) start on a
 *     2 * SIZE boundary.
 *
 * Stack use: 16 + 16 + 4 * 4 = 48 bytes, released again at LL(999).
 */
	PROLOGUE
	PROFCODE

	/* Each stfpdux first moves SP down by 16, then stores a register pair. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Four zero words = 16 bytes of zeros at the new SP. r10 stays 0. */
	li	r10,   0
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)

#ifdef F_INTERFACE
	/* N and INCX hold addresses in this build; fetch the values. */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	lfpdx	C1, SP, r10		# Zero clear

	/* INCX <<= BASE_SHIFT (presumably element size to bytes); INCX2 = 2 * INCX. */
	slwi	INCX,  INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	/* Copy the zero pair into the other three accumulators. */
	fpmr	C2, C1
	fpmr	C3, C1
	fpmr	C4, C1

	/* Nothing to sum: N <= 0 or INCX <= 0 returns the zero accumulators. */
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	ble	LL(999)

	/* Only a stride of exactly SIZE may use the paired loads. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

	/* X already a multiple of 2 * SIZE: no peeling needed. */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(05)

	/*
	 * Peel one element: C1 = |x[0]|, then advance X and decrement N.
	 * If that was the only element, go straight to the final reduction.
	 */
	LFD	C1, 0(X)
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fabs	C1, C1
	ble	LL(999)
	.align 4

/*
 * Unit-stride path (INCX == SIZE, X on a 2 * SIZE boundary).
 *
 * Every LFPDUX advances X by INCX2 and fills one register pair, so X is
 * biased back by INCX2 before the first load.  CTR = N >> 4: each trip of
 * the main loop accounts for 16 elements (eight pairs, A1..A8).
 *
 * The loop is software pipelined in three stages that overlap:
 *   load      -> A1..A8
 *   fpabs     -> T1..T4 (each T register is reused twice per trip)
 *   fpadd     -> C1..C4
 * T1..T4 start as zero (copied from C2, which is still zero here; C1 may
 * already hold the peeled element) so that the first fpadd of each
 * accumulator is harmless.
 */
LL(05):
	srawi.	r0, N, 4
	sub	X, X, INCX2
	mtspr	CTR,  r0
	beq-	LL(15)			/* fewer than 16 elements: remainder only */

	/* Pipeline fill: load the first eight pairs, zero T1..T4. */
	LFPDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFPDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFPDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFPDUX	A4,   X, INCX2
	fpmr	T4, C2
	LFPDUX	A5,   X, INCX2
	LFPDUX	A6,   X, INCX2
	LFPDUX	A7,   X, INCX2
	LFPDUX	A8,   X, INCX2
	bdz	LL(13)			/* single block: skip to the drain */
	.align 4

/*
 * Steady state.  Each group: accumulate the pending T, replace it with
 * the absolute value of an already-loaded A, then reload that A for the
 * next trip.  The nops are presumably issue-slot padding.
 */
LL(12):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7,   X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8,   X, INCX2
	bdnz	LL(12)
	.align 4

/* Pipeline drain: no further loads, finish A1..A8 and the pending T's. */
LL(13):
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

/* Remainder: the low four bits of N select chunks of 8, 4, 2 and 1. */
LL(15):
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(16)

	/* 8 elements = four pairs */
	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(16):
	andi.	r0,  N, 4
	beq	LL(17)

	/* 4 elements = two pairs */
	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(17):
	andi.	r0,  N, 2
	beq	LL(18)

	/* 2 elements = one pair */
	LFPDUX	A1,    X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	.align 4

LL(18):
	andi.	r0,  N, 1
	beq	LL(999)

	/*
	 * Last single element.  X still points at the most recently loaded
	 * pair, so X + INCX2 is the next unread element.  Scalar fabs/fadd
	 * are used here instead of the paired forms.
	 */
	LFDX	A1,    X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	b LL(999)
	.align 4

/*
 * Strided path (INCX != SIZE).
 *
 * Two cursors walk the vector, both stepping by INCX2 on every updating
 * load:
 *   X  visits elements 0, 2, 4, ...  (biased back by INCX2 first)
 *   X2 visits elements 1, 3, 5, ...  (starts at X - INCX)
 * In the main loop LFDUX fills one half of an A register from X and
 * LFSDUX fills the other half from X2 (presumably primary and secondary
 * halves respectively), so the paired fpabs/fpadd can then process two
 * elements per instruction.  CTR = N >> 4: 16 elements per loop trip.
 *
 * As in the unit-stride loop, T1..T4 start as zero copies of C2 and the
 * load / fpabs / fpadd stages overlap.
 */
LL(100):
	sub	X2, X, INCX
	sub	X,  X, INCX2

	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(115)			/* fewer than 16 elements: remainder only */


	/*
	 * Pipeline fill: even elements for A1..A8, odd elements for A1..A4.
	 * The odd halves of A5..A8 are loaded later (in LL(112) or LL(113)).
	 */
	LFDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFDUX	A4,   X, INCX2
	fpmr	T4, C2

	LFDUX	A5,   X, INCX2
	LFSDUX	A1,   X2, INCX2

	LFDUX	A6,   X, INCX2
	LFSDUX	A2,   X2, INCX2

	LFDUX	A7,   X, INCX2
	LFSDUX	A3,   X2, INCX2

	LFDUX	A8,   X, INCX2
	LFSDUX	A4,   X2, INCX2
	bdz	LL(113)			/* single block: skip to the drain */
	.align 4

/*
 * Steady state.  First half: complete A5..A8 with their odd elements
 * while A1..A4 are consumed and their even halves reloaded.  Second
 * half: the roles swap.
 */
LL(112):
	fpadd	C1, C1, T1
	LFSDUX	A5,   X2, INCX2
	fpabs	T1, A1
	LFDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	LFSDUX	A6,   X2, INCX2
	fpabs	T2, A2
	LFDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A7,   X2, INCX2
	fpabs	T3, A3
	LFDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	LFSDUX	A8,   X2, INCX2
	fpabs	T4, A4
	LFDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	LFSDUX	A1,   X2, INCX2
	fpabs	T1, A5
	LFDUX	A5,   X, INCX2
	fpadd	C2, C2, T2
	LFSDUX	A2,   X2, INCX2
	fpabs	T2, A6
	LFDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A3,   X2, INCX2
	fpabs	T3, A7
	LFDUX	A7,   X, INCX2
	fpadd	C4, C4, T4
	LFSDUX	A4,   X2, INCX2
	fpabs	T4, A8
	LFDUX	A8,   X, INCX2

	bdnz	LL(112)
	.align 4

/*
 * Pipeline drain: only the odd halves of A5..A8 remain to be loaded;
 * after that the outstanding fpabs/fpadd work is finished.
 */
LL(113):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFSDUX	A5,   X2, INCX2
	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFSDUX	A6,   X2, INCX2
	fpadd	C3, C3, T3

	nop
	fpabs	T3, A3
	LFSDUX	A7,   X2, INCX2
	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFSDUX	A8,   X2, INCX2

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

/*
 * Remainder: the low four bits of N select chunks of 8, 4, 2 and 1.
 * Loads alternate between X (even) and X2 (odd), one element per
 * register, and the scalar fabs/fadd forms are used.
 */
LL(115):
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(116)

	/* 8 elements */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	LFDUX	A5,    X,  INCX2
	fabs	T2, A2
	LFDUX	A6,    X2, INCX2
	fabs	T3, A3
	LFDUX	A7,    X,  INCX2
	fabs	T4, A4
	LFDUX	A8,    X2, INCX2

	fadd	C1, C1, T1
	fabs	T1, A5
	fadd	C2, C2, T2
	fabs	T2, A6

	fadd	C3, C3, T3
	fabs	T3, A7
	fadd	C4, C4, T4
	fabs	T4, A8

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(116):
	andi.	r0,  N, 4
	beq	LL(117)

	/* 4 elements */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fabs	T3, A3
	fabs	T4, A4

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(117):
	andi.	r0,  N, 2
	beq	LL(118)

	/* 2 elements */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fadd	C1, C1, T1
	fadd	C2, C2, T2
	.align 4

LL(118):
	andi.	r0,  N, 1
	beq	LL(999)

	/*
	 * Last single element: X points at the last even element read, so
	 * X + INCX2 is the next even (and final) one.  Falls through to
	 * LL(999).
	 */
	LFDX	A1,    X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	.align 4

/*
 * Common exit: combine the partial sums, restore f14/f15 and SP, return.
 *
 *   C1 = (C1 + C2) + (C3 + C4)     paired adds, both halves at once
 *   C2 = other half of C1          fsmtp moves it into the scalar half
 *   C1 = C2 + C1                   scalar add; C1 (f1) holds the result
 *
 * SP currently points at the 16-byte zero block pushed at entry.  Each
 * lfpdux first adds 16 to SP and then reloads a saved pair (f15, then
 * f14, the reverse of the save order); the final addi releases the last
 * 16 bytes, for 48 bytes in total.  The restores are interleaved with the
 * arithmetic.
 */
LL(999):
	fpadd	C1,  C1,  C2
	li	r10, 16
	fpadd	C3,  C3,  C4
	fpadd	C1,  C1,  C3
	lfpdux	f15, SP, r10
	fsmtp	C2, C1
	lfpdux	f14, SP, r10
	addi	SP, SP,  16
	fadd	C1, C2, C1
	blr

	EPILOGUE
