1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Integer registers.  N, X and INCX are the incoming arguments (under
   F_INTERFACE the routine loads N and INCX through the pointers it is
   given). */
42#define N	r3
43#define X	r4
44#define INCX	r5
45
/* INCX2 starts as twice the byte-scaled INCX (the distance between two
   complex elements) and is later reduced by SIZE.  X2 is not referenced
   anywhere in the visible code. */
46#define INCX2	r6
47#define X2	r7
48
/* XX keeps a copy of the original X for the second pass, RET is the index
   that is finally moved to r3, NN keeps a copy of the original N.  NN is
   r10, which the routine also uses literally as the +/-16 step while
   saving and restoring floating-point registers. */
49#define	XX	r8
50#define RET	r9
51#define NN	r10
52
/* Four running accumulators for the first pass.  Note the numbering:
   C1 is f1 and C2 is f0. */
53#define C1	f1
54#define C2	f0
55#define C3	f2
56#define C4	f3
57
/* Values loaded from the vector (real and imaginary parts). */
58#define A1	f4
59#define A2	f5
60#define A3	f6
61#define A4	f7
62#define A5	f8
63#define A6	f9
64#define A7	f10
65#define A8	f11
66
/* Differences fed to fpsel/fsel as the selector operand. */
67#define F1	f12
68#define F2	f13
69#define F3	f14
70#define F4	f15
71
/* Temporaries: |re| + |im| sums in the first pass, absolute values in the
   second pass. */
72#define T1	f16
73#define T2	f17
74#define T3	f18
75#define T4	f19
76
/* Temporaries: absolute values in the first pass, |re| + |im| sums
   (B1..B4 only) in the second pass. */
77#define B1	f20
78#define B2	f21
79#define B3	f22
80#define B4	f23
81#define B5	f24
82#define B6	f25
83#define B7	f26
84#define B8	f27
85
86
/*
 * Index search over a strided complex vector, written for a paired FPU:
 * the fp* instructions work on a primary and a secondary half of each
 * f-register.
 *
 * In:   N (r3)    element count
 *       X (r4)    address of the first element (real part, then imaginary)
 *       INCX (r5) stride in complex elements
 *       Under F_INTERFACE, N and INCX are passed by reference.
 * Out:  r3 = 1-based index of the first element whose |re| + |im| equals
 *       the value selected by pass 1; 0 when N <= 0 or INCX <= 0.
 * Saves and restores f14-f27 on the stack.
 *
 * Pass 1 (up to LL(120)) seeds C1 with element 1 and folds the remaining
 * N - 1 elements into C1..C4 with subtract + select, then reduces them to
 * a single value broadcast to both halves of C1.
 * Pass 2 (from LL(120) on) walks the vector again from XX, counting in
 * RET, and leaves at the first element that compares equal to C1.
 *
 * NOTE(review): with PowerPC fsel semantics (result = 3rd operand when the
 * 2nd operand is >= 0, otherwise the 4th), "fpsub F, T, C ; fpsel C, F, C, T"
 * keeps the smaller of C and T, so this appears to be the minimum variant.
 * The file name is not visible here -- confirm.
 *
 * LFD, LFDUX and LFSDUX are macros from common.h (not shown); from their
 * use here LFDUX appears to fill the primary half and LFSDUX the secondary
 * half, both updating the address register by the index register.
 */
87	PROLOGUE
88	PROFCODE
89
/* r10 = -16: each stfpdux below moves SP down by 16 and stores one
   register pair there (14 registers, 224 bytes in total). */
90	li	r10, -16
91
92	stfpdux	f14, SP, r10
93	stfpdux	f15, SP, r10
94
95	stfpdux	f16, SP, r10
96	stfpdux	f17, SP, r10
97	stfpdux	f18, SP, r10
98	stfpdux	f19, SP, r10
99
100	stfpdux	f20, SP, r10
101	stfpdux	f21, SP, r10
102	stfpdux	f22, SP, r10
103	stfpdux	f23, SP, r10
104
105	stfpdux	f24, SP, r10
106	stfpdux	f25, SP, r10
107	stfpdux	f26, SP, r10
108	stfpdux	f27, SP, r10
109
/* With F_INTERFACE, N and INCX arrive as pointers and are dereferenced. */
110#ifdef F_INTERFACE
111	LDINT	N,    0(N)
112	LDINT	INCX, 0(INCX)
113#endif
114
/* Scale INCX by BASE_SHIFT (from common.h; presumably log2 of SIZE), then
   INCX2 = 2 * INCX = byte distance between consecutive complex elements. */
115	slwi	INCX,  INCX, BASE_SHIFT
116	add	INCX2, INCX, INCX
117
/* Return 0 for N <= 0 or INCX <= 0.  NN keeps the original N for pass 2. */
118	li	RET, 0
119	cmpwi	cr0, N, 0
120	ble	LL(999)
121	cmpwi	cr0, INCX, 0
122	mr	NN, N
123	ble	LL(999)
124
/* XX keeps the original X for pass 2. */
125	mr	XX, X
126
/* First element: C1 = |re| + |im|; RET = 1 is the answer when N == 1. */
127	LFD	A1, 0 * SIZE(X)
128	LFD	A2, 1 * SIZE(X)
129	add	X, X, INCX2
130	li	RET, 1
131
132	fabs	A1, A1
133	fabs	A2, A2
134
/* INCX2 becomes (element stride - SIZE).  Together with INCX = SIZE set
   below, a load pair "update by INCX2, update by INCX" steps to the real
   part of the next element and then to its imaginary part. */
135	subi	INCX2, INCX2, SIZE
136
/* N now counts the elements still to be folded in pass 1. */
137	addi	N, N, -1
138	cmpwi	cr0, N, 0
139	fadd	C1, A1, A2
140	ble	LL(999)
141
/* Broadcast the seed to both halves of C1 and copy it into C2..C4.
   X is rewound by INCX2 so that the first update-load lands on the real
   part of the second element.  CTR = N / 8; skip the unrolled loop when
   that is zero. */
142	fsmfp	C1, C1
143	li	INCX, SIZE
144	fpmr	C2, C1
145	sub	X,  X, INCX2
146	fpmr	C3, C1
147	srawi.	r0, N, 3
148	fpmr	C4, C1
149	mtspr	CTR,  r0
150	beq-	LL(105)
151
/* Preload the first group of 8 elements.  Primary halves are filled first,
   then secondary halves, so e.g. A1/A2 hold the real/imaginary parts of
   the 1st (primary) and 3rd (secondary) element of the group. */
152	LFDUX	A1,   X, INCX2
153	LFDUX	A2,   X, INCX
154	LFDUX	A3,   X, INCX2
155	LFDUX	A4,   X, INCX
156
157	LFSDUX	A1,   X, INCX2
158	LFSDUX	A2,   X, INCX
159	LFSDUX	A3,   X, INCX2
160	LFSDUX	A4,   X, INCX
161
162	LFDUX	A5,   X, INCX2
163	LFDUX	A6,   X, INCX
164	LFDUX	A7,   X, INCX2
165	LFDUX	A8,   X, INCX
166
167	LFSDUX	A5,   X, INCX2
168	LFSDUX	A6,   X, INCX
169	LFSDUX	A7,   X, INCX2
170	LFSDUX	A8,   X, INCX
/* Only one group: go straight to the drain code. */
171	bdz	LL(103)
172	.align 4
173
/* Pass 1 main loop: process the group already in A1..A8 while loading the
   next group into the same registers.  Each A register is copied (as an
   absolute value) into B before it is overwritten. */
174LL(102):
175	fpabs	B1, A1
176	LFDUX	A1,   X, INCX2
177	fpabs	B2, A2
178	LFDUX	A2,   X, INCX
179	fpabs	B3, A3
180	LFDUX	A3,   X, INCX2
181	fpabs	B4, A4
182	LFDUX	A4,   X, INCX
183
184	fpabs	B5, A5
185	LFSDUX	A1,   X, INCX2
186	fpabs	B6, A6
187	LFSDUX	A2,   X, INCX
188	fpabs	B7, A7
189	LFSDUX	A3,   X, INCX2
190	fpabs	B8, A8
191	LFSDUX	A4,   X, INCX
192
/* T1..T4 = |re| + |im| for two elements each. */
193	fpadd	T1, B1, B2
194	LFDUX	A5,   X, INCX2
195	fpadd	T2, B3, B4
196	LFDUX	A6,   X, INCX
197	fpadd	T3, B5, B6
198	LFDUX	A7,   X, INCX2
199	fpadd	T4, B7, B8
200	LFDUX	A8,   X, INCX
201
/* F = T - C is the selector for the fpsel below. */
202	fpsub	F1, T1, C1
203	LFSDUX	A5,   X, INCX2
204	fpsub	F2, T2, C2
205	LFSDUX	A6,   X, INCX
206	fpsub	F3, T3, C3
207	LFSDUX	A7,   X, INCX2
208	fpsub	F4, T4, C4
209	LFSDUX	A8,   X, INCX
210
/* Per half: keep C when F >= 0, otherwise replace it with T. */
211	fpsel	C1, F1, C1, T1
212	fpsel	C2, F2, C2, T2
213	fpsel	C3, F3, C3, T3
214	fpsel	C4, F4, C4, T4
215	bdnz	LL(102)
216	.align 4
217
/* Drain: fold the last preloaded group without issuing further loads. */
218LL(103):
219	fpabs	B1, A1
220	fpabs	B2, A2
221	fpabs	B3, A3
222	fpabs	B4, A4
223
224	fpabs	B5, A5
225	fpabs	B6, A6
226	fpabs	B7, A7
227	fpabs	B8, A8
228
229	fpadd	T1, B1, B2
230	fpadd	T2, B3, B4
231	fpadd	T3, B5, B6
232	fpadd	T4, B7, B8
233
234	fpsub	F1, T1, C1
235	fpsub	F2, T2, C2
236	fpsub	F3, T3, C3
237	fpsub	F4, T4, C4
238
239	fpsel	C1, F1, C1, T1
240	fpsel	C2, F2, C2, T2
241	fpsel	C3, F3, C3, T3
242	fpsel	C4, F4, C4, T4
243	.align 4
244
/* Pass 1 remainder: N & 7 elements, handled as 4 + 2 + 1. */
245LL(105):
246	andi.	r0,  N, 7
247	beq	LL(120)
248
/* Four elements, folded into C1 and C2. */
249	andi.	r0,  N, 4
250	beq	LL(106)
251
252	LFDUX	A1,   X, INCX2
253	LFDUX	A2,   X, INCX
254	LFDUX	A3,   X, INCX2
255	LFDUX	A4,   X, INCX
256
257	LFSDUX	A1,   X, INCX2
258	LFSDUX	A2,   X, INCX
259	LFSDUX	A3,   X, INCX2
260	LFSDUX	A4,   X, INCX
261
262	fpabs	A1, A1
263	fpabs	A2, A2
264	fpabs	A3, A3
265	fpabs	A4, A4
266
267	fpadd	A1, A1, A2
268	fpadd	A3, A3, A4
269
270	fpsub	F1, A1, C1
271	fpsub	F2, A3, C2
272
273	fpsel	C1, F1, C1, A1
274	fpsel	C2, F2, C2, A3
275	.align 4
276
/* Two elements: one in the primary halves of A1/A2, one in the secondary
   halves; folded into C1. */
277LL(106):
278	andi.	r0,  N, 2
279	beq	LL(107)
280
281	LFDUX	A1,   X, INCX2
282	LFDUX	A2,   X, INCX
283	LFSDUX	A1,   X, INCX2
284	LFSDUX	A2,   X, INCX
285
286	fpabs	A1, A1
287	fpabs	A2, A2
288
289	fpadd	A1, A1, A2
290
291	fpsub	F1, A1, C1
292	fpsel	C1, F1, C1, A1
293	.align 4
294
/* One element: scalar fabs/fadd/fsub/fsel, folded into C1. */
295LL(107):
296	andi.	r0,  N, 1
297	beq	LL(120)
298
299	LFDUX	A1,   X, INCX2
300	LFDUX	A2,   X, INCX
301
302	fabs	A1, A1
303	fabs	A2, A2
304
305	fadd	A1, A1, A2
306
307	fsub	F1, A1, C1
308	fsel	C1, F1, C1, A1
309	.align 4
310
/* Reduce the four accumulators: (C1, C2) -> C1, (C3, C4) -> C3, then
   (C1, C3) -> C1, using the same subtract + select rule as above. */
311LL(120):
312	fpsub	F1,  C2, C1
313	fpsub	F2,  C4, C3
314
315	fpsel	C1,  F1,  C1,  C2
316	fpsel	C3,  F2,  C3,  C4
317
318	fpsub	F1,  C3, C1
319	fpsel	C1,  F1,  C1,  C3
320
/* Combine the two halves of C1: the secondary half is moved into C2, a
   scalar subtract + select picks between it and the primary half, and
   the result is broadcast back to both halves of C1. */
321	fsmtp	C2, C1
322
/* Pass 2 starts counting from 0; RET is incremented before each compare. */
323	li	RET, 0
324	fsub	F1,  C2, C1
325	fsel	C1,  F1,  C1,  C2
326
327	fsmfp	C1, C1
328
/* Rewind XX by INCX2 so that the first update-load lands on the real part
   of the first element. */
329	sub	XX,  XX, INCX2
330
/* CTR = NN / 8 (all N elements are scanned this time). */
331	srawi.	r0, NN, 3
332	mtspr	CTR,  r0
333	beq-	LL(125)
334
/* Preload a group of 8 elements, same register layout as in pass 1. */
335	LFDUX	A1,   XX, INCX2
336	LFDUX	A2,   XX, INCX
337	LFDUX	A3,   XX, INCX2
338	LFDUX	A4,   XX, INCX
339
340	LFSDUX	A1,   XX, INCX2
341	LFSDUX	A2,   XX, INCX
342	LFSDUX	A3,   XX, INCX2
343	LFSDUX	A4,   XX, INCX
344
345	LFDUX	A5,   XX, INCX2
346	LFDUX	A6,   XX, INCX
347	LFDUX	A7,   XX, INCX2
348	LFDUX	A8,   XX, INCX
349
350	LFSDUX	A5,   XX, INCX2
351	LFSDUX	A6,   XX, INCX
352	LFSDUX	A7,   XX, INCX2
353	LFSDUX	A8,   XX, INCX
354
/* B1 = sums for group elements 1 (primary) and 3 (secondary),
   B2 = sums for group elements 2 (primary) and 4 (secondary). */
355	fpabs	T1, A1
356	fpabs	T2, A2
357	fpabs	T3, A3
358	fpabs	T4, A4
359
360	fpadd	B1, T1, T2
361	fpadd	B2, T3, T4
362
363 	bdz	LL(123)
364	.align 4
365
/* Pass 2 main loop.  Compares follow memory order: primary B1, primary B2,
   secondary B1, secondary B2, then the same for B3/B4.  fcmpu is used for
   the primary halves and fscmp for the secondary halves.  Loads for the
   next group and the fpabs for the other half of the current group are
   interleaved with the compares.  On the first equal compare the routine
   leaves with RET already holding the 1-based index. */
366LL(122):
367	LFDUX	A1,   XX, INCX2
368	fpabs	T1, A5
369	addi	RET, RET, 1
370	fcmpu	cr0, C1, B1
371	LFDUX	A2,   XX, INCX
372	beq	cr0, LL(999)
373
374	LFDUX	A3,   XX, INCX2
375	fpabs	T2, A6
376	addi	RET, RET, 1
377	fcmpu	cr0, C1, B2
378	LFDUX	A4,   XX, INCX
379	beq	cr0, LL(999)
380
381	LFSDUX	A1,   XX, INCX2
382	fpabs	T3, A7
383	addi	RET, RET, 1
384	fscmp	cr0, C1, B1
385	LFSDUX	A2,   XX, INCX
386	beq	cr0, LL(999)
387
388	LFSDUX	A3,   XX, INCX2
389	fpabs	T4, A8
390	addi	RET, RET, 1
391	fscmp	cr0, C1, B2
392	LFSDUX	A4,   XX, INCX
393	beq	cr0, LL(999)
394
/* Sums for group elements 5..8 (B3: 5 and 7, B4: 6 and 8). */
395	fpadd	B3, T1, T2
396	fpadd	B4, T3, T4
397
398	LFDUX	A5,   XX, INCX2
399	fpabs	T1, A1
400	addi	RET, RET, 1
401	fcmpu	cr0, C1, B3
402	LFDUX	A6,   XX, INCX
403	beq	cr0, LL(999)
404
405	LFDUX	A7,   XX, INCX2
406	fpabs	T2, A2
407	addi	RET, RET, 1
408	fcmpu	cr0, C1, B4
409	LFDUX	A8,   XX, INCX
410	beq	cr0, LL(999)
411
412	LFSDUX	A5,   XX, INCX2
413	fpabs	T3, A3
414	addi	RET, RET, 1
415	fscmp	cr0, C1, B3
416	LFSDUX	A6,   XX, INCX
417	beq	cr0, LL(999)
418
419	LFSDUX	A7,   XX, INCX2
420	fpabs	T4, A4
421	addi	RET, RET, 1
422	fscmp	cr0, C1, B4
423	LFSDUX	A8,   XX, INCX
424	beq	cr0, LL(999)
425
/* Sums for the first four elements of the group just loaded. */
426	fpadd	B1, T1, T2
427	fpadd	B2, T3, T4
428	bdnz	LL(122)
429	.align 4
430
/* Drain: compare the last preloaded group without further loads. */
431LL(123):
432	fpabs	T1, A5
433	addi	RET, RET, 1
434	fcmpu	cr0, C1, B1
435	beq	cr0, LL(999)
436
437	fpabs	T2, A6
438	addi	RET, RET, 1
439	fcmpu	cr0, C1, B2
440	beq	cr0, LL(999)
441
442	fpabs	T3, A7
443	addi	RET, RET, 1
444	fscmp	cr0, C1, B1
445	beq	cr0, LL(999)
446
447	fpabs	T4, A8
448	addi	RET, RET, 1
449	fscmp	cr0, C1, B2
450	beq	cr0, LL(999)
451
452	fpadd	B3, T1, T2
453	fpadd	B4, T3, T4
454
455	addi	RET, RET, 1
456	fcmpu	cr0, C1, B3
457	beq	cr0, LL(999)
458
459	addi	RET, RET, 1
460	fcmpu	cr0, C1, B4
461	beq	cr0, LL(999)
462
463	addi	RET, RET, 1
464	fscmp	cr0, C1, B3
465	beq	cr0, LL(999)
466
467	addi	RET, RET, 1
468	fscmp	cr0, C1, B4
469	beq	cr0, LL(999)
470	.align 4
471
/* Pass 2 remainder, four elements: paired abs/add, then compares in
   memory order (primary A1, primary A3, secondary A1, secondary A3). */
472LL(125):
473	andi.	r0,  NN, 4
474	beq	LL(126)
475
476	LFDUX	A1,   XX, INCX2
477	LFDUX	A2,   XX, INCX
478	LFDUX	A3,   XX, INCX2
479	LFDUX	A4,   XX, INCX
480
481	LFSDUX	A1,   XX, INCX2
482	LFSDUX	A2,   XX, INCX
483	LFSDUX	A3,   XX, INCX2
484	LFSDUX	A4,   XX, INCX
485
486	fpabs	A1, A1
487	fpabs	A2, A2
488	fpabs	A3, A3
489	fpabs	A4, A4
490
491	fpadd	A1, A1, A2
492	fpadd	A3, A3, A4
493
494	addi	RET, RET, 1
495	fcmpu	cr0, C1, A1
496	beq	cr0, LL(999)
497
498	addi	RET, RET, 1
499	fcmpu	cr0, C1, A3
500	beq	cr0, LL(999)
501
502	addi	RET, RET, 1
503	fscmp	cr0, C1, A1
504	beq	cr0, LL(999)
505
506	addi	RET, RET, 1
507	fscmp	cr0, C1, A3
508	beq	cr0, LL(999)
509	.align 4
510
/* Pass 2 remainder, two elements: scalar path using primary halves only. */
511LL(126):
512	andi.	r0,  NN, 2
513	beq	LL(127)
514
515	LFDUX	A1,   XX, INCX2
516	LFDUX	A2,   XX, INCX
517	LFDUX	A3,   XX, INCX2
518	LFDUX	A4,   XX, INCX
519
520	fabs	A1, A1
521	fabs	A2, A2
522	fabs	A3, A3
523	fabs	A4, A4
524
525	fadd	A1, A1, A2
526	fadd	A3, A3, A4
527
528	addi	RET, RET, 1
529	fcmpu	cr0, C1, A1
530	beq	cr0, LL(999)
531
532	addi	RET, RET, 1
533	fcmpu	cr0, C1, A3
534	beq	cr0, LL(999)
535	.align 4
536
/* Nothing matched so far: the index is advanced once more without loading
   or comparing, i.e. the one remaining element is taken as the answer.
   NOTE(review): NN & 1 is not tested here, so if no compare above ever
   matched and NN is even (presumably only possible when the reduced value
   is NaN) the result is NN + 1 -- confirm whether callers can hit this. */
537LL(127):
538	addi	RET, RET, 1
539	.align 4
540
/* Common exit: result to r3, then restore f27..f14 in reverse order of the
   saves.  SP is first moved down by 16 because lfpdux adds r10 (= 16)
   before each load; after the last load SP is 16 below its entry value,
   which the final addi corrects. */
541LL(999):
542	li	r10, 16
543	addi	SP, SP, -16
544	mr	r3, RET
545
546	lfpdux	f27, SP, r10
547	lfpdux	f26, SP, r10
548	lfpdux	f25, SP, r10
549	lfpdux	f24, SP, r10
550
551	lfpdux	f23, SP, r10
552	lfpdux	f22, SP, r10
553	lfpdux	f21, SP, r10
554	lfpdux	f20, SP, r10
555
556	lfpdux	f19, SP, r10
557	lfpdux	f18, SP, r10
558	lfpdux	f17, SP, r10
559	lfpdux	f16, SP, r10
560
561	lfpdux	f15, SP, r10
562	lfpdux	f14, SP, r10
563	addi	SP, SP,  16
564	blr
565
566	EPILOGUE
567