/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Arguments as received by the routine.  When F_INTERFACE is defined,  */
/* N and INCX arrive as addresses and are dereferenced on entry.        */
#define N	r3
#define X	r4
#define INCX	r5

/* INCX2: twice the scaled INCX, used as the update step of every load. */
/* X2:    second load cursor, initialised to X + SIZE on the strided    */
/*        paths.                                                        */
/* FLAG:  set to 1 when the unit-stride path peels one leading value;   */
/*        the matching trailing value is then added at LL(99).          */
#define INCX2	r6
#define X2	r7
#define FLAG	r8

/* Running sums.  Note that C1 is f1 and C2 is f0; the final result is  */
/* left in C1.                                                          */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* Values loaded from X. */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/* Absolute values waiting to be added into C1-C4.  T3 and T4 map to    */
/* f14 and f15, which the routine saves on entry and restores on exit.  */
#define T1	f12
#define T2	f13
#define T3	f14
#define T4	f15

/*
 * Sum of absolute values over N two-value entries of X.
 *
 * Every entry consists of two consecutive values (presumably the real and
 * imaginary parts of a complex number; the exported routine name comes
 * from PROLOGUE in common.h and is not visible in this file).  The
 * absolute values of both members of every entry are accumulated and the
 * scalar total is left in C1 (f1).
 *
 * In:   N    = number of entries        (address of it under F_INTERFACE)
 *       X    = address of the first entry
 *       INCX = stride between entries   (address of it under F_INTERFACE)
 * Out:  C1 (f1) = accumulated sum; zero when N <= 0 or INCX <= 0
 * Uses: r0, r10, CTR, cr0, INCX2, X2, FLAG, f0-f15.  f14 and f15 are
 *       spilled below SP on entry and reloaded before returning.
 *
 * Code paths:
 *   LL(05)   unit stride, paired loads.  When X is not 2*SIZE aligned one
 *            leading value is peeled first and one trailing value is
 *            added at LL(99).
 *   LL(100)  non-unit stride, X 2*SIZE aligned: one paired load per entry.
 *   LL(200)  non-unit stride, X not 2*SIZE aligned: the two values of an
 *            entry are fetched separately through X and X2.
 *
 * NOTE(review): LFD, LFPDUX, LFDUX, LFSDUX and LDINT are macros from
 * common.h.  The comments below assume they are the scalar, paired,
 * primary-half and secondary-half load-with-update forms their names
 * suggest -- confirm against common.h.
 */
	PROLOGUE
	PROFCODE

	/* Spill f14/f15 (T3/T4) below SP using update-form stores. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Push 16 bytes of zeros; they are loaded into C1 just below. */
	li	r10,   0
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)

#ifdef F_INTERFACE
	/* N and INCX were passed by address: fetch the actual values. */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	lfpdx	C1, SP, r10		# Zero clear

	/* Scale INCX by 2^BASE_SHIFT (it is compared against SIZE below, */
	/* so presumably this yields bytes); INCX2 is twice that and is   */
	/* the step between consecutive entries.                          */
	slwi	INCX,  INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	/* Clear the remaining accumulators and the peel flag. */
	fpmr	C2, C1
	fpmr	C3, C1
	li	FLAG, 0
	fpmr	C4, C1

	/* Nothing to sum for N <= 0 or INCX <= 0: return the zero total. */
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	ble	LL(999)

	/* Bias X back by one step: every load below advances X by INCX2 */
	/* before it reads.                                               */
	sub	X,  X, INCX2

	/* Scaled INCX == SIZE means unit stride; otherwise go to LL(100). */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

	/* Unit stride: paired loads can be used directly when X is a */
	/* multiple of 2*SIZE.                                        */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(05)

	/* Otherwise peel the first value into C1, move X forward by one */
	/* value, and count one entry less.  The remaining pairs straddle */
	/* entry boundaries; the value left over at the end is added at   */
	/* LL(99) because FLAG is set.                                    */
	LFD	C1, 2 * SIZE(X)
	li	FLAG, 1
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fabs	C1, C1
	ble	LL(99)
	.align 4

LL(05):
	/* CTR = N / 8: each pass of LL(12) consumes eight paired loads. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(15)

	/* Prime A1-A8 and set T1-T4 to zero (C2 is still zero here) so */
	/* the first round of adds contributes nothing.                 */
	LFPDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFPDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFPDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFPDUX	A4,   X, INCX2
	fpmr	T4, C2
	LFPDUX	A5,   X, INCX2
	LFPDUX	A6,   X, INCX2
	LFPDUX	A7,   X, INCX2
	LFPDUX	A8,   X, INCX2
	bdz	LL(13)
	.align 4

LL(12):
	/* Steady state.  Each group adds the previously computed absolute */
	/* values into an accumulator, takes the absolute value of a       */
	/* register loaded one pass earlier, and reloads that register.    */
	/* The nops are part of the original schedule (presumably issue    */
	/* padding) and are kept as they were.                             */
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7,   X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8,   X, INCX2
	bdnz	LL(12)
	.align 4

LL(13):
	/* Drain: fold in what is already in T1-T4 and A1-A8, no loads. */
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(15):
	/* Remainder N mod 8, handled as 4 + 2 + 1 paired loads. */
	andi.	r0,  N, 7
	beq	LL(99)
	andi.	r0,  N, 4
	beq	LL(16)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(16):
	andi.	r0,  N, 2
	beq	LL(17)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(17):
	andi.	r0,  N, 1
	beq	LL(99)

	LFPDUX	A1,    X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	.align 4

LL(99):
	/* If a leading value was peeled (FLAG != 0), one value is still */
	/* outstanding just past the last pair read; add it with scalar  */
	/* fabs/fadd.                                                    */
	cmpwi	cr0, FLAG, 0
	beq	LL(999)

	LFD	A1, 2 * SIZE(X)
	fabs	T1, A1
	fadd	C2, C2, T1
	b	LL(999)
	.align 4

LL(100):
	/* Non-unit stride.  X2 tracks the second value of each entry and */
	/* is only used on the unaligned path at LL(200).                 */
	addi	X2, X, SIZE
	andi.	r0, X, 2 * SIZE - 1
	bne	LL(200)

	/* X is 2*SIZE aligned: one paired load per entry, eight entries */
	/* per pass, same pipeline shape as LL(05)/LL(12)/LL(13).        */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(115)

	LFPDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFPDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFPDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFPDUX	A4,   X, INCX2
	fpmr	T4, C2
	LFPDUX	A5,   X, INCX2
	LFPDUX	A6,   X, INCX2
	LFPDUX	A7,   X, INCX2
	LFPDUX	A8,   X, INCX2
	bdz	LL(113)
	.align 4

LL(112):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7,   X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8,   X, INCX2
	bdnz	LL(112)
	.align 4

LL(113):
	/* Drain the pipeline of the aligned strided loop. */
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(115):
	/* Remainder N mod 8 for the aligned strided path (4 + 2 + 1). */
	andi.	r0,  N, 7
	beq	LL(999)
	andi.	r0,  N, 4
	beq	LL(116)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(116):
	andi.	r0,  N, 2
	beq	LL(117)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(117):
	andi.	r0,  N, 1
	beq	LL(999)

	LFPDUX	A1,    X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	b	LL(999)
	.align 4

LL(200):
	/* Non-unit stride with X not 2*SIZE aligned.  The first value of */
	/* each entry is read through X (LFDUX) and the second through    */
	/* X2 (LFSDUX) into the same A register, so each register again   */
	/* holds one whole entry for the paired fpabs/fpadd.              */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(215)


	/* Prime: first values of entries 0-7, second values of entries */
	/* 0-3; T1-T4 start at zero (C2 is still zero here).            */
	LFDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFDUX	A4,   X, INCX2
	fpmr	T4, C2

	LFDUX	A5,   X, INCX2
	LFSDUX	A1,   X2, INCX2

	LFDUX	A6,   X, INCX2
	LFSDUX	A2,   X2, INCX2

	LFDUX	A7,   X, INCX2
	LFSDUX	A3,   X2, INCX2

	LFDUX	A8,   X, INCX2
	LFSDUX	A4,   X2, INCX2
	bdz	LL(213)
	.align 4

LL(212):
	/* Steady state.  The X2 loads run four entries behind the X   */
	/* loads, so a register is only consumed by fpabs after both of */
	/* its halves have been filled.                                 */
	fpadd	C1, C1, T1
	LFSDUX	A5,   X2, INCX2
	fpabs	T1, A1
	LFDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	LFSDUX	A6,   X2, INCX2
	fpabs	T2, A2
	LFDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A7,   X2, INCX2
	fpabs	T3, A3
	LFDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	LFSDUX	A8,   X2, INCX2
	fpabs	T4, A4
	LFDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	LFSDUX	A1,   X2, INCX2
	fpabs	T1, A5
	LFDUX	A5,   X, INCX2
	fpadd	C2, C2, T2
	LFSDUX	A2,   X2, INCX2
	fpabs	T2, A6
	LFDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A3,   X2, INCX2
	fpabs	T3, A7
	LFDUX	A7,   X, INCX2
	fpadd	C4, C4, T4
	LFSDUX	A4,   X2, INCX2
	fpabs	T4, A8
	LFDUX	A8,   X, INCX2

	bdnz	LL(212)
	.align 4

LL(213):
	/* Drain: fetch the four second values still missing for A5-A8, */
	/* then fold everything into the accumulators.                  */
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFSDUX	A5,   X2, INCX2
	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFSDUX	A6,   X2, INCX2
	fpadd	C3, C3, T3

	nop
	fpabs	T3, A3
	LFSDUX	A7,   X2, INCX2
	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFSDUX	A8,   X2, INCX2

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(215):
	/* Remainder N mod 8 for the unaligned strided path.  Here every */
	/* value gets its own register (X and X2 alternate) and the      */
	/* scalar fabs/fadd forms are used.                              */
	andi.	r0,  N, 7
	beq	LL(999)
	andi.	r0,  N, 4
	beq	LL(216)

	/* Four entries = eight values. */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	LFDUX	A5,    X,  INCX2
	fabs	T2, A2
	LFDUX	A6,    X2, INCX2
	fabs	T3, A3
	LFDUX	A7,    X,  INCX2
	fabs	T4, A4
	LFDUX	A8,    X2, INCX2

	fadd	C1, C1, T1
	fabs	T1, A5
	fadd	C2, C2, T2
	fabs	T2, A6

	fadd	C3, C3, T3
	fabs	T3, A7
	fadd	C4, C4, T4
	fabs	T4, A8

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(216):
	/* Two entries = four values. */
	andi.	r0,  N, 2
	beq	LL(217)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fabs	T3, A3
	fabs	T4, A4

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(217):
	/* One entry = two values. */
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fadd	C1, C1, T1
	fadd	C2, C2, T2
	.align 4

LL(999):
	/* Common exit.  Combine the four accumulators into C1, reload   */
	/* f15 and f14 with update-form loads (SP moves up 16 each time; */
	/* the first step skips the zero words pushed on entry), then    */
	/* add the secondary half of C1 (copied into C2 by fsmtp) to its */
	/* primary half to obtain the scalar result in C1.  The final    */
	/* addi releases the last 16 bytes of the stack area.            */
	fpadd	C1,  C1,  C2
	li	r10, 16
	fpadd	C3,  C3,  C4
	fpadd	C1,  C1,  C3
	lfpdux	f15, SP, r10
	fsmtp	C2, C1
	lfpdux	f14, SP, r10
	addi	SP, SP,  16
	fadd	C1, C2, C1
	blr

	EPILOGUE
