/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* ASSEMBLER is defined before pulling in the project header; it       */
/* presumably selects the assembler-side definitions there (confirm    */
/* in common.h).                                                       */
#define ASSEMBLER
#include "common.h"

/* Register roles used by this kernel.                                 */
/*   N          element count                                          */
/*   X, Y       addresses of the two input vectors                     */
/*   INCX, INCY element strides, scaled to byte strides by the kernel  */
/*   PRE        byte offset used by the optional dcbt prefetches       */
/* NOTE(review): r3-r7 look like the incoming argument registers of    */
/* the PowerPC calling convention; confirm against the C prototype.    */
#define N	r3
#define X	r4
#define INCX	r5
#define Y	r6
#define INCY	r7
#define PRE	r8

/* f0 is first loaded with 0.0 and afterwards serves as accumulator 0. */
#define FZERO	f0

/* Stack frame: bytes 0..79 hold the saved f14-f23, bytes 80..83 are a */
/* scratch word used to materialize 0.0; the rest is not referenced.   */
#define STACKSIZE 96

/*
 * Strided dot product: accumulates X[i] * Y[i] over N elements and
 * leaves the total in f1.
 *
 * Inputs (see the register aliases defined for this file):
 *   N           element count; N <= 0 returns 0.0
 *   X, Y        address of the first element of each vector
 *   INCX, INCY  strides in elements (shifted left by BASE_SHIFT below
 *               to obtain byte strides)
 *   With F_INTERFACE defined, N, INCX and INCY are passed by reference,
 *   and a negative stride rebases the corresponding pointer by
 *   (N - 1) * stride before the walk starts.
 *
 * Output:
 *   f1 = ((f0+f1) + (f2+f3)) + ((f4+f5) + (f6+f7)), the pairwise sum of
 *   eight independent partial accumulators.  This order differs from a
 *   sequential sum, so rounding may differ from a naive loop.
 *
 * Registers: r0, PRE, CTR and cr0 are scratch; f0-f7 are accumulators,
 * f8-f15 hold X operands and f16-f23 hold Y operands.  f14-f23 are
 * saved on entry and restored before returning.
 *
 * LFDUX, FMADD, FADD, LDINT, BASE_SHIFT and SIZE come from common.h.
 * The comments below assume LFDUX is an update-form indexed load
 * (load from base + index, then base = base + index) and that
 * FMADD d, a, c, b computes d = a * c + b -- confirm in common.h.
 */
	PROLOGUE
	PROFCODE

/* Carve the frame with a plain addi; offset 0 is used as a save slot, */
/* so no back chain word is kept.  r0 = 0 feeds the 0.0 constant.      */
	addi	SP, SP, -STACKSIZE
	li	r0,   0

/* Save the floating-point registers that are restored at LL(999). */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)

/* Build 0.0 by storing an all-zero word and reloading it as a float. */
	stw	r0,    80(SP)
	lfs	FZERO, 80(SP)

#ifdef F_INTERFACE
/* By-reference interface: replace each pointer with the value it */
/* points to.                                                     */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
	LDINT	INCY, 0(INCY)
#endif

/* Convert element strides to byte strides. */
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

/* Clear accumulators f1-f7 (f0 already holds 0.0). */
	fmr	f1,  FZERO
	fmr	f2,  FZERO
	fmr	f3,  FZERO
	fmr	f4,  FZERO
	fmr	f5,  FZERO
	fmr	f6,  FZERO
	fmr	f7,  FZERO

/* Prefetch distance for dcbt: 3 * 16 * SIZE bytes past the current */
/* pointer (presumably three 16-element groups; SIZE is defined in  */
/* common.h).                                                       */
	li	PRE, 3 * 16 * SIZE

/* Nothing to do for N <= 0: fall into the reduction with all */
/* accumulators still zero.                                   */
	cmpwi	cr0, N, 0
	ble-	LL(999)

#ifdef F_INTERFACE
/* Negative stride: X -= (N - 1) * INCX.  INCX is negative here, so X */
/* moves up to the highest address that will be read.                 */
	cmpwi	cr0, INCX, 0
	bge+	LL(102)

	subi	r0, N, 1
	mullw	r0, r0, INCX
	sub	X, X, r0
	.align 4

LL(102):
/* Same rebasing for Y when INCY is negative. */
	cmpwi	cr0, INCY, 0
	bge+	LL(104)

	subi	r0, N, 1
	mullw	r0, r0, INCY
	sub	Y, Y, r0
	.align 4

LL(104):
#endif
/* Pre-bias both pointers by one stride so that the first update-form */
/* load lands on element 0.                                           */
	sub	X, X, INCX
	sub	Y, Y, INCY

/* CTR = N / 16 = number of full 16-element groups.  With no full */
/* group, go straight to the scalar remainder loop.               */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(150)

/* Pipeline prologue: preload the first 8 X/Y pairs of the group. */
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY

	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY

	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY
/* Only one group in total: skip the steady-state loop and drain. */
	bdz	LL(120)
	.align 4

LL(110):
/* Steady state, 16 elements per pass.  Each FMADD consumes a pair    */
/* loaded earlier and is immediately followed by the loads that       */
/* refill the same registers, so 8 pairs are always in flight.        */
/* Statement order is part of the schedule; do not reorder.           */
/* First half: elements 0-7 of the group feed accumulators f0-f7.     */
	FMADD	f0,  f8,  f16, f0
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
#ifdef PPCG4
	dcbt	X, PRE
#endif
	FMADD	f1,  f9,  f17, f1
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY
	FMADD	f2,  f10, f18, f2
	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
#ifdef PPCG4
	dcbt	Y, PRE
#endif
	FMADD	f3,  f11, f19, f3
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X, PRE
#endif
	FMADD	f5,  f13, f21, f5
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY
	FMADD	f6,  f14, f22, f6
	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	Y, PRE
#endif
	FMADD	f7,  f15, f23, f7
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

/* Second half: elements 8-15 of the group, same accumulators. */
	FMADD	f0,  f8,  f16, f0
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
#ifdef PPCG4
	dcbt	X, PRE
#endif
	FMADD	f1,  f9,  f17, f1
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY
	FMADD	f2,  f10, f18, f2
	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
#ifdef PPCG4
	dcbt	Y, PRE
#endif
	FMADD	f3,  f11, f19, f3
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X, PRE
#endif
	FMADD	f5,  f13, f21, f5
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY
	FMADD	f6,  f14, f22, f6
	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	Y, PRE
#endif
	FMADD	f7,  f15, f23, f7
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY
	bdnz	LL(110)
	.align 4

LL(120):
/* Pipeline drain for the last group: consume the 8 pairs already in */
/* registers while loading the final 8 pairs ...                     */
	FMADD	f0,  f8,  f16, f0
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	FMADD	f1,  f9,  f17, f1
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY
	FMADD	f2,  f10, f18, f2
	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	FMADD	f3,  f11, f19, f3
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	FMADD	f5,  f13, f21, f5
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY
	FMADD	f6,  f14, f22, f6
	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	FMADD	f7,  f15, f23, f7
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

/* ... then consume those final 8 pairs without issuing more loads, */
/* so nothing beyond the last full group is read here.              */
	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3
	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7
	.align 4

LL(150):
/* Remainder: CTR = N mod 16 elements left over after the full groups. */
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
/* One element per pass, accumulated into f0 only. */
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	FMADD	f0,  f8,  f16, f0
	bdnz	LL(160)
	.align 4

LL(999):
/* Pairwise reduction of the eight accumulators; the result is written */
/* to f1 (presumably the floating-point return register of the ABI).   */
	FADD	f0,  f0,  f1
	FADD	f2,  f2,  f3
	FADD	f4,  f4,  f5
	FADD	f6,  f6,  f7

	FADD	f0,  f0,  f2
	FADD	f4,  f4,  f6
	FADD	f1,  f0,  f4

/* Restore the saved floating-point registers and release the frame. */
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
