/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Complex dot-product kernel for MIPS64 (GNU as syntax, run through the
 * C preprocessor).
 *
 * Inputs (integer registers, see the aliases below):
 *   N     element count; with F_INTERFACE the register holds the address
 *         of the count and is dereferenced on entry
 *   X     address of the first vector
 *   INCX  stride of X in elements (address of it with F_INTERFACE)
 *   Y     address of the second vector
 *   INCY  stride of Y, same convention as INCX
 *
 * Each element is read as two consecutive SIZE-wide words, at offsets
 * 0 * SIZE and 1 * SIZE (presumably real and imaginary part - confirm
 * against the caller).  Writing x0/x1 and y0/y1 for those two words, the
 * four accumulators collect
 *
 *   s1 += x0 * y0      s2 += x1 * y0
 *   s3 += x0 * y1      s4 += x1 * y1
 *
 * and are folded at .L999 into the result pair (s1, s3):
 *
 *   without CONJ:  s1 = s1 - s4,  s3 = s3 + s2
 *   with    CONJ:  s1 = s1 + s4,  s3 = s3 - s2
 *
 * If N <= 0 both results are zero.
 *
 * PROLOGUE, EPILOGUE, LD, LDINT, MTC, MOV, MADD, ADD, SUB, NOP, SIZE and
 * ZBASE_SHIFT are not defined in this file; they are expected to come
 * from common.h.  MADD d, c, a, b is presumably d = c + a * b.
 *
 * The code relies on MIPS branch delay slots: the instruction written
 * directly after every branch or jump is executed whether or not the
 * branch is taken.  The loops are software-pipelined, so the order of
 * loads, multiply-adds and pointer updates must not be changed casually.
 */

/* Integer argument registers. */
#define N	$4
#define	X	$5
#define INCX	$6
#define Y	$7
#define INCY	$8

/* Integer scratch registers. */
#define I	$2	/* loop counter */
#define TEMP	$3	/* unit-stride constant, later a byte offset */

/* Floating-point operand registers: two words of an X element (aN)
   and of a Y element (bN), double-buffered as 1/2 and 3/4. */
#define a1	$f4
#define a2	$f5
#define a3	$f6
#define a4	$f7
#define b1	$f8
#define b2	$f9
#define b3	$f10
#define b4	$f11

/* Floating-point accumulators; s1 and s3 carry the final result. */
#define s1	$f0
#define s2	$f1
#define s3	$f2
#define s4	$f3


	PROLOGUE

#ifdef F_INTERFACE
	/* Arguments arrive by reference: fetch the integer values. */
	LDINT	N,     0(N)
	LDINT	INCX,  0(INCX)
	LDINT	INCY,  0(INCY)
#endif

	/* Clear the four accumulators (integer zero moved into s1, then
	   copied down the chain). */
	MTC	$0,  s1

	MOV	s2, s1
	MOV	s3, s2
	MOV	s4, s3

	/* Scale the strides by ZBASE_SHIFT (presumably into bytes); TEMP is
	   the scaled stride of a contiguous vector, i.e. one element of
	   2 * SIZE bytes. */
	dsll	INCX, INCX, ZBASE_SHIFT
	li	TEMP, 2 * SIZE

	blez	N, .L999			/* nothing to do: result stays zero */
	dsll	INCY, INCY, ZBASE_SHIFT		/* (delay slot) */

	bne	INCX, TEMP, .L20		/* X not contiguous -> strided path */
	dsra	I, N, 2				/* (delay slot) I = N / 4, valid on both paths */

	bne	INCY, TEMP, .L20		/* Y not contiguous -> strided path */
	NOP

	/* ---- Contiguous path: blocks of 4 elements ---- */
	blez	I, .L15				/* fewer than 4 elements */
	NOP

	/* Prime the pipeline with the first element of the first block. */
	LD	a1,  0 * SIZE(X)
	LD	a2,  1 * SIZE(X)
	LD	b1,  0 * SIZE(Y)
	daddiu	I, I, -1

	blez	I, .L14				/* exactly one block: drain only */
	LD	b2,  1 * SIZE(Y)		/* (delay slot) */
	.align 3

/* Main loop: consumes 4 elements per pass.  Each group of multiply-adds
   works on the element loaded by the previous group; the final loads
   (offsets 8 and 9) fetch the first element of the next block. */
.L13:
	MADD	s1, s1, a1, b1
	LD	a3,  2 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  3 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  2 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  3 * SIZE(Y)

	MADD	s1, s1, a3, b3
	LD	a1,  4 * SIZE(X)
	MADD	s2, s2, a4, b3
	LD	a2,  5 * SIZE(X)
	MADD	s3, s3, a3, b4
	LD	b1,  4 * SIZE(Y)
	MADD	s4, s4, a4, b4
	LD	b2,  5 * SIZE(Y)

	MADD	s1, s1, a1, b1
	LD	a3,  6 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  7 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  6 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  7 * SIZE(Y)

	MADD	s1, s1, a3, b3
	LD	a1,  8 * SIZE(X)
	MADD	s2, s2, a4, b3
	LD	a2,  9 * SIZE(X)
	MADD	s3, s3, a3, b4
	LD	b1,  8 * SIZE(Y)
	MADD	s4, s4, a4, b4
	LD	b2,  9 * SIZE(Y)

	daddiu	I, I, -1
	daddiu	X, X, 8 * SIZE

	bgtz	I, .L13
	daddiu	Y, Y, 8 * SIZE			/* (delay slot) */
	.align 3

/* Drain: last block of 4.  Same as one loop pass but without the
   look-ahead loads at offsets 8 and 9. */
.L14:
	MADD	s1, s1, a1, b1
	LD	a3,  2 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  3 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  2 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  3 * SIZE(Y)

	MADD	s1, s1, a3, b3
	LD	a1,  4 * SIZE(X)
	MADD	s2, s2, a4, b3
	LD	a2,  5 * SIZE(X)
	MADD	s3, s3, a3, b4
	LD	b1,  4 * SIZE(Y)
	MADD	s4, s4, a4, b4
	LD	b2,  5 * SIZE(Y)

	MADD	s1, s1, a1, b1
	LD	a3,  6 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  7 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  6 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  7 * SIZE(Y)

	MADD	s1, s1, a3, b3
	daddiu	X, X, 8 * SIZE
	MADD	s2, s2, a4, b3
	daddiu	Y, Y, 8 * SIZE
	MADD	s3, s3, a3, b4
	MADD	s4, s4, a4, b4
	.align 3

/* Contiguous remainder: the N & 3 elements left after the blocks of 4,
   one element per pass, again with the next element loaded early. */
.L15:
	andi	I,  N, 3

	blez	I, .L999
	NOP

	LD	a1,  0 * SIZE(X)
	LD	a2,  1 * SIZE(X)

	LD	b1,  0 * SIZE(Y)
	daddiu	I, I, -1

	blez	I, .L17				/* single leftover element: drain only */
	LD	b2,  1 * SIZE(Y)		/* (delay slot) */
	.align	3

.L16:
	MADD	s1, s1, a1, b1
	daddiu	I, I, -1
	MADD	s2, s2, a2, b1
	LD	b1,  2 * SIZE(Y)		/* b1 is free after its second use */
	MADD	s3, s3, a1, b2
	LD	a1,  2 * SIZE(X)
	MADD	s4, s4, a2, b2
	LD	a2,  3 * SIZE(X)

	LD	b2,  3 * SIZE(Y)
	daddiu	X, X, 2 * SIZE

	bgtz	I, .L16
	daddiu	Y, Y, 2 * SIZE			/* (delay slot) */
	.align 3

/* Drain for the contiguous remainder: last loaded element. */
.L17:
	MADD	s1, s1, a1, b1
	MADD	s2, s2, a2, b1
	NOP
	MADD	s3, s3, a1, b2
	j	.L999
	MADD	s4, s4, a2, b2			/* (delay slot) */
	.align 3

/* ---- Strided path: at least one vector is not contiguous ---- */
.L20:
#ifdef F_INTERFACE
	/* For a negative stride, subtract (N - 1) * stride (a negative byte
	   offset) from the base pointer so that stepping by the negative
	   stride walks back towards the original base.  The product is a
	   64-bit byte offset, so it is formed with the 64-bit dmult; a
	   32-bit mult would truncate it for very long vectors. */
	bgez	INCX, .L21
	daddiu	TEMP, N, -1			/* (delay slot) TEMP = N - 1 */

	dmult	TEMP, INCX

	mflo	TEMP
	dsub	X, X, TEMP
	.align 3

.L21:
	bgez	INCY, .L22
	daddiu	TEMP, N, -1			/* (delay slot) TEMP = N - 1 */

	dmult	TEMP, INCY

	mflo	TEMP
	dsub	Y, Y, TEMP
	.align 3

.L22:
#endif
	blez	I, .L25				/* I = N / 4 from the entry sequence */
	NOP

	/* Prime the pipeline with the first element. */
	LD	a1,  0 * SIZE(X)
	LD	a2,  1 * SIZE(X)
	LD	b1,  0 * SIZE(Y)
	LD	b2,  1 * SIZE(Y)

	dadd	X, X, INCX
	daddiu	I, I, -1

	blez	I, .L24				/* exactly one block: drain only */
	dadd	Y, Y, INCY			/* (delay slot) */
	.align 3

/* Strided main loop: 4 elements per pass.  X and Y always point at the
   next element to load; they are advanced after each element's loads. */
.L23:
	MADD	s1, s1, a1, b1
	LD	a3,  0 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  1 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  0 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  1 * SIZE(Y)

	dadd	X, X, INCX
	dadd	Y, Y, INCY

	MADD	s1, s1, a3, b3
	LD	a1,  0 * SIZE(X)
	MADD	s2, s2, a4, b3
	LD	a2,  1 * SIZE(X)
	MADD	s3, s3, a3, b4
	LD	b1,  0 * SIZE(Y)
	MADD	s4, s4, a4, b4
	LD	b2,  1 * SIZE(Y)

	dadd	X, X, INCX
	dadd	Y, Y, INCY

	MADD	s1, s1, a1, b1
	LD	a3,  0 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  1 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  0 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  1 * SIZE(Y)

	dadd	X, X, INCX
	dadd	Y, Y, INCY

	MADD	s1, s1, a3, b3
	LD	a1,  0 * SIZE(X)		/* first element of the next block */
	MADD	s2, s2, a4, b3
	LD	a2,  1 * SIZE(X)
	MADD	s3, s3, a3, b4
	LD	b1,  0 * SIZE(Y)
	MADD	s4, s4, a4, b4
	LD	b2,  1 * SIZE(Y)

	dadd	X, X, INCX
	daddiu	I, I, -1

	bgtz	I, .L23
	dadd	Y, Y, INCY			/* (delay slot) */
	.align 3

/* Strided drain: last block of 4, without the look-ahead loads. */
.L24:
	MADD	s1, s1, a1, b1
	LD	a3,  0 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  1 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  0 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  1 * SIZE(Y)

	dadd	X, X, INCX
	dadd	Y, Y, INCY

	MADD	s1, s1, a3, b3
	LD	a1,  0 * SIZE(X)
	MADD	s2, s2, a4, b3
	LD	a2,  1 * SIZE(X)
	MADD	s3, s3, a3, b4
	LD	b1,  0 * SIZE(Y)
	MADD	s4, s4, a4, b4
	LD	b2,  1 * SIZE(Y)

	dadd	X, X, INCX
	dadd	Y, Y, INCY

	MADD	s1, s1, a1, b1
	LD	a3,  0 * SIZE(X)
	MADD	s2, s2, a2, b1
	LD	a4,  1 * SIZE(X)
	MADD	s3, s3, a1, b2
	LD	b3,  0 * SIZE(Y)
	MADD	s4, s4, a2, b2
	LD	b4,  1 * SIZE(Y)

	MADD	s1, s1, a3, b3
	dadd	X, X, INCX
	MADD	s2, s2, a4, b3
	dadd	Y, Y, INCY
	MADD	s3, s3, a3, b4
	MADD	s4, s4, a4, b4
	.align 3

/* Strided remainder: the N & 3 leftover elements, handled one at a time
   with a plain load / multiply-add / advance loop (no pipelining). */
.L25:
	andi	I,  N, 3

	blez	I, .L999
	NOP
	.align	3

.L26:
	LD	a1,  0 * SIZE(X)
	LD	a2,  1 * SIZE(X)
	LD	b1,  0 * SIZE(Y)
	LD	b2,  1 * SIZE(Y)

	MADD	s1, s1, a1, b1
	MADD	s2, s2, a2, b1
	MADD	s3, s3, a1, b2
	MADD	s4, s4, a2, b2


	dadd	X, X, INCX
	dadd	Y, Y, INCY

	daddiu	I, I, -1

	bgtz	I, .L26
	NOP
	.align 3

/* Common exit: fold the four partial sums into the result pair
   (s1, s3) and return.  The second fold sits in the delay slot of the
   return jump. */
.L999:
	NOP
#ifndef CONJ
	SUB	s1, s1, s4
#else
	ADD	s1, s1, s4
#endif

	j	$31
#ifndef CONJ
	ADD	s3, s3, s2			/* (delay slot) */
#else
	SUB	s3, s3, s2			/* (delay slot) */
#endif

	EPILOGUE
