/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Register aliases used by the rotation kernel below.  The roles are
 * described as the kernel uses them; the mapping of these registers to
 * incoming arguments is presumably dictated by the target calling
 * convention -- confirm against the C-side prototype.
 */

/* Integer inputs. */
#define N	$4	/* number of element pairs to process */
#define X	$5	/* current read address in the first vector */
#define INCX	$6	/* stride of X; shifted left by BASE_SHIFT on entry */
#define Y	$7	/* current read address in the second vector */
#define INCY	$8	/* stride of Y; shifted left by BASE_SHIFT on entry */

/* Trailing store addresses, used only on the strided path. */
#define XX	$9
#define YY	$10

/* Floating-point inputs: the two rotation coefficients. */
#define C	$f17
#define S	$f18

/* Integer scratch. */
#define I	$2	/* loop counter */
#define TEMP	$3	/* holds SIZE for the unit-stride test */

/* Values loaded from X (four per unrolled group). */
#define a1	$f4
#define a2	$f5
#define a3	$f6
#define a4	$f7

/* Values loaded from Y (four per unrolled group). */
#define b1	$f8
#define b2	$f9
#define b3	$f10
#define b4	$f11

/* Product/result temporaries that are written back to X and Y. */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3

/*
 * Plane-rotation kernel.
 *
 * Walks N element pairs (x, y) taken from X and Y and overwrites each
 * pair in place with
 *
 *     x_new = C * x + S * y
 *     y_new = C * y - S * x
 *
 * NOTE(review): the formulas assume the common.h macros follow the MIPS
 * fused forms, i.e. "MADD d, a, b, c" yields a + b * c and
 * "NMSUB d, a, b, c" yields a - b * c -- confirm in common.h.
 *
 * Inputs (see the register aliases above):
 *     N           element count; N <= 0 returns immediately
 *     X, INCX     first vector and its stride
 *     Y, INCY     second vector and its stride
 *     C, S        rotation coefficients
 *
 * INCX/INCY are shifted left by BASE_SHIFT and compared against SIZE to
 * detect contiguous vectors (presumably SIZE == 1 << BASE_SHIFT, so the
 * shift turns an element stride into a byte stride -- confirm).
 *
 * Layout:
 *     .L12/.L13   contiguous path, 4 elements per iteration (.L13 drains
 *                 the final group without preloading a further one)
 *     .L15/.L16   contiguous path, remaining N & 3 elements
 *     .L22/.L23   strided path, 4 elements per iteration; loads advance
 *                 X/Y while stores trail behind through XX/YY
 *     .L25/.L26   strided path, remaining N & 3 elements
 *
 * The instruction immediately after each branch is written as a branch
 * delay slot (it runs whether or not the branch is taken); this relies
 * on the assembler not reordering, presumably arranged by PROLOGUE.
 * The unrolled loops are software-pipelined, so instruction order is
 * significant and must not be rearranged casually.
 */
	PROLOGUE

	dsll	INCX, INCX, BASE_SHIFT	/* scale X stride */
	li	TEMP, SIZE		/* scaled stride of a contiguous vector */

	blez	N, .L999		/* nothing to do for N <= 0 */
	dsll	INCY, INCY, BASE_SHIFT	/* (delay slot) scale Y stride */

	bne	INCX, TEMP, .L20	/* X not contiguous: strided path */
	dsra	I, N, 2			/* (delay slot) I = N / 4 unrolled groups */

	bne	INCY, TEMP, .L20	/* Y not contiguous: strided path */
	NOP

	/* ---- Contiguous path ---- */
	blez	I, .L15			/* fewer than 4 elements: tail only */
	daddiu	I, I, -1		/* (delay slot) last group is drained at .L13 */

	/* Preload the first group of four and start its first products. */
	LD	a1,  0 * SIZE(X)
	LD	b1,  0 * SIZE(Y)
	LD	a2,  1 * SIZE(X)
	LD	b2,  1 * SIZE(Y)

	LD	a3,  2 * SIZE(X)
	LD	b3,  2 * SIZE(Y)
	MUL	t1, S, b1

	LD	a4,  3 * SIZE(X)
	MUL	t2, C, b1
	LD	b4,  3 * SIZE(Y)
	MUL	t3, S, b2

	blez	I, .L13			/* exactly one group: go straight to the drain */
	MUL	t4, C, b2		/* (delay slot) */
	.align 3

.L12:
	/* Finish elements 0-1 of the current group while loading elements
	   0-1 of the next group (offsets 4 and 5). */
	MADD	t1, t1, C, a1
	LD	b1,  4 * SIZE(Y)
	NMSUB	t2, t2, S, a1
	LD	a1,  4 * SIZE(X)
	MADD	t3, t3, C, a2
	LD	b2,  5 * SIZE(Y)
	NMSUB	t4, t4, S, a2
	LD	a2,  5 * SIZE(X)

	/* Store elements 0-1 and start the products for elements 2-3. */
	ST	t1,  0 * SIZE(X)
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(Y)
	MUL	t2, C, b3
	ST	t3,  1 * SIZE(X)
	MUL	t3, S, b4
	ST	t4,  1 * SIZE(Y)
	MUL	t4, C, b4


	/* Finish elements 2-3 while loading elements 2-3 of the next group. */
	MADD	t1, t1, C, a3
	LD	b3,  6 * SIZE(Y)
	NMSUB	t2, t2, S, a3
	LD	a3,  6 * SIZE(X)
	MADD	t3, t3, C, a4
	LD	b4,  7 * SIZE(Y)
	NMSUB	t4, t4, S, a4
	LD	a4,  7 * SIZE(X)

	/* Store elements 2-3 and start the next group's first products. */
	ST	t1,  2 * SIZE(X)
	MUL	t1, S, b1
	ST	t2,  2 * SIZE(Y)
	MUL	t2, C, b1
	ST	t3,  3 * SIZE(X)
	MUL	t3, S, b2
	ST	t4,  3 * SIZE(Y)
	MUL	t4, C, b2

	daddiu	I, I, -1
	daddiu	X, X, 4 * SIZE

	bgtz	I, .L12
	daddiu	Y, Y, 4 * SIZE		/* (delay slot) */
	.align 3

.L13:
	/* Drain: complete the group already held in registers; no further
	   loads are issued here. */
	MADD	t1, t1, C, a1
	NMSUB	t2, t2, S, a1
	MADD	t3, t3, C, a2
	NMSUB	t4, t4, S, a2

	ST	t1,  0 * SIZE(X)
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(Y)
	MUL	t2, C, b3
	ST	t3,  1 * SIZE(X)
	MUL	t3, S, b4
	ST	t4,  1 * SIZE(Y)
	MUL	t4, C, b4

	MADD	t1, t1, C, a3
	NMSUB	t2, t2, S, a3
	MADD	t3, t3, C, a4
	daddiu	X, X, 4 * SIZE
	NMSUB	t4, t4, S, a4
	daddiu	Y, Y, 4 * SIZE

	/* X and Y have already been advanced past the group, hence the
	   negative offsets for elements 2 and 3. */
	ST	t1, -2 * SIZE(X)
	ST	t2, -2 * SIZE(Y)
	ST	t3, -1 * SIZE(X)
	ST	t4, -1 * SIZE(Y)
	.align 3

.L15:
	andi	I,  N, 3		/* I = N mod 4 leftover elements */

	blez	I, .L999
	NOP
	.align	3

.L16:
	/* Contiguous tail: one element pair per iteration. */
	LD	a1,  0 * SIZE(X)
	LD	b1,  0 * SIZE(Y)

	MUL	t1, S, b1
	MUL	t2, C, b1

	MADD	t1, t1, C, a1
	NMSUB	t2, t2, S, a1

	ST	t1,  0 * SIZE(X)
	ST	t2,  0 * SIZE(Y)

	daddiu	I, I, -1

	daddiu	X, X, SIZE
	daddiu	Y, Y, SIZE

	bgtz	I, .L16
	NOP
	j	.L999			/* skip over the strided path below */
	NOP
	.align 3

	/* ---- Strided path ---- */
.L20:
	/* Loads run ahead through X/Y; XX/YY remember where the results of
	   the elements currently in flight must be stored. */
	move	XX, X
	move	YY, Y

	blez	I, .L25			/* fewer than 4 elements: tail only */
	daddiu	I, I, -1		/* (delay slot) last group is drained at .L23 */

	/* Preload the first group of four and start its first products. */
	LD	a1,  0 * SIZE(X)
	dadd	X, X, INCX
	LD	b1,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	LD	a2,  0 * SIZE(X)
	dadd	X, X, INCX
	LD	b2,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	LD	a3,  0 * SIZE(X)
	dadd	X, X, INCX
	LD	b3,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	MUL	t1, S, b1

	LD	a4,  0 * SIZE(X)
	dadd	X, X, INCX
	MUL	t2, C, b1
	LD	b4,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	MUL	t3, S, b2
	blez	I, .L23			/* exactly one group: go straight to the drain */
	MUL	t4, C, b2		/* (delay slot) */
	.align 3

.L22:
	/* Finish elements 0-1 of the current group while loading elements
	   0-1 of the next group. */
	MADD	t1, t1, C, a1
	LD	b1,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t2, t2, S, a1
	LD	a1,  0 * SIZE(X)
	dadd	X, X, INCX
	MADD	t3, t3, C, a2
	LD	b2,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t4, t4, S, a2
	LD	a2,  0 * SIZE(X)
	dadd	X, X, INCX

	/* Store elements 0-1 and start the products for elements 2-3. */
	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t2, C, b3
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t3, S, b4
	ST	t4,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t4, C, b4

	/* Finish elements 2-3 while loading elements 2-3 of the next group. */
	MADD	t1, t1, C, a3
	LD	b3,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t2, t2, S, a3
	LD	a3,  0 * SIZE(X)
	dadd	X, X, INCX
	MADD	t3, t3, C, a4
	LD	b4,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t4, t4, S, a4
	LD	a4,  0 * SIZE(X)
	dadd	X, X, INCX

	/* Store elements 2-3 and start the next group's first products. */
	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t1, S, b1
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t2, C, b1
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t3, S, b2
	ST	t4,  0 * SIZE(YY)
	MUL	t4, C, b2
	daddiu	I, I, -1

	bgtz	I, .L22
	dadd	YY, YY, INCY		/* (delay slot) */
	.align 3

.L23:
	/* Drain: complete the group already held in registers; no further
	   loads are issued here. */
	MADD	t1, t1, C, a1
	NMSUB	t2, t2, S, a1
	MADD	t3, t3, C, a2
	NMSUB	t4, t4, S, a2

	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t2, C, b3
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t3, S, b4
	ST	t4,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t4, C, b4

	MADD	t1, t1, C, a3
	NMSUB	t2, t2, S, a3
	MADD	t3, t3, C, a4
	NMSUB	t4, t4, S, a4

	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	ST	t4,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	.align 3

.L25:
	andi	I,  N, 3		/* I = N mod 4 leftover elements */

	blez	I, .L999
	NOP
	.align	3

.L26:
	/* Strided tail: one element pair per iteration, loaded and stored
	   through X/Y directly. */
	LD	a1,  0 * SIZE(X)
	LD	b1,  0 * SIZE(Y)

	MUL	t1, S, b1
	MUL	t2, C, b1

	MADD	t1, t1, C, a1
	daddiu	I, I, -1
	NMSUB	t2, t2, S, a1

	ST	t1,  0 * SIZE(X)
	ST	t2,  0 * SIZE(Y)

	dadd	X, X, INCX
	bgtz	I, .L26
	dadd	Y, Y, INCY		/* (delay slot) */
	.align 3

.L999:
	j	$31			/* return through the address held in $31 */
	NOP

	EPILOGUE
351