/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Preprocessor setup and symbolic register names for the kernel in this
 * file.  SPARC assembly, run through the C preprocessor; the build selects
 * precision with DOUBLE and the ABI width with __64BIT__.
 */
#define ASSEMBLER
#include "common.h"

/* Inner (row) and outer (column) trip counts. */
#define M	%i0
#define N	%i1

/*
 * The 32-bit double-precision build uses a different register layout for
 * the vector arguments than the other builds; the argument-loading code at
 * the start of the routine fills in whatever is not already in a register.
 */
#if defined(DOUBLE) && !defined(__64BIT__)
#define X	%i5
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#else
#define X	%i4
#define INCX	%i5
#define Y	%i2
#define INCY	%i3
#endif

/* Matrix pointer, its column stride, and the scratch buffer for packing x. */
#define A	%l0
#define LDA	%l1
#define BUFFER	%l2

/* Loop counters: I counts within a column, J counts columns / copy groups. */
#define I	%l3
#define J	%l4

/*
 * Working pointers: A1 walks the current column of A, X1 walks x (or is the
 * write cursor while packing x), XX remembers the start of the unit-stride x.
 */
#define A1	%o0
#define X1	%o2
#define XX	%o3

/*
 * Floating-point register names.
 *   t1..t4  products x[i] * y1 waiting to be added
 *   x1..x8  elements of x
 *   a1..a8  elements of A (also used as temporaries while packing x)
 *   y1      ALPHA * y[j] for the current column
 *   ALPHA   the scalar multiplier
 * a9..a16 and y2 are defined but not referenced by the routine in this file.
 */
#ifdef DOUBLE
/* Double precision: each value occupies an even-numbered register. */
#define t1	%f0
#define	t2 	%f2
#define t3	%f4
#define	t4 	%f6

#define x1	%f8
#define x2	%f10
#define x3	%f12
#define x4	%f14
#define x5	%f16
#define x6	%f18
#define x7	%f20
#define x8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38

#define a9	%f40
#define a10	%f42
#define a11	%f44
#define a12	%f46
#define a13	%f48
#define a14	%f50
#define a15	%f52
#define a16	%f54

#define y1	%f56
#define y2	%f58

#define ALPHA	%f60

#else
/* Single precision: consecutive registers %f0..%f30. */
#define t1	%f0
#define	t2 	%f1
#define t3	%f2
#define	t4 	%f3

#define x1	%f4
#define x2	%f5
#define x3	%f6
#define x4	%f7
#define x5	%f8
#define x6	%f9
#define x7	%f10
#define x8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19

#define a9	%f20
#define a10	%f21
#define a11	%f22
#define a12	%f23
#define a13	%f24
#define a14	%f25
#define a15	%f26
#define a16	%f27

#define y1	%f28
#define y2	%f29
#define ALPHA	%f30
#endif

/* Prefetch distance ahead of A1 in the main loop, in elements. */
#define PREFETCHSIZE 60

/*
 * Rank-1 update kernel.
 *
 * For every column j in [0, N) and every row i in [0, M):
 *
 *     A[i + j * LDA] += x[i] * (ALPHA * y[j])
 *
 * Inputs (see the register #defines above): M, N, ALPHA, X, INCX, Y, INCY,
 * A, LDA, BUFFER.  INCX, INCY and LDA are shifted left by BASE_SHIFT on
 * entry and used as byte offsets from then on (BASE_SHIFT and SIZE come
 * from common.h; presumably BASE_SHIFT == log2(SIZE)).
 *
 * If the scaled INCX differs from SIZE, x is first copied into BUFFER so
 * the compute loops can always read x with unit stride; BUFFER must
 * therefore hold at least M elements in that case.
 *
 * Returns 0 in %o0.  Nothing is computed when M <= 0 or N <= 0.
 *
 * Note on reading the code: SPARC branches have a delay slot, so the
 * instruction following each branch executes whether or not the branch
 * is taken (none of the branches here use the annul bit).
 */
	PROLOGUE
	SAVESP
	nop

	/* ---- Fetch the arguments that are not already in registers ---- */
#ifndef __64BIT__

#ifdef DOUBLE
	/* Spill %i3:%i4 so the 8-byte value can be reloaded into ALPHA below. */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], INCX
	ld	[%sp + STACK_START + 32], Y
	ld	[%sp + STACK_START + 36], INCY
	ld	[%sp + STACK_START + 40], A
	ld	[%sp + STACK_START + 44], LDA
	ld	[%sp + STACK_START + 48], BUFFER
#else
	/* Spill %i3 so the 4-byte value can be reloaded into ALPHA below. */
	st	%i3, [%sp + STACK_START + 16]

	ld	[%sp + STACK_START + 28], Y
	ld	[%sp + STACK_START + 32], INCY
	ld	[%sp + STACK_START + 36], A
	ld	[%sp + STACK_START + 40], LDA
	ld	[%sp + STACK_START + 44], BUFFER
#endif
	/* Integer -> FP register transfer goes through the stack slot. */
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	ldx	[%sp + STACK_START + 56], Y
	ldx	[%sp + STACK_START + 64], INCY
	ldx	[%sp + STACK_START + 72], A
	ldx	[%sp + STACK_START + 80], LDA
	ldx	[%sp + STACK_START + 88], BUFFER
	/* 64-bit build: the scalar is already in an FP register. */
#ifdef DOUBLE
	FMOV	%f6, ALPHA
#else
	FMOV	%f7, ALPHA
#endif
#endif

	/* ---- Scale strides to bytes and reject empty problems ---- */
	sll	LDA, BASE_SHIFT, LDA

	cmp	M, 0
	ble	%icc, .LL999
	sll	INCX, BASE_SHIFT, INCX	/* delay slot */
	cmp	N, 0
	ble	%icc, .LL999
	sll	INCY, BASE_SHIFT, INCY	/* delay slot */

	/* Unit-stride x: use it in place (XX = X) and skip the packing. */
	cmp	INCX, SIZE
	be	%icc, .LL10
	mov	X, XX			/* delay slot; overwritten below if not taken */

	/* ---- Pack strided x into BUFFER ---- */
	mov	BUFFER, XX
	mov	BUFFER, X1

	sra	M, 3, J			/* J = number of full groups of 8 */
	cmp	J, 0
	ble,pn	%icc, .LL05
	nop

	/* Copy 8 elements per iteration; a1..a8 serve as temporaries. */
.LL01:
	LDF	[X], a1
	add	X,  INCX, X
	LDF	[X], a2
	add	X,  INCX, X
	LDF	[X], a3
	add	X,  INCX, X
	LDF	[X], a4
	add	X,  INCX, X
	LDF	[X], a5
	add	X,  INCX, X
	LDF	[X], a6
	add	X,  INCX, X
	LDF	[X], a7
	add	X,  INCX, X
	LDF	[X], a8
	add	X,  INCX, X

	STF	a1, [X1 +  0 * SIZE]
	STF	a2, [X1 +  1 * SIZE]
	STF	a3, [X1 +  2 * SIZE]
	STF	a4, [X1 +  3 * SIZE]
	STF	a5, [X1 +  4 * SIZE]
	STF	a6, [X1 +  5 * SIZE]
	STF	a7, [X1 +  6 * SIZE]
	STF	a8, [X1 +  7 * SIZE]

	add	X1, 8 * SIZE, X1

	deccc	J
	bg,pn	%icc, .LL01
	nop

	/* Copy the remaining M mod 8 elements one at a time. */
.LL05:
	andcc	M, 7, J
	ble,pn	%icc, .LL10
	nop

.LL06:
	LDF	[X], a1
	add	X,  INCX, X

	STF	a1, [X1 +  0 * SIZE]
	add	X1, 1 * SIZE, X1

	deccc	J
	bg,pn	%icc, .LL06
	nop

	/* ---- Column loop: J counts down from N ---- */
.LL10:
	mov	N, J
	cmp	N, 0
	ble,pn	%icc, .LL999
	nop

.LL11:
	mov	XX, X1			/* restart x for this column */

	mov	A,  A1			/* A1 = current column of A */
	add	A, LDA, A		/* A  = next column */

	LDF	[Y], y1
	add	Y, INCY, Y

	FMUL	ALPHA, y1, y1		/* y1 = ALPHA * y[j], reused for the whole column */

	sra	M, 3, I			/* I = number of full groups of 8 rows */
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/*
	 * Pipeline prologue: load the first 8 elements of x and A, form the
	 * first four products and start the adds for elements 0 and 1.
	 */
	LDF	[X1 + 0 * SIZE], x1
	LDF	[A1 + 0 * SIZE], a1
	LDF	[X1 + 1 * SIZE], x2
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 2 * SIZE], x3
	LDF	[A1 + 2 * SIZE], a3
	LDF	[X1 + 3 * SIZE], x4
	LDF	[A1 + 3 * SIZE], a4

	LDF	[X1 + 4 * SIZE], x5
	LDF	[A1 + 4 * SIZE], a5
	LDF	[X1 + 5 * SIZE], x6
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 6 * SIZE], x7
	LDF	[A1 + 6 * SIZE], a7
	LDF	[X1 + 7 * SIZE], x8
	LDF	[A1 + 7 * SIZE], a8

	FMUL	x1,  y1, t1
	FMUL	x2,  y1, t2
	FMUL	x3,  y1, t3
	FMUL	x4,  y1, t4

	FADD	a1,  t1, a1
	FMUL	x5,  y1, t1
	FADD	a2,  t2, a2
	FMUL	x6,  y1, t2

	/* Only one group of 8: go straight to the drain code. */
	deccc	I
	ble,pn	%icc, .LL13
	nop

	/*
	 * Software-pipelined steady state.  Each pass finishes and stores the
	 * current 8 elements of A (offsets 0..7) while loading the next 8
	 * elements of x and A (offsets 8..15) and starting their products.
	 * The t1..t4 / a1..a8 registers are recycled as soon as their old
	 * value has been consumed or stored, so the instruction order matters.
	 */
.LL12:
	prefetch  [A1 +  PREFETCHSIZE * SIZE], 0

	FADD	a3,  t3, a3
	LDF	[X1 +  8 * SIZE], x1
	FMUL	x7,  y1, t3
	LDF	[X1 +  9 * SIZE], x2
	FADD	a4,  t4, a4
	LDF	[X1 + 10 * SIZE], x3
	FMUL	x8,  y1, t4
	LDF	[X1 + 11 * SIZE], x4

	FADD	a5,  t1, a5
	STF	a1,  [A1 + 0 * SIZE]
	LDF	[A1 +  8 * SIZE], a1
	FMUL	x1,  y1, t1
	STF	a2,  [A1 + 1 * SIZE]
	LDF	[A1 +  9 * SIZE], a2

	FADD	a6,  t2, a6
	STF	a3,  [A1 + 2 * SIZE]
	LDF	[A1 + 10 * SIZE], a3
	FMUL	x2,  y1, t2
	STF	a4,  [A1 + 3 * SIZE]
	LDF	[A1 + 11 * SIZE], a4

	FADD	a7,  t3, a7
	LDF	[X1 + 12 * SIZE], x5
	FMUL	x3,  y1, t3
	LDF	[X1 + 13 * SIZE], x6
	FADD	a8,  t4, a8
	LDF	[X1 + 14 * SIZE], x7
	FMUL	x4,  y1, t4
	LDF	[X1 + 15 * SIZE], x8

	FADD	a1,  t1, a1
	STF	a5,  [A1 + 4 * SIZE]
	deccc	I			/* sets the condition codes tested by bg below */
	LDF	[A1 + 12 * SIZE], a5
	FMUL	x5,  y1, t1
	STF	a6,  [A1 + 5 * SIZE]
	LDF	[A1 + 13 * SIZE], a6
	FADD	a2,  t2, a2
	STF	a7,  [A1 + 6 * SIZE]
	LDF	[A1 + 14 * SIZE], a7
	FMUL	x6,  y1, t2
	STF	a8,  [A1 + 7 * SIZE]
	LDF	[A1 + 15 * SIZE], a8
	add	A1, 8 * SIZE, A1

	bg,pn	%icc, .LL12
	add	X1, 8 * SIZE, X1	/* delay slot */

	/* Pipeline drain: finish the last group of 8 without loading more. */
.LL13:
	FADD	a3,  t3, a3
	FMUL	x7,  y1, t3
	FADD	a4,  t4, a4
	FMUL	x8,  y1, t4

	FADD	a5,  t1, a5
	FADD	a6,  t2, a6
	FADD	a7,  t3, a7
	FADD	a8,  t4, a8

	STF	a1,  [A1 + 0 * SIZE]
	STF	a2,  [A1 + 1 * SIZE]
	STF	a3,  [A1 + 2 * SIZE]
	STF	a4,  [A1 + 3 * SIZE]

	STF	a5,  [A1 + 4 * SIZE]
	STF	a6,  [A1 + 5 * SIZE]
	STF	a7,  [A1 + 6 * SIZE]
	STF	a8,  [A1 + 7 * SIZE]

	add	A1, 8 * SIZE, A1
	add	X1, 8 * SIZE, X1

	/* Remainder: 4 rows if bit 2 of M is set. */
.LL15:
	andcc	M, 4, I
	ble,pn	%icc, .LL16
	nop

	LDF	[X1 + 0 * SIZE], x1
	LDF	[A1 + 0 * SIZE], a1
	LDF	[X1 + 1 * SIZE], x2
	LDF	[A1 + 1 * SIZE], a2

	LDF	[X1 + 2 * SIZE], x3
	LDF	[A1 + 2 * SIZE], a3
	LDF	[X1 + 3 * SIZE], x4
	LDF	[A1 + 3 * SIZE], a4

	FMUL	x1,  y1, t1
	FMUL	x2,  y1, t2
	FMUL	x3,  y1, t3
	FMUL	x4,  y1, t4

	FADD	a1,  t1, a1
	FADD	a2,  t2, a2
	FADD	a3,  t3, a3
	FADD	a4,  t4, a4

	STF	a1,  [A1 + 0 * SIZE]
	STF	a2,  [A1 + 1 * SIZE]
	STF	a3,  [A1 + 2 * SIZE]
	add	X1, 4 * SIZE, X1
	STF	a4,  [A1 + 3 * SIZE]
	add	A1, 4 * SIZE, A1

	/* Remainder: 2 rows if bit 1 of M is set. */
.LL16:
	andcc	M, 2, I
	ble,pn	%icc, .LL17
	nop

	LDF	[X1 + 0 * SIZE], x1
	LDF	[X1 + 1 * SIZE], x2
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2

	FMUL	x1,  y1, t1
	FMUL	x2,  y1, t2

	FADD	a1,  t1, a1
	FADD	a2,  t2, a2

	STF	a1, [A1 + 0 * SIZE]
	add	X1, 2 * SIZE, X1
	STF	a2, [A1 + 1 * SIZE]
	add	A1, 2 * SIZE, A1

	/* Remainder: 1 row if bit 0 of M is set. */
.LL17:
	andcc	M, 1, I
	ble,pn	%icc, .LL19
	nop

	LDF	[X1 + 0 * SIZE], x1
	add	X1, 1 * SIZE, X1

	LDF	[A1 + 0 * SIZE], a1

	FMUL	x1,  y1, t1
	FADD	a1,  t1, a1

	STF	a1, [A1 + 0 * SIZE]
	add	A1, 1 * SIZE, A1

	/* Next column. */
.LL19:
	deccc	J
	bg	%icc, .LL11
	nop

	/* Common exit: return 0. */
.LL999:
	return	%i7 + 8
	clr	%o0			/* delay slot: return value = 0 */

	EPILOGUE
