1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Integer register roles.  N always lives in %i0; the other assignments
   differ between 32-bit DOUBLE builds and every other build. */
#define N	%i0

#if !defined(DOUBLE) || defined(__64BIT__)
#define X	%i4
#define INCX	%i5
#define Y	%i1
#define INCY	%i2
#define I	%i3
#else
#define X	%i5
#define INCX	%i1
#define Y	%i2
#define INCY	%i3
#define I	%i4
#endif

/* Second pointer into the Y vector, used as the store cursor by the
   strided loop. */
#define YY	%l1

/* Floating-point working set:
     a1-a8  elements loaded from X
     b1-b8  elements loaded from Y
     t1-t4  ALPHA * a products
     c1-c8  results written back to Y
   Single precision uses consecutive registers, double precision uses the
   even-numbered ones. */
#ifndef DOUBLE
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7

#define b1	%f8
#define b2	%f9
#define b3	%f10
#define b4	%f11
#define b5	%f12
#define b6	%f13
#define b7	%f14
#define b8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define t4	%f19

#define c1	%f20
#define c2	%f21
#define c3	%f22
#define c4	%f23
#define c5	%f24
#define c6	%f25
#define c7	%f26
#define c8	%f27

#define ALPHA	%f31
#else
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14

#define b1	%f16
#define b2	%f18
#define b3	%f20
#define b4	%f22
#define b5	%f24
#define b6	%f26
#define b7	%f28
#define b8	%f30

#define t1	%f32
#define t2	%f34
#define t3	%f36
#define t4	%f38

#define c1	%f40
#define c2	%f42
#define c3	%f44
#define c4	%f46
#define c5	%f48
#define c6	%f50
#define c7	%f52
#define c8	%f54

#define ALPHA	%f62
#endif
127
/*
 * AXPY kernel: for each of the N elements,  y = y + ALPHA * x.
 *
 * Inputs as this code picks them up (register names are the macros
 * defined at the top of the file):
 *   N      element count, in %i0.
 *   ALPHA  32-bit builds: spilled from %i3 (single) or %i3/%i4 (double)
 *          to the stack and reloaded as a floating-point value.
 *          64-bit builds: copied from %f7 (single) or %f6 (double).
 *   X      source vector pointer, already in its register.
 *   INCX   X increment in elements; already in its register except in
 *          32-bit DOUBLE builds, where it is loaded from the stack.
 *   Y      destination vector pointer, loaded from the stack.
 *   INCY   Y increment in elements, loaded from the stack.
 *
 * Returns 0 in the caller's %o0 on every exit path.
 *
 * Two code paths:
 *   - both byte strides equal SIZE: unrolled-by-8, software-pipelined
 *     loop (.LL11/.LL12) plus a one-element tail loop (.LL16);
 *   - anything else: unrolled-by-8 strided loop (.LL51/.LL52) plus a
 *     one-element tail loop (.LL56).
 *
 * PROLOGUE, SAVESP, EPILOGUE, LDF, STF, FMUL, FADD, FMOV, SIZE,
 * BASE_SHIFT and STACK_START come from common.h.
 */
	PROLOGUE
	SAVESP

#ifndef __64BIT__

#ifdef DOUBLE
	/* Spill both halves of alpha before the loads below: %i3 is reused
	   as INCY in this configuration. */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], INCX
	ld	[%sp + STACK_START + 32], Y
	ld	[%sp + STACK_START + 36], INCY
#else
	/* Spill alpha from %i3 so it can be reloaded into an FP register. */
	st	%i3, [%sp + STACK_START + 16]
	ld	[%sp + STACK_START + 28], Y
	ld	[%sp + STACK_START + 32], INCY
#endif
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	ldx	[%sp + STACK_START + 56], Y
	ldx	[%sp + STACK_START + 64], INCY
#ifdef DOUBLE
	FMOV	%f6, ALPHA
#else
	FMOV	%f7, ALPHA
#endif
#endif

	/* Scale the element increments by BASE_SHIFT; the results are used
	   below as address increments and compared against SIZE. */
	sll	INCX, BASE_SHIFT, INCX
	sll	INCY, BASE_SHIFT, INCY

	/* Take the strided path unless both strides equal SIZE. */
	cmp	INCX, SIZE
	bne	.LL50
	nop
	cmp	INCY, SIZE
	bne	.LL50
	nop

	/* ---- Unit-stride path ---- */
	/* I = number of full groups of 8 elements. */
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group of 8 elements from X and Y. */
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  1 * SIZE], b2
	LDF	[X +  2 * SIZE], a3
	LDF	[Y +  2 * SIZE], b3
	LDF	[X +  3 * SIZE], a4
	LDF	[Y +  3 * SIZE], b4
	LDF	[X +  4 * SIZE], a5
	LDF	[Y +  4 * SIZE], b5
	LDF	[X +  5 * SIZE], a6
	LDF	[Y +  5 * SIZE], b6
	LDF	[X +  6 * SIZE], a7
	LDF	[Y +  6 * SIZE], b7
	LDF	[X +  7 * SIZE], a8
	LDF	[Y +  7 * SIZE], b8

	/* Start the pipeline: products for elements 0-5, sums for 0-1. */
	FMUL	ALPHA, a1, t1
	FMUL	ALPHA, a2, t2
	FMUL	ALPHA, a3, t3
	FMUL	ALPHA, a4, t4

	FADD	b1, t1, c1
	FMUL	ALPHA, a5, t1
	FADD	b2, t2, c2
	FMUL	ALPHA, a6, t2

	/* The last group is drained at .LL12, so the loop body runs one
	   time fewer than the group count. */
	add	I, -1, I
	cmp	I, 0
	ble,pt	%icc, .LL12
	nop

#ifdef DOUBLE
#define PREFETCHSIZE  54
#else
#define PREFETCHSIZE 108
#endif

	/* Each iteration finishes and stores the current group (offsets
	   0-7) while loading the next group (offsets 8-15) and starting
	   its multiplies. */
.LL11:
	prefetch [Y  + PREFETCHSIZE * SIZE], 0

	LDF	[X +  8 * SIZE], a1
	LDF	[X +  9 * SIZE], a2
	LDF	[X + 10 * SIZE], a3
	LDF	[X + 11 * SIZE], a4

	FADD	b3, t3, c3
	STF	c1, [Y +  0 * SIZE]
	FMUL	ALPHA, a7, t3

	FADD	b4, t4, c4
	STF	c2, [Y +  1 * SIZE]
	FMUL	ALPHA, a8, t4

	LDF	[Y +  8 * SIZE], b1
	LDF	[Y +  9 * SIZE], b2
	LDF	[Y + 10 * SIZE], b3
	LDF	[Y + 11 * SIZE], b4

	FADD	b5, t1, c5
	STF	c3, [Y +  2 * SIZE]
	FMUL	ALPHA, a1, t1

	FADD	b6, t2, c6
	STF	c4, [Y +  3 * SIZE]
	FMUL	ALPHA, a2, t2

	prefetch [X  + PREFETCHSIZE * SIZE], 0

	LDF	[X + 12 * SIZE], a5
	LDF	[X + 13 * SIZE], a6
	LDF	[X + 14 * SIZE], a7
	LDF	[X + 15 * SIZE], a8

	FADD	b7, t3, c7
	STF	c5, [Y +  4 * SIZE]
	FMUL	ALPHA, a3, t3

	FADD	b8, t4, c8
	STF	c6, [Y +  5 * SIZE]
	FMUL	ALPHA, a4, t4

	LDF	[Y + 12 * SIZE], b5
	LDF	[Y + 13 * SIZE], b6
	LDF	[Y + 14 * SIZE], b7
	LDF	[Y + 15 * SIZE], b8

	FADD	b1, t1, c1
	STF	c7, [Y +  6 * SIZE]
	FMUL	ALPHA, a5, t1
	/* Sets the condition codes tested by the bg at the loop end. */
	deccc	I

	FADD	b2, t2, c2
	STF	c8, [Y +  7 * SIZE]
	FMUL	ALPHA, a6, t2
	add	Y, 8 * SIZE, Y

	bg,pt	%icc, .LL11
	add	X, 8 * SIZE, X

	/* Drain the pipeline: finish and store the last group of 8. */
.LL12:
	FADD	b3, t3, c3
	FMUL	ALPHA, a7, t3
	FADD	b4, t4, c4
	FMUL	ALPHA, a8, t4

	FADD	b5, t1, c5
	FADD	b6, t2, c6
	FADD	b7, t3, c7
	FADD	b8, t4, c8

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]
	STF	c3, [Y +  2 * SIZE]
	STF	c4, [Y +  3 * SIZE]

	STF	c5, [Y +  4 * SIZE]
	STF	c6, [Y +  5 * SIZE]
	STF	c7, [Y +  6 * SIZE]
	STF	c8, [Y +  7 * SIZE]

	add	Y, 8 * SIZE, Y
	add	X, 8 * SIZE, X


	/* Tail: the remaining N & 7 elements, one per iteration. */
.LL15:
	and	N, 7, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1

	FMUL	ALPHA, a1, t1
	FADD	b1, t1, c1

	add	I, -1, I
	cmp	I, 0
	STF	c1, [Y +  0 * SIZE]
	add	Y, 1 * SIZE, Y
	bg,pt	%icc, .LL16
	add	X, 1 * SIZE, X

	/* Return 0.  The delay slot of `return` runs after the register
	   window has been restored, so %o0 here is the caller's result
	   register.  Clearing %g0 instead would be a no-op and leave N
	   there, unlike the strided exit at .LL59. */
.LL19:
	return	%i7 + 8
	clr	%o0

	/* ---- Strided path ---- */
.LL50:
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	/* Delay slot: YY is the store cursor; it trails Y, which runs
	   ahead as the load cursor. */
	mov	Y, YY

	/* Preload the first group of 8 elements.  The cmp below sets the
	   condition codes for the ble at the end of this sequence; the
	   add and LDF instructions in between leave them untouched. */
	LDF	[X +  0 * SIZE], a1
	add	I, -1, I
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b1
	cmp	I, 0
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b2
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b3
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b4
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b5
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b6
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a7
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b7
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a8
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b8
	ble,pt	%icc, .LL52
	add	Y, INCY, Y


	/* Each iteration computes and stores the current group through YY
	   while reloading a1-a8 / b1-b8 with the next group through X / Y. */
.LL51:
	FMUL	ALPHA, a1, t1
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X

	FMUL	ALPHA, a2, t2
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X

	FMUL	ALPHA, a3, t3
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	FMUL	ALPHA, a4, t4
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X

	FADD	b1, t1, c1
	LDF	[Y +  0 * SIZE], b1
	add	Y, INCY, Y

	FMUL	ALPHA, a5, t1
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	FADD	b2, t2, c2
	LDF	[Y +  0 * SIZE], b2
	add	Y, INCY, Y

	FMUL	ALPHA, a6, t2
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	FADD	b3, t3, c3
	LDF	[Y +  0 * SIZE], b3
	add	Y, INCY, Y

	FMUL	ALPHA, a7, t3
	LDF	[X +  0 * SIZE], a7
	add	X, INCX, X
	FADD	b4, t4, c4
	LDF	[Y +  0 * SIZE], b4
	add	Y, INCY, Y
	FMUL	ALPHA, a8, t4
	LDF	[X +  0 * SIZE], a8
	add	X, INCX, X

	/* c1-c4 are stored and then reused for elements 5-8 of the group. */
	STF	c1, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b5, t1, c1
	STF	c2, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b6, t2, c2
	STF	c3, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b7, t3, c3
	STF	c4, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b8, t4, c4

	/* The cmp here sets the condition codes for the bg at the loop end. */
	LDF	[Y +  0 * SIZE], b5
	add	I, -1, I
	add	Y, INCY, Y
	LDF	[Y +  0 * SIZE], b6
	cmp	I, 0
	add	Y, INCY, Y
	LDF	[Y +  0 * SIZE], b7
	add	Y, INCY, Y
	LDF	[Y +  0 * SIZE], b8
	add	Y, INCY, Y

	STF	c1, [YY +  0 * SIZE]
	add	YY, INCY, YY
	STF	c2, [YY +  0 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	add	YY, INCY, YY
	STF	c4, [YY +  0 * SIZE]

	bg,pt	%icc, .LL51
	add	YY, INCY, YY

	/* Compute and store the last preloaded group; no further loads. */
.LL52:
	FMUL	ALPHA, a1, t1
	FMUL	ALPHA, a2, t2
	FMUL	ALPHA, a3, t3
	FMUL	ALPHA, a4, t4

	FADD	b1, t1, c1
	FMUL	ALPHA, a5, t1
	FADD	b2, t2, c2
	FMUL	ALPHA, a6, t2
	FADD	b3, t3, c3
	FMUL	ALPHA, a7, t3
	FADD	b4, t4, c4
	FMUL	ALPHA, a8, t4

	STF	c1, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b5, t1, c1
	STF	c2, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b6, t2, c2
	STF	c3, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b7, t3, c3
	STF	c4, [YY +  0 * SIZE]
	add	YY, INCY, YY
	FADD	b8, t4, c4

	STF	c1, [YY +  0 * SIZE]
	add	YY, INCY, YY
	STF	c2, [YY +  0 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	add	YY, INCY, YY
	STF	c4, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Tail: the remaining N & 7 elements, one per iteration, loading
	   and storing through X and Y directly. */
.LL55:
	and	N, 7, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1

	FMUL	ALPHA, a1, t1
	FADD	b1, t1, c1

	add	I, -1, I
	cmp	I, 0
	STF	c1, [Y +  0 * SIZE]
	add	Y, INCY, Y
	bg,pt	%icc, .LL56
	add	X, INCX, X

	/* Return 0; the delay-slot clr writes the caller's %o0. */
.LL59:
	return	%i7 + 8
	clr	%o0

	EPILOGUE
504