1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Integer register roles used by this kernel.  These are %i registers,
 * i.e. the incoming-argument registers as seen after a register-window
 * save (presumably performed by SAVESP -- confirm in common.h).
 */
#define N	%i0	/* element count; shifted/masked to derive loop trip counts */
#define X	%i1	/* source pointer, advanced as elements are loaded */
#define INCX	%i2	/* source stride; scaled by ZBASE_SHIFT on entry */
#define Y	%i3	/* destination pointer, advanced as elements are stored */
#define INCY	%i4	/* destination stride; scaled by ZBASE_SHIFT on entry */
#define I	%i5	/* loop counter (overwritten by the kernel) */

/*
 * Eight floating-point temporaries: four two-component elements per
 * unrolled iteration.  The single-precision build uses consecutive
 * registers; the DOUBLE build uses even-numbered registers only.
 */
#ifndef DOUBLE
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7
#else
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14
#endif
51
/*
 * Vector copy kernel for two-component (complex) elements:
 *     Y[k] := X[k]   for k = 0 .. N-1
 *
 * In:    N    = element count
 *        X    = source pointer,       INCX = source stride (in elements)
 *        Y    = destination pointer,  INCY = destination stride (in elements)
 * Out:   %o0 (caller's window) = 0 on every exit path
 * Clobb: I, a1-a8, integer condition codes; X, Y, INCX, INCY are modified
 *
 * Each element is two SIZE-sized values at offsets 0*SIZE and 1*SIZE.
 * Two code paths:
 *   - both strides equal one element (2 * SIZE bytes): contiguous copy,
 *     unrolled four elements per iteration, with prefetch;
 *   - otherwise: generic strided copy, also unrolled by four.
 * Each path finishes with a one-element-at-a-time loop for the N & 3
 * leftover elements.
 *
 * PROLOGUE / SAVESP / EPILOGUE / LDF / STF / SIZE / ZBASE_SHIFT come from
 * common.h.  NOTE(review): SAVESP is assumed to open a register window
 * (the code uses %i registers and leaves via `return`) -- confirm.
 */
	PROLOGUE
	SAVESP

	/*
	 * Nothing to copy for N <= 0.  Without this guard a negative N whose
	 * low two bits are non-zero skips the unrolled loop (N >> 2 < 0) but
	 * still runs the tail loop N & 3 times.
	 */
	cmp	N, 0
	ble,pn	%icc, .LL19
	nop

	/* Convert strides from elements to bytes. */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/* Take the contiguous path only when both strides are one element. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop
	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

	/* I = N / 4: number of unrolled iterations. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

/* Prefetch distance, in SIZE-sized values ahead of the current pointer. */
#define PREFETCHSIZE 32

	/* Contiguous main loop: 4 elements (8 values) per iteration. */
.LL11:
	/* NOTE(review): function code 0 is a read prefetch; Y is only written here. */
	prefetch [X  + PREFETCHSIZE * SIZE], 0
	prefetch [Y  + PREFETCHSIZE * SIZE], 0

	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	/* Loop bookkeeping is interleaved with the stores; the condition
	   codes set by cmp stay live until the branch (add does not set them). */
	STF	a1, [Y +  0 * SIZE]
	add	I, -1, I
	STF	a2, [Y +  1 * SIZE]
	cmp	I, 0
	STF	a3, [Y +  2 * SIZE]
	add	X, 8 * SIZE, X
	STF	a4, [Y +  3 * SIZE]
	STF	a5, [Y +  4 * SIZE]
	STF	a6, [Y +  5 * SIZE]
	STF	a7, [Y +  6 * SIZE]
	STF	a8, [Y +  7 * SIZE]

	bg,pt	%icc, .LL11
	add	Y, 8 * SIZE, Y		/* delay slot: advance destination */

	/* Contiguous tail: I = N % 4 leftover elements. */
.LL15:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	I, -1, I
	cmp	I, 0
	STF	a1, [Y +  0 * SIZE]
	add	X, 2 * SIZE, X
	STF	a2, [Y +  1 * SIZE]
	bg,pt	%icc, .LL16
	add	Y, 2 * SIZE, Y		/* delay slot: advance destination */

	/*
	 * Exit for the contiguous path and for N <= 0.  The delay slot of
	 * `return` runs in the caller's register window, so clearing %o0
	 * sets the return value to 0, matching the strided exit at .LL59.
	 * (The previous `clr %g0` was a no-op: writes to %g0 are discarded.)
	 */
.LL19:
	return	%i7 + 8
	clr	%o0

	/* Generic strided path. */
.LL50:
	/* I = N / 4: number of unrolled iterations. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Strided main loop: load 4 elements, then store 4 elements. */
.LL51:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a7
	LDF	[X +  1 * SIZE], a8
	add	X, INCX, X

	STF	a1, [Y +  0 * SIZE]
	add	I, -1, I
	STF	a2, [Y +  1 * SIZE]
	add	Y, INCY, Y
	cmp	I, 0			/* condition codes stay live until bg below */
	STF	a3, [Y +  0 * SIZE]
	STF	a4, [Y +  1 * SIZE]
	add	Y, INCY, Y
	STF	a5, [Y +  0 * SIZE]
	STF	a6, [Y +  1 * SIZE]
	add	Y, INCY, Y
	STF	a7, [Y +  0 * SIZE]
	STF	a8, [Y +  1 * SIZE]

	bg,pt	%icc, .LL51
	add	Y, INCY, Y		/* delay slot: advance destination */

	/* Strided tail: I = N % 4 leftover elements. */
.LL55:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	I, -1, I
	cmp	I, 0
	add	X, INCX, X
	STF	a1, [Y +  0 * SIZE]
	STF	a2, [Y +  1 * SIZE]
	bg,pt	%icc, .LL56
	add	Y, INCY, Y		/* delay slot: advance destination */

	/* Exit for the strided path: return 0 (delay slot runs in the
	   caller's window, where %o0 is the return-value register). */
.LL59:
	return	%i7 + 8
	clr	%o0

	EPILOGUE
180