1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/*
 * Index search over a strided vector (SPARC V9, preprocessed assembly).
 *
 * Scans N elements starting at X with stride INCX and returns the 1-based
 * position of the element whose absolute value is largest (or smallest
 * when USE_MIN is defined).  When several elements share the extreme
 * value, the lowest position is returned.
 *
 * In (after the register-window save):
 *     %i0 = N      element count
 *     %i1 = X      address of the first element
 *     %i2 = INCX   stride in elements (shifted left by BASE_SHIFT below)
 * Out:
 *     %i0 = result, 0 when N <= 0 or INCX <= 0
 *
 * PROLOGUE, SAVESP, EPILOGUE, FCLR, LDF, FABS, FCMP, FMOVG, FMOVL, SIZE and
 * BASE_SHIFT come from common.h and are not visible here; the comments
 * below assume they are the load / absolute-value / compare /
 * conditional-move primitives for the selected precision -- TODO confirm.
 *
 * Method: four independent running extrema (c1..c4) with their indices
 * (v1..v4) are maintained so that consecutive compares do not depend on
 * each other; they are merged into c1/v1 at .LL19.
 */

#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3		/* loop trip counter */

#define v1	%o0		/* index belonging to c1 (final result) */
#define v2	%o1		/* index belonging to c2 */
#define v3	%o2		/* index belonging to c3 */
#define v4	%o3		/* index belonging to c4 */
#define count	%o4		/* 1-based index of the element under test */
#define vtie	%o5		/* scratch: lower of two indices during merge */

#ifdef DOUBLE
/* Even-numbered registers only: one value per register pair. */
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15
#endif

/* Replace the running value when the candidate compares greater
   (default) or less (USE_MIN). */
#ifndef USE_MIN
#define FCMOV	FMOVG
#define CMOV	movg
#else
#define FCMOV	FMOVL
#define CMOV	movl
#endif

	PROLOGUE
	SAVESP

	FCLR(0)

	/* N <= 0: return 0. */
	cmp	N, 0
	ble	.LL20
	clr	v1			/* delay slot, always executed */

	/* INCX <= 0: return 0 (v1 is still 0). */
	cmp	INCX, 0
	ble	.LL20
	sll	INCX, BASE_SHIFT, INCX	/* delay slot: scale the stride */

	mov	1, v1

	/* Consume the first element; N becomes the number remaining. */
	add	N, -1, N
	LDF	[X], c4
	add	X, INCX, X
	cmp	N, 0
	ble	.LL20			/* single element: result is 1 */
	FABS	c4, c1			/* delay slot */

	/* Seed every lane with |x[0]| at index 1.  c4 still holds the raw
	   value, so it must be overwritten last. */
	FABS	c4, c2
	mov	1, v2
	FABS	c4, c3
	mov	1, v3
	FABS	c4, c4
	mov	1, v4
	mov	2, count

	/* Use the strided code unless the scaled stride equals SIZE. */
	cmp	INCX, SIZE
	bne	.LL50
	nop

	/* ---- contiguous case: groups of eight elements ---- */
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4

	LDF	[X +  4 * SIZE], a5
	add	I, -1, I
	LDF	[X +  5 * SIZE], a6
	cmp	I, 0
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	ble,pt	%icc, .LL12		/* only one group: drain it */
	add	X, 8 * SIZE, X

#define PREFETCHSIZE 40

	/* Steady state: test the loaded group while loading the next one.
	   All four lanes record the same `count` (the index of the first
	   element of the quad); the per-lane offset is added at .LL19. */
.LL11:
	FABS	a1, t1
	prefetch [X + PREFETCHSIZE * SIZE], 0
	FABS	a2, t2
	LDF	[X +  0 * SIZE], a1
	FABS	a3, t3
	LDF	[X +  1 * SIZE], a2
	FABS	a4, t4
	LDF	[X +  2 * SIZE], a3

	FCMP	%fcc0, t1, c1
	LDF	[X +  3 * SIZE], a4
	FCMP	%fcc1, t2, c2
	nop

	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	LDF	[X +  4 * SIZE], a5
	FABS	a6, t2
	LDF	[X +  5 * SIZE], a6
	FABS	a7, t3
	LDF	[X +  6 * SIZE], a7
	FABS	a8, t4
	LDF	[X +  7 * SIZE], a8

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	nop
	CMOV	%fcc0, count, v1
	add	I, -1, I

	FCMOV	%fcc1, t2, c2
	cmp	I, 0			/* sets %icc for the bg below */
	CMOV	%fcc1, count, v2
	add	X, 8 * SIZE, X

	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	bg,pt	%icc, .LL11
	add	count, 4, count		/* delay slot */

	/* Drain: test the last loaded group without loading further. */
.LL12:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	FABS	a6, t2
	FABS	a7, t3
	FABS	a8, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* Remaining N mod 8 elements, one at a time, in lane 1 only.
	   Here `count` is the exact index, so v1 needs no correction. */
.LL15:
	and	N, 7, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	FABS	a1, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	add	count, 1, count
	cmp	I, 0
	bg,pt	%icc, .LL16
	add	X, 1 * SIZE, X		/* delay slot */

	/* Merge the four lanes into c1/v1 (shared by both stride paths).
	   Lanes 2-4 stored the index of the first element of their quad,
	   so add the lane offset first.  A lane that never replaced its
	   seed then carries an index larger than 1, which is harmless: its
	   value is still |x[0]|, so it can only tie with another lane that
	   also kept the seed, and lane 1 then still holds index 1.
	   When two lanes hold equal values the lower index is kept, so the
	   result is the first position of the extreme value. */
.LL19:
	add	v2, 1, v2
	add	v3, 2, v3
	add	v4, 3, v4

	/* lane 2 -> lane 1 */
	FCMP	%fcc0, c2, c1
	cmp	v2, v1
	mov	v1, vtie
	movl	%icc, v2, vtie		/* vtie = min(v1, v2) */
	CMOV	%fcc0, v2, v1		/* c2 strictly better: take v2 */
	move	%fcc0, vtie, v1		/* c2 == c1: take the lower index */
	FCMOV	%fcc0, c2, c1

	/* lane 4 -> lane 3 */
	FCMP	%fcc1, c4, c3
	cmp	v4, v3
	mov	v3, vtie
	movl	%icc, v4, vtie		/* vtie = min(v3, v4) */
	CMOV	%fcc1, v4, v3
	move	%fcc1, vtie, v3
	FCMOV	%fcc1, c4, c3

	/* lanes 3/4 -> lanes 1/2; only the index is still needed */
	FCMP	%fcc0, c3, c1
	cmp	v3, v1
	mov	v1, vtie
	movl	%icc, v3, vtie		/* vtie = min(v1, v3) */
	CMOV	%fcc0, v3, v1
	move	%fcc0, vtie, v1

	/* Place the result in %i0 and return; `return` restores the
	   caller's register window. */
.LL20:
	mov	v1, %i0
	return	%i7 + 8
	nop

	/* ---- strided case: same scheme, X advanced by INCX per element ---- */
.LL50:
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Preload the first group. */
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X +  0 * SIZE], a7
	cmp	I, 0
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a8
	ble,pt	%icc, .LL52		/* only one group: drain it */
	add	X, INCX, X

.LL51:
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X
	FABS	a2, t2
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X
	FABS	a3, t3
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	FABS	a4, t4
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	FABS	a6, t2
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	FABS	a7, t3
	LDF	[X +  0 * SIZE], a7
	add	X, INCX, X
	FABS	a8, t4
	LDF	[X +  0 * SIZE], a8

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	cmp	I, 0			/* sets %icc for the bg below */
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	bg,pt	%icc, .LL51
	add	X, INCX, X		/* delay slot: step past a8 */

	/* Drain the last loaded group. */
.LL52:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	FABS	a6, t2
	FABS	a7, t3
	FABS	a8, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* Remaining N mod 8 elements, one at a time, in lane 1 only. */
.LL55:
	and	N, 7, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	FABS	a1, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	add	count, 1, count
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X		/* delay slot */

	/* Finish through the shared lane merge and return sequence. */
	ba,pt	%icc, .LL19
	nop

	EPILOGUE
440