1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Integer register aliases used by the routine below.
 *
 * N is the element count: the loops run (N >> 2) unrolled iterations of
 * four elements followed by (N & 3) single-element iterations.  Each
 * element occupies two consecutive SIZE-wide values (offsets 0 and 1).
 */
42#define N	%i0
43
/*
 * X is the vector pointer and INCX its stride.  The registers differ
 * between the 32-bit DOUBLE build and every other build; presumably this
 * follows from where the calling convention leaves the arguments once
 * alpha has been accounted for -- confirm against the ABI.
 */
44#if defined(DOUBLE) && !defined(__64BIT__)
45#define X	%i3
46#define INCX	%i4
47#else
48#define X	%i5
49#define INCX	%i3
50#endif
51
/*
 * I  - loop counter.
 * XX - store pointer used only by the strided scaling loop (.LL151),
 *      where X runs ahead doing the loads.
 */
52#define I	%i1
53#define XX	%i2
54
/*
 * Floating-point register aliases.
 *
 * c1..c8   values loaded from the vector (also reused for results in the
 *          single-element and strided loops).
 * t1..t8   products in the unit-stride pipelined loop.
 * c9..c16  products in the strided loop.
 * s1..s8   results in the unit-stride pipelined loop.
 *
 * c9..c16 and s1..s8 name the SAME registers; this is safe only because
 * the two sets are used by different code paths.
 *
 * The DOUBLE variant uses even-numbered registers only; the
 * single-precision variant uses consecutive registers.
 */
55#ifdef DOUBLE
56#define c1	%f0
57#define c2	%f2
58#define c3	%f4
59#define c4	%f6
60#define c5	%f8
61#define c6	%f10
62#define c7	%f12
63#define c8	%f14
64
65#define t1	%f16
66#define t2	%f18
67#define t3	%f20
68#define t4	%f22
69#define t5	%f24
70#define t6	%f26
71#define t7	%f28
72#define t8	%f30
73
74#define c9	%f32
75#define c10	%f34
76#define c11	%f36
77#define c12	%f38
78#define c13	%f40
79#define c14	%f42
80#define c15	%f44
81#define c16	%f46
82
83#define s1	%f32
84#define s2	%f34
85#define s3	%f36
86#define s4	%f38
87#define s5	%f40
88#define s6	%f42
89#define s7	%f44
90#define s8	%f46
91
/*
 * FZERO is compared against alpha and stored to the vector when alpha is
 * zero.  It is never written explicitly in this file; presumably the
 * FCLR(17) in the routine clears it -- confirm against common.h.
 * ALPHA_R / ALPHA_I hold the two components of the scale factor.
 */
92#define FZERO	%f48
93#define ALPHA_R	%f50
94#define ALPHA_I	%f52
95#else
96#define c1	%f0
97#define c2	%f1
98#define c3	%f2
99#define c4	%f3
100#define c5	%f4
101#define c6	%f5
102#define c7	%f6
103#define c8	%f7
104
105#define c9	%f8
106#define c10	%f9
107#define c11	%f10
108#define c12	%f11
109#define c13	%f12
110#define c14	%f13
111#define c15	%f14
112#define c16	%f15
113
114#define s1	%f8
115#define s2	%f9
116#define s3	%f10
117#define s4	%f11
118#define s5	%f12
119#define s6	%f13
120#define s7	%f14
121#define s8	%f15
122
123#define t1	%f16
124#define t2	%f17
125#define t3	%f18
126#define t4	%f19
127#define t5	%f20
128#define t6	%f21
129#define t7	%f22
130#define t8	%f23
131
/*
 * Single-precision FZERO / ALPHA_R / ALPHA_I.  The routine issues
 * FCLR(24) in this configuration, matching the register number of FZERO.
 */
132#define FZERO	%f24
133#define ALPHA_R	%f25
134#define ALPHA_I	%f26
135#endif
136
/* Prefetch distance ahead of X; the prefetch address multiplies it by SIZE. */
137#define PREFETCHSIZE 128
138
/*
 * Routine entry.  The symbol name and frame setup come from the PROLOGUE
 * and SAVESP macros, which are defined outside this file.
 *
 * Overall behaviour of the routine, for each of N elements read as a pair
 * (re, im) at [X + 0 * SIZE], [X + 1 * SIZE]:
 *
 *   ALPHA_R == 0 and ALPHA_I == 0:  re = FZERO, im = FZERO
 *   otherwise:                      re' = ALPHA_R * re - ALPHA_I * im
 *                                   im' = ALPHA_R * im + ALPHA_I * re
 *
 * Every exit path returns with %o0 cleared.
 */
139	PROLOGUE
140	SAVESP
141
/*
 * Fetch ALPHA_R, ALPHA_I, X and INCX.
 *
 * 32-bit builds: incoming integer registers are spilled to the stack and
 * then reloaded with LDF into ALPHA_R / ALPHA_I, i.e. alpha arrives in
 * integer registers (and partly on the stack for DOUBLE, where offset
 * +28 is read by the LDF at +24 but never written here).  The remaining
 * arguments are loaded from the stack.  The offsets presumably follow
 * the 32-bit SPARC calling convention -- confirm against the ABI and the
 * definition of STACK_START.
 */
142#ifndef __64BIT__
143#ifdef DOUBLE
144	st	%i3, [%sp + STACK_START + 16]
145	st	%i4, [%sp + STACK_START + 20]
146	st	%i5, [%sp + STACK_START + 24]
147
/* X is %i3 and INCX is %i4 here, so they may only be loaded after the spills above. */
148	ld	[%sp+ STACK_START + 32], X
149	ld	[%sp+ STACK_START + 36], INCX
150#else
151	st	%i3, [%sp + STACK_START + 16]
152	st	%i4, [%sp + STACK_START + 24]
/* INCX is %i3 here, so it is loaded only after %i3 has been spilled. */
153	ld	[%sp+  STACK_START + 28], INCX
154#endif
155	LDF	[%sp + STACK_START + 16], ALPHA_R
156	LDF	[%sp + STACK_START + 24], ALPHA_I
157#else
/*
 * 64-bit builds: INCX comes from the stack and alpha is copied out of
 * floating-point registers (%f6/%f8 for DOUBLE, %f7/%f9 otherwise).  X is
 * used directly from %i5.
 */
158	ldx	[%sp + STACK_START + 56], INCX
159#ifdef DOUBLE
160	FMOV	%f6, ALPHA_R
161	FMOV	%f8, ALPHA_I
162#else
163	FMOV	%f7, ALPHA_R
164	FMOV	%f9, ALPHA_I
165#endif
166#endif
167
/*
 * FCLR is defined outside this file.  It presumably zeroes FZERO: 24 is
 * the register number of the single-precision FZERO (%f24), and 17 looks
 * like the 5-bit instruction encoding of the double register %f48 --
 * TODO confirm against common.h.
 */
168#ifdef DOUBLE
169	FCLR(17)
170#else
171	FCLR(24)
172#endif
173
/*
 * Dispatch on alpha.  If either component compares not-equal to FZERO the
 * general scaling code at .LL100 is used; otherwise execution falls
 * through to the code that stores FZERO.
 */
174	FCMP	ALPHA_R, FZERO
175	fbne	.LL100
176	sll	INCX, ZBASE_SHIFT, INCX	/* delay slot (not annulled, so it runs on both paths): scale INCX; it is later compared with 2 * SIZE and added to X */
177
178	FCMP	ALPHA_I, FZERO
179	fbne	.LL100
180	nop
/*
 * alpha == 0 paths: reached when both FCMP/fbne tests against FZERO fell
 * through.  Every element is overwritten with FZERO without being read.
 *
 * INCX == 2 * SIZE selects the contiguous variant (.LL11/.LL16); any
 * other stride uses .LL50.
 *
 * NOTE(review): all loop branches in this file test %icc (the 32-bit
 * condition codes), also in the __64BIT__ build -- confirm that N is
 * never expected to exceed the 32-bit range.
 */
181	cmp	INCX, 2 * SIZE
182	bne	.LL50
183	nop
184	sra	N, 2, I			/* I = number of 4-element groups */
185	cmp	I, 0
186	ble,pn	%icc, .LL15
187	nop
188
/*
 * Contiguous store loop, four (re, im) pairs = 8 stores per iteration.
 * X is advanced by 8 * SIZE before the last two stores, which therefore
 * use negative offsets; the final store sits in the branch delay slot.
 */
189.LL11:
190	prefetch [X  + PREFETCHSIZE * SIZE], 0
191
192	STF	FZERO, [X +  0 * SIZE]
193	add	I, -1, I
194	STF	FZERO, [X +  1 * SIZE]
195	cmp	I, 0
196	STF	FZERO, [X +  2 * SIZE]
197	STF	FZERO, [X +  3 * SIZE]
198	STF	FZERO, [X +  4 * SIZE]
199	STF	FZERO, [X +  5 * SIZE]
200	add	X, 8 * SIZE, X
201	STF	FZERO, [X -  2 * SIZE]
202	bg,pt	%icc, .LL11
203	STF	FZERO, [X -  1 * SIZE]	/* delay slot: eighth store of the group */
204
/* Remainder: N & 3 elements, one (re, im) pair per iteration. */
205.LL15:
206	and	N, 3, I
207	cmp	I,  0
208	ble,a,pn %icc, .LL19
209	nop
210
211.LL16:
212	STF	FZERO, [X +  0 * SIZE]
213	STF	FZERO, [X +  1 * SIZE]
214	add	I, -1, I
215	cmp	I, 0
216	bg,pt	%icc, .LL16
217	add	X, 2 * SIZE, X		/* delay slot: advance to the next pair */
218
219.LL19:
220	return	%i7 + 8
221	clr	%o0			/* delay slot: return value 0 */
222
/*
 * Strided alpha == 0 path: same as above, but X is advanced by INCX
 * after each (re, im) pair instead of by a fixed 2 * SIZE.
 */
223.LL50:
224	sra	N, 2, I			/* I = number of 4-element groups */
225	cmp	I, 0
226	ble,pn	%icc, .LL55
227	nop
228
/* Four pairs per iteration; the counter update is interleaved with the stores. */
229.LL51:
230	STF	FZERO, [X +  0 * SIZE]
231	add	I, -1, I
232	STF	FZERO, [X +  1 * SIZE]
233	add	X, INCX, X
234	STF	FZERO, [X +  0 * SIZE]
235	cmp	I, 0
236	STF	FZERO, [X +  1 * SIZE]
237	add	X, INCX, X
238	STF	FZERO, [X +  0 * SIZE]
239	STF	FZERO, [X +  1 * SIZE]
240	add	X, INCX, X
241	STF	FZERO, [X +  0 * SIZE]
242	STF	FZERO, [X +  1 * SIZE]
243	bg,pt	%icc, .LL51
244	add	X, INCX, X		/* delay slot: step past the fourth pair */
245
/* Remainder: N & 3 elements with stride INCX. */
246.LL55:
247	and	N, 3, I
248	cmp	I,  0
249	ble,a,pn %icc, .LL59
250	nop
251
252.LL56:
253	STF	FZERO, [X +  0 * SIZE]
254	add	I, -1, I
255	STF	FZERO, [X +  1 * SIZE]
256	cmp	I, 0
257	bg,pt	%icc, .LL56
258	add	X, INCX, X		/* delay slot: advance to the next pair */
259
260.LL59:
261	return	%i7 + 8
262	clr	%o0			/* delay slot: return value 0 */
263
/*
 * alpha != 0: scale every (re, im) pair in place:
 *
 *   re' = ALPHA_R * re - ALPHA_I * im
 *   im' = ALPHA_R * im + ALPHA_I * re
 *
 * INCX == 2 * SIZE uses the software-pipelined loop below; any other
 * stride branches to .LL150.  I = N >> 2 is computed in the delay slot,
 * so it is valid on both paths.
 */
264.LL100:
265	cmp	INCX, 2 * SIZE
266	bne	.LL150
267	sra	N, 2, I			/* delay slot: I = number of 4-element groups */
268
269	cmp	I, 0
270	ble,pn	%icc, .LL115
271	nop
272
/*
 * Pipeline fill: load the first group of four pairs (c1..c8), start the
 * products for pairs 0 and 1, and already reload c1..c4 with the first
 * two pairs of the NEXT group (offsets 8..11).
 *
 * NOTE(review): these look-ahead loads are issued before it is known
 * whether a next group exists.  Together with the loads at offsets 16..19
 * inside .LL111, the code always reads two pairs beyond the last
 * unrolled group.  When N is a multiple of 4, or N & 3 == 1, that lies
 * partly or wholly beyond the N elements processed.  The values are
 * never used or stored in that case, but confirm the over-read is
 * acceptable to callers.
 */
273	LDF	[X +  0 * SIZE], c1
274	LDF	[X +  1 * SIZE], c2
275	LDF	[X +  2 * SIZE], c3
276	LDF	[X +  3 * SIZE], c4
277	LDF	[X +  4 * SIZE], c5
278	LDF	[X +  5 * SIZE], c6
279	LDF	[X +  6 * SIZE], c7
280	LDF	[X +  7 * SIZE], c8
281
282	FMUL	ALPHA_R, c1, t1
283	FMUL	ALPHA_I, c2, t3
284
285	FMUL	ALPHA_I, c1, t2
286	LDF	[X +  8 * SIZE], c1
287	FMUL	ALPHA_R, c2, t4
288	LDF	[X +  9 * SIZE], c2
289
290	FMUL	ALPHA_R, c3, t5
291	deccc	I			/* sets the condition codes tested by the ble below */
292	FMUL	ALPHA_I, c4, t7
293	FSUB	t1,  t3,  s1
294
295	FMUL	ALPHA_I, c3, t6
296	LDF	[X + 10 * SIZE], c3
297	FMUL	ALPHA_R, c4, t8
298	LDF	[X + 11 * SIZE], c4
299	FADD	t4,  t2,  s2
300
301	ble,pn	%icc, .LL112
302	nop
303
/*
 * Steady state.  On entry s1/s2 hold the finished result for pair 0 of
 * the current group, t5..t8 hold the products for pair 1, c5..c8 hold
 * pairs 2 and 3, and c1..c4 hold pairs 0 and 1 of the next group.
 * Each iteration stores the eight results of the current group at
 * offsets 0..7, loads offsets 12..19, and leaves the same state for the
 * next group.  Each c register is reloaded immediately after its last
 * use as a multiplicand, so instruction order must be preserved.
 */
304.LL111:
305	prefetch [X  + PREFETCHSIZE * SIZE], 0
306
307	FMUL	ALPHA_R, c5, t1
308	FMUL	ALPHA_I, c6, t3
309	FSUB	t5,  t7,  s3
310	STF	s1, [X +  0 * SIZE]
311
312	FMUL	ALPHA_I, c5, t2
313	LDF	[X + 12 * SIZE], c5
314	FMUL	ALPHA_R, c6, t4
315	LDF	[X + 13 * SIZE], c6
316
317	FADD	t8,  t6,  s4
318	STF	s2, [X +  1 * SIZE]
319
320	FMUL	ALPHA_R, c7, t5
321	FMUL	ALPHA_I, c8, t7
322	FSUB	t1,  t3,  s5
323	STF	s3, [X +  2 * SIZE]
324
325	FMUL	ALPHA_I, c7, t6
326	LDF	[X + 14 * SIZE], c7
327	FMUL	ALPHA_R, c8, t8
328	LDF	[X + 15 * SIZE], c8
329
330	FADD	t4,  t2,  s6
331	STF	s4, [X +  3 * SIZE]
332
/* From here on the products belong to the next group (c1..c4 were preloaded). */
333	FMUL	ALPHA_R, c1, t1
334	FMUL	ALPHA_I, c2, t3
335	FSUB	t5,  t7,  s7
336	STF	s5, [X +  4 * SIZE]
337
338	FMUL	ALPHA_I, c1, t2
339	LDF	[X + 16 * SIZE], c1
340	FMUL	ALPHA_R, c2, t4
341	LDF	[X + 17 * SIZE], c2
342
343	FADD	t8,  t6,  s8
344	STF	s6, [X +  5 * SIZE]
345
346	FMUL	ALPHA_R, c3, t5
347	deccc	I			/* sets the condition codes tested by the bg below */
348	FMUL	ALPHA_I, c4, t7
349	FSUB	t1,  t3,  s1
350	STF	s7, [X +  6 * SIZE]
351
352	FMUL	ALPHA_I, c3, t6
353	LDF	[X + 18 * SIZE], c3
354	FMUL	ALPHA_R, c4, t8
355	LDF	[X + 19 * SIZE], c4
356
357	FADD	t4,  t2,  s2
358	STF	s8, [X +  7 * SIZE]
359
360	bg,pt	%icc, .LL111
361	add	X, 8 * SIZE, X		/* delay slot: advance to the next group */
362
363
/*
 * Pipeline drain: finish and store the last group using the values
 * already held in registers; no further loads are issued.
 */
364.LL112:
365	FMUL	ALPHA_R, c5, t1
366	FMUL	ALPHA_I, c6, t3
367	FSUB	t5,  t7,  s3
368	STF	s1, [X +  0 * SIZE]
369
370	FMUL	ALPHA_I, c5, t2
371	FMUL	ALPHA_R, c6, t4
372	FADD	t8,  t6,  s4
373	STF	s2, [X +  1 * SIZE]
374
375	FMUL	ALPHA_R, c7, t5
376	FMUL	ALPHA_I, c8, t7
377	FSUB	t1,  t3,  s5
378	STF	s3, [X +  2 * SIZE]
379
380	FMUL	ALPHA_I, c7, t6
381	FMUL	ALPHA_R, c8, t8
382	FADD	t4,  t2,  s6
383	STF	s4, [X +  3 * SIZE]
384
385	FSUB	t5,  t7,  s7
386	FADD	t8,  t6,  s8
387
388	STF	s5, [X +  4 * SIZE]
389	STF	s6, [X +  5 * SIZE]
390	STF	s7, [X +  6 * SIZE]
391	STF	s8, [X +  7 * SIZE]
392	add	X, 8 * SIZE, X
393
/* Remainder: N & 3 elements, one pair per iteration, no pipelining. */
394.LL115:
395	and	N, 3, I
396	cmp	I,  0
397	ble,a,pn %icc, .LL119
398	nop
399
400.LL116:
401	LDF	[X +  0 * SIZE], c1
402	LDF	[X +  1 * SIZE], c2
403
/* c1 and c2 are overwritten in place, so all four products must be formed in this order. */
404	FMUL	ALPHA_R, c1, c3
405	FMUL	ALPHA_I, c1, c4
406	FMUL	ALPHA_I, c2, c1
407	FMUL	ALPHA_R, c2, c2
408
409	FSUB	c3, c1, c1		/* re' = ALPHA_R * re - ALPHA_I * im */
410	FADD	c2, c4, c2		/* im' = ALPHA_R * im + ALPHA_I * re */
411
412	STF	c1, [X +  0 * SIZE]
413	STF	c2, [X +  1 * SIZE]
414
415	add	I, -1, I
416	cmp	I, 0
417	bg,pt	%icc, .LL116
418	add	X, 2 * SIZE, X		/* delay slot: advance to the next pair */
419
420.LL119:
421	return	%i7 + 8
422	clr	%o0			/* delay slot: return value 0 */
423
/*
 * alpha != 0, INCX != 2 * SIZE: scale every (re, im) pair in place with
 * stride INCX:
 *
 *   re' = ALPHA_R * re - ALPHA_I * im
 *   im' = ALPHA_R * im + ALPHA_I * re
 *
 * X is the load pointer and XX the store pointer.  XX is initialised in
 * the delay slot below (the branch is not annulled, so this happens on
 * both paths).  I is recomputed here although the branch at .LL100 that
 * leads to this label already computed it in its delay slot.
 */
424.LL150:
425	sra	N, 2, I
426	cmp	I, 0
427	ble,pn	%icc, .LL155
428	mov	X, XX			/* delay slot: XX = X */
429
/*
 * Four pairs per iteration.  All eight values are loaded through X,
 * advancing it by INCX after each pair, with the first products
 * interleaved between the loads.  Results are then written back through
 * XX, which follows the same four INCX steps.  After the loop both
 * pointers have advanced by the same amount.
 */
430.LL151:
431	LDF	[X +  0 * SIZE], c1
432	LDF	[X +  1 * SIZE], c2
433	add	X, INCX, X
434	LDF	[X +  0 * SIZE], c3
435	FMUL	ALPHA_R, c1, c9
436	LDF	[X +  1 * SIZE], c4
437	FMUL	ALPHA_I, c1, c10
438	add	X, INCX, X
439	LDF	[X +  0 * SIZE], c5
440	FMUL	ALPHA_I, c2, c1
441	LDF	[X +  1 * SIZE], c6
442	FMUL	ALPHA_R, c2, c2
443	add	X, INCX, X
444	LDF	[X +  0 * SIZE], c7
445	FMUL	ALPHA_R, c3, c11
446	LDF	[X +  1 * SIZE], c8
447	FMUL	ALPHA_I, c3, c12
448	add	X, INCX, X
449
/*
 * For each pair (cA, cB) the products with cA go to fresh registers
 * (c9..c16) first; cA and cB are then overwritten with the products of
 * cB.  The order of the four FMULs per pair must be preserved.
 */
450	FMUL	ALPHA_I, c4, c3
451	FMUL	ALPHA_R, c4, c4
452
453	FMUL	ALPHA_R, c5, c13
454	FMUL	ALPHA_I, c5, c14
455	FMUL	ALPHA_I, c6, c5
456	FMUL	ALPHA_R, c6, c6
457
458	FMUL	ALPHA_R, c7, c15
459	FSUB	c9,  c1,  c1
460	FMUL	ALPHA_I, c7, c16
461	FADD	c2,  c10, c2
462	FMUL	ALPHA_I, c8, c7
463	FSUB	c11, c3,  c3
464	FMUL	ALPHA_R, c8, c8
465	FADD	c4,  c12, c4
466
/* Write back through XX; the counter update is interleaved with the stores. */
467	STF	c1, [XX +  0 * SIZE]
468	FSUB	c13, c5,  c5
469	add	I, -1, I
470	STF	c2, [XX +  1 * SIZE]
471	FADD	c6,  c14, c6
472	add	XX, INCX, XX
473	STF	c3, [XX +  0 * SIZE]
474	FSUB	c15, c7,  c7
475	cmp	I, 0
476	STF	c4, [XX +  1 * SIZE]
477	FADD	c8,  c16, c8
478	add	XX, INCX, XX
479	STF	c5, [XX +  0 * SIZE]
480	STF	c6, [XX +  1 * SIZE]
481	add	XX, INCX, XX
482	STF	c7, [XX +  0 * SIZE]
483	STF	c8, [XX +  1 * SIZE]
484	bg,pt	%icc, .LL151
485	add	XX, INCX, XX		/* delay slot: step XX past the fourth pair */
486
/*
 * Remainder: N & 3 elements, one pair per iteration.  Only X is used
 * here, for both the load and the store; it already points just past the
 * unrolled part because the loads above advanced it.
 */
487.LL155:
488	and	N, 3, I
489	cmp	I,  0
490	ble,a,pn %icc, .LL159
491	nop
492
493.LL156:
494	LDF	[X +  0 * SIZE], c1
495	LDF	[X +  1 * SIZE], c2
496
/* c1 and c2 are overwritten in place, so all four products must be formed in this order. */
497	FMUL	ALPHA_R, c1, c3
498	FMUL	ALPHA_I, c1, c4
499	FMUL	ALPHA_I, c2, c1
500	FMUL	ALPHA_R, c2, c2
501
502	FSUB	c3, c1, c1		/* re' = ALPHA_R * re - ALPHA_I * im */
503	FADD	c2, c4, c2		/* im' = ALPHA_R * im + ALPHA_I * re */
504
505	STF	c1, [X +  0 * SIZE]
506	STF	c2, [X +  1 * SIZE]
507
508	add	I, -1, I
509	cmp	I, 0
510	bg,pt	%icc, .LL156
511	add	X, INCX, X		/* delay slot: advance to the next pair */
512
513.LL159:
514	return	%i7 + 8
515	clr	%o0			/* delay slot: return value 0 */
516
517
/* EPILOGUE is a macro defined outside this file; presumably it closes the symbol opened by PROLOGUE -- confirm. */
518	EPILOGUE
519