1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Integer register aliases.
 *
 *   N     element count (drives both the 4-way unrolled loops and the
 *         remainder loops below)
 *   X, Y  base addresses of the two vectors; Y is read and written
 *   INCX, INCY  strides; the prologue scales them with ZBASE_SHIFT
 *   I     loop counter
 *
 * Two layouts exist.  In the 32-bit DOUBLE build the prologue loads X,
 * INCX, Y and INCY from the stack; in every other build X is used as it
 * arrives in %i5 and only INCX, Y and INCY are loaded from the stack.
 */
#if defined(DOUBLE) && !defined(__64BIT__)
#define N	%i0
#define X	%i1
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#define I	%i5
#else
#define N	%i0
#define X	%i5
#define INCX	%i1
#define Y	%i2
#define INCY	%i3
#define I	%i4
#endif

/* Store pointer used by the strided path; it trails the load pointer Y. */
#define YY	%l1

/*
 * Floating-point register aliases.
 *
 *   a1..a8  values loaded from X (odd index = real part, even = imaginary)
 *   b1..b8  values loaded from Y, same layout
 *   t1..t4  products of ALPHA_R / ALPHA_I with the a* values
 *   c1..c8  results that are stored back to Y
 *   ALPHA_R, ALPHA_I  real and imaginary part of the scalar
 *
 * The DOUBLE build uses even-numbered registers only, the single
 * precision build uses consecutive registers.
 */
#ifdef DOUBLE
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14
#define b1	%f16
#define b2	%f18
#define b3	%f20
#define b4	%f22
#define b5	%f24
#define b6	%f26
#define b7	%f28
#define b8	%f30

#define t1	%f32
#define t2	%f34
#define t3	%f36
#define	t4	%f38
#define c1	%f40
#define c2	%f42
#define c3	%f44
#define c4	%f46

#define c5	%f48
#define c6	%f50
#define c7	%f52
#define c8	%f54

#define ALPHA_R	%f60
#define ALPHA_I	%f62
#else
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7
#define b1	%f8
#define b2	%f9
#define b3	%f10
#define b4	%f11
#define b5	%f12
#define b6	%f13
#define b7	%f14
#define b8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define	t4	%f19
#define c1	%f20
#define c2	%f21
#define c3	%f22
#define c4	%f23

#define c5	%f24
#define c6	%f25
#define c7	%f26
#define c8	%f27

#define ALPHA_R	%f30
#define ALPHA_I	%f31
#endif

/*
 * ADD1 combines the ALPHA_I * imag(x) product into the real result,
 * ADD2 combines the ALPHA_R * imag(x) product into the imaginary result.
 *
 *   without CONJ:  re += ar*xr - ai*xi,  im += ar*xi + ai*xr   (alpha * x)
 *   with CONJ:     re += ar*xr + ai*xi,  im += ai*xr - ar*xi   (alpha * conj(x))
 */
#ifndef CONJ
#define ADD1	FSUB
#define ADD2	FADD
#else
#define ADD1	FADD
#define ADD2	FSUB
#endif
137
	/*
	 * Entry: fetch the arguments, scale the strides and choose between
	 * the unit-stride path (falls through) and the strided path (.LL50).
	 * PROLOGUE, SAVESP and STACK_START come from common.h.
	 */
	PROLOGUE
	SAVESP

#ifndef __64BIT__
#ifdef DOUBLE
	/*
	 * 32-bit, double precision: the words stored here are reloaded
	 * below as ALPHA_R / ALPHA_I, i.e. integer registers are moved to
	 * FP registers through memory.
	 */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]

	/* Remaining arguments are read from the stack. */
	ld	[%sp+ STACK_START + 32], X
	ld	[%sp+ STACK_START + 36], INCX
	ld	[%sp+ STACK_START + 40], Y
	ld	[%sp+ STACK_START + 44], INCY

	/*
	 * The second ldd also reads the word at +28, which is not written
	 * here.  NOTE(review): presumably the caller already placed it on
	 * the stack - confirm against the calling convention.
	 */
	ldd	[%sp + STACK_START + 16], ALPHA_R
	ldd	[%sp + STACK_START + 24], ALPHA_I
#else
	/* 32-bit, single precision: same register -> memory -> FP move. */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	/* X is not loaded in this build; it is used as it arrives. */
	ld	[%sp+ STACK_START + 28], INCX
	ld	[%sp+ STACK_START + 32], Y
	ld	[%sp+ STACK_START + 36], INCY

	ld	[%sp + STACK_START + 16], ALPHA_R
	ld	[%sp + STACK_START + 20], ALPHA_I
#endif
#else
	/* 64-bit: INCX, Y and INCY come from the stack as 64-bit words. */
	ldx	[%sp +  STACK_START + 56], INCX
	ldx	[%sp +  STACK_START + 64], Y
	ldx	[%sp +  STACK_START + 72], INCY
	/*
	 * Copy alpha out of the low FP registers before the a* / b* loads
	 * below overwrite them.
	 */
#ifdef DOUBLE
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
#else
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
#endif
#endif
	/*
	 * Scale both strides by ZBASE_SHIFT.  NOTE(review): presumably this
	 * converts an element stride into a byte stride - confirm in
	 * common.h.
	 */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/* Anything other than a scaled stride of 2 * SIZE on both vectors
	   takes the strided path. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop
	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

	/* I = N / 4: number of 4-element groups; none -> remainder loop. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop
191
	/*
	 * Unit-stride path: preload the first group of four complex
	 * elements of X (a1..a8) and Y (b1..b8).
	 */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], b1
	LDF	[Y +  1 * SIZE], b2

	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[Y +  2 * SIZE], b3
	LDF	[Y +  3 * SIZE], b4

	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[Y +  4 * SIZE], b5
	LDF	[Y +  5 * SIZE], b6

	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8
	LDF	[Y +  6 * SIZE], b7
	LDF	[Y +  7 * SIZE], b8

	/* Prime the pipeline: ALPHA_R products for elements 0 and 1 ... */
	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_R, a3, t3
	FMUL	ALPHA_R, a4, t4

	/* ... and the first half of element 0; t1 / t2 are reused for its
	   ALPHA_I products as soon as they have been consumed. */
	FADD	b1, t1, c1
	FMUL	ALPHA_I, a2, t1
	ADD2	b2, t2, c2
	FMUL	ALPHA_I, a1, t2

	/* Only one group: skip the steady-state loop and just drain. */
	deccc	I
	ble,pt	%icc, .LL12
	nop
225
/* Prefetch distance in values; it is multiplied by SIZE where used. */
#ifdef DOUBLE
#define PREFETCHSIZE  54
#else
#define PREFETCHSIZE 108
#endif

/*
 * Unit-stride steady-state loop, four complex elements per iteration.
 * It is software pipelined: each iteration finishes and stores the group
 * at Y + 0..7 while loading the next group from X / Y + 8..15 and
 * starting its arithmetic.  An a* or b* register is only reloaded after
 * its last use for the current group, so the instruction order matters.
 */
.LL11:
	FADD	b3, t3, c3
	prefetch [Y  + PREFETCHSIZE * SIZE], 0
	FMUL	ALPHA_I, a4, t3
	prefetch [X  + PREFETCHSIZE * SIZE], 0

	ADD2	b4, t4, c4
	LDF	[Y +  8 * SIZE], b1
	FMUL	ALPHA_I, a3, t4
	LDF	[X +  9 * SIZE], a2

	ADD1	c1, t1, c1
	LDF	[Y +  9 * SIZE], b2
	FMUL	ALPHA_R, a5, t1
	LDF	[X +  8 * SIZE], a1

	FADD	c2, t2, c2
	LDF	[Y + 10 * SIZE], b3
	FMUL	ALPHA_R, a6, t2
	LDF	[X + 11 * SIZE], a4

	/* Elements 0 and 1 of the current group are complete: store them. */
	ADD1	c3, t3, c3
	STF	c1, [Y +  0 * SIZE]
	FMUL	ALPHA_R, a7, t3
	LDF	[Y + 11 * SIZE], b4

	FADD	c4, t4, c4
	STF	c2, [Y +  1 * SIZE]
	FMUL	ALPHA_R, a8, t4
	LDF	[X + 10 * SIZE], a3

	FADD	b5, t1, c5
	STF	c3, [Y +  2 * SIZE]
	FMUL	ALPHA_I, a6, t1

	ADD2	b6, t2, c6
	STF	c4, [Y +  3 * SIZE]
	FMUL	ALPHA_I, a5, t2

	FADD	b7, t3, c7
	LDF	[Y + 12 * SIZE], b5
	FMUL	ALPHA_I, a8, t3
	LDF	[X + 13 * SIZE], a6

	ADD2	b8, t4, c8
	LDF	[Y + 13 * SIZE], b6
	FMUL	ALPHA_I, a7, t4
	LDF	[X + 12 * SIZE], a5

	/* From here on t1..t4 already belong to the next group. */
	ADD1	c5, t1, c5
	LDF	[Y + 14 * SIZE], b7
	FMUL	ALPHA_R, a1, t1
	LDF	[X + 15 * SIZE], a8

	FADD	c6, t2, c6
	LDF	[Y + 15 * SIZE], b8
	FMUL	ALPHA_R, a2, t2
	LDF	[X + 14 * SIZE], a7

	/* Elements 2 and 3 of the current group are complete: store them.
	   X is advanced here, after the last X load of this iteration. */
	ADD1	c7, t3, c7
	STF	c5, [Y +  4 * SIZE]
	FMUL	ALPHA_R, a3, t3
	add	X, 8 * SIZE, X

	FADD	c8, t4, c8
	STF	c6, [Y +  5 * SIZE]
	FMUL	ALPHA_R, a4, t4
	deccc	I

	FADD	b1, t1, c1
	STF	c7, [Y +  6 * SIZE]
	FMUL	ALPHA_I, a2, t1

	ADD2	b2, t2, c2
	STF	c8, [Y +  7 * SIZE]
	FMUL	ALPHA_I, a1, t2

	/* The branch tests the flags set by deccc above; Y is advanced in
	   the delay slot, so it moves on both the taken and the fall-through
	   path. */
	bg,pt	%icc, .LL11
	add	Y, 8 * SIZE, Y
311
312
/*
 * Unit-stride pipeline drain: finish the last group of four complex
 * elements that is already held in a1..a8 / b1..b8 (with t1..t4, c1 and
 * c2 partially computed), store it and advance X and Y past it.  No
 * further loads are issued.
 */
.LL12:
	FADD	b3, t3, c3
	FMUL	ALPHA_I, a4, t3
	ADD2	b4, t4, c4
	FMUL	ALPHA_I, a3, t4

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2

	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	FADD	b5, t1, c5
	FMUL	ALPHA_I, a6, t1
	ADD2	b6, t2, c6
	FMUL	ALPHA_I, a5, t2

	FADD	b7, t3, c7
	FMUL	ALPHA_I, a8, t3
	ADD2	b8, t4, c8
	FMUL	ALPHA_I, a7, t4

	ADD1	c5, t1, c5
	FADD	c6, t2, c6
	ADD1	c7, t3, c7
	FADD	c8, t4, c8

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]
	STF	c3, [Y +  2 * SIZE]
	STF	c4, [Y +  3 * SIZE]

	STF	c5, [Y +  4 * SIZE]
	STF	c6, [Y +  5 * SIZE]
	STF	c7, [Y +  6 * SIZE]
	STF	c8, [Y +  7 * SIZE]

	/* Leave X / Y pointing at the first element of the remainder. */
	add	X, 8 * SIZE, X
	add	Y, 8 * SIZE, Y
356
357
/*
 * Unit-stride remainder: handle the N & 3 complex elements that are left
 * after the 4-way unrolled loop, one element per iteration.
 */
.LL15:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	/* One complex element of X and of Y. */
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  1 * SIZE], b2

	/* All four partial products of alpha and the X element. */
	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_I, a2, t3
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_I, a1, t4

	/* Accumulate into the Y element; ADD1 / ADD2 select the sign that
	   distinguishes the CONJ build. */
	FADD	b1, t1, b1
	ADD2	b2, t2, b2
	ADD1	b1, t3, c1
	FADD	b2, t4, c2

	/* Count down; the flags set here are consumed by the branch below
	   (none of the instructions in between modify %icc). */
	add	I, -1, I
	cmp	I, 0

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]

	/* X is advanced in the branch delay slot. */
	add	Y, 2 * SIZE, Y
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X
388
/*
 * Exit of the unit-stride path.  The delay-slot instruction of `return`
 * executes after the register window has been restored, so %o0 here is
 * the caller's %o0, i.e. the return value.  Clear it so this path returns
 * 0 exactly like the strided exit (.LL59); the previous `clr %g0` wrote
 * the hard-wired zero register and therefore left the return value
 * unset.
 */
.LL19:
	return	%i7 + 8
	clr	%o0
392
/*
 * Strided path: taken when either scaled stride differs from 2 * SIZE.
 * Preloads the first group of four complex elements, stepping X and Y by
 * INCX / INCY after each element, and primes the pipeline with the
 * ALPHA_R products of the first two elements.
 */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	/* Delay slot: YY keeps the store position while Y runs ahead as
	   the load pointer. */
	mov	Y, YY

	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b2
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a3
	LDF	[Y +  0 * SIZE], b3
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b4
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a5
	add	I, -1, I
	LDF	[Y +  0 * SIZE], b5
	LDF	[X +  1 * SIZE], a6
	/* Flags for the branch at the end of this block; the add, LDF and
	   FMUL instructions in between do not modify %icc. */
	cmp	I, 0
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b6
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a7
	FMUL	ALPHA_R, a1, t1
	LDF	[Y +  0 * SIZE], b7
	FMUL	ALPHA_R, a2, t2
	LDF	[X +  1 * SIZE], a8
	FMUL	ALPHA_R, a3, t3
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b8
	FMUL	ALPHA_R, a4, t4

	/* Only one group: skip the steady-state loop and just drain. */
	ble,pt	%icc, .LL52
	add	Y, INCY, Y
431
432
/*
 * Strided steady-state loop, four complex elements per iteration.
 * Software pipelined: results for the group held in registers are
 * stored through YY while the next group is loaded through X / Y, which
 * run one group ahead.  Only c1..c4 are used; each is produced and
 * stored twice per iteration.
 */
.LL51:
	/* First half of elements 0 and 1; their registers are reloaded
	   for the next group right after the last use. */
	FADD	b1, t1, c1
	LDF	[Y +  0 * SIZE], b1
	FMUL	ALPHA_I, a2, t1
	LDF	[X +  1 * SIZE], a2
	ADD2	b2, t2, c2
	LDF	[Y +  1 * SIZE], b2
	add	Y, INCY, Y
	FMUL	ALPHA_I, a1, t2
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X

	FADD	b3, t3, c3
	LDF	[Y +  0 * SIZE], b3
	FMUL	ALPHA_I, a4, t3
	LDF	[X +  1 * SIZE], a4
	ADD2	b4, t4, c4
	LDF	[Y +  1 * SIZE], b4
	add	Y, INCY, Y
	FMUL	ALPHA_I, a3, t4
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X

	/* Finish elements 0 and 1, start the ALPHA_R products of 2 and 3. */
	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2
	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	/* Store elements 0 and 1; each c register is reused for elements
	   2 and 3 immediately after its store. */
	STF	c1, [YY +  0 * SIZE]
	FADD	b5, t1, c1
	FMUL	ALPHA_I, a6, t1
	STF	c2, [YY +  1 * SIZE]
	ADD2	b6, t2, c2
	FMUL	ALPHA_I, a5, t2
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	FADD	b7, t3, c3
	FMUL	ALPHA_I, a8, t3
	STF	c4, [YY +  1 * SIZE]
	ADD2	b8, t4, c4
	FMUL	ALPHA_I, a7, t4
	add	YY, INCY, YY

	/* Finish elements 2 and 3 while loading the second half of the
	   next group and starting its ALPHA_R products. */
	LDF	[X +  0 * SIZE], a5
	ADD1	c1, t1, c1
	LDF	[Y +  0 * SIZE], b5
	FMUL	ALPHA_R, a1, t1
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	FADD	c2, t2, c2
	LDF	[Y +  1 * SIZE], b6
	add	Y, INCY, Y
	FMUL	ALPHA_R, a2, t2
	LDF	[X +  0 * SIZE], a7
	ADD1	c3, t3, c3
	LDF	[Y +  0 * SIZE], b7
	FMUL	ALPHA_R, a3, t3
	LDF	[X +  1 * SIZE], a8
	add	X, INCX, X
	FADD	c4, t4, c4
	LDF	[Y +  1 * SIZE], b8
	add	Y, INCY, Y
	FMUL	ALPHA_R, a4, t4

	/* Store elements 2 and 3 and count down. */
	STF	c1, [YY +  0 * SIZE]
	add	I, -1, I
	STF	c2, [YY +  1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	cmp	I, 0
	STF	c4, [YY +  1 * SIZE]

	/* YY is advanced in the delay slot on both paths. */
	bg,pt	%icc, .LL51
	add	YY, INCY, YY
511
/*
 * Strided pipeline drain: finish and store the last group of four
 * complex elements already held in a1..a8 / b1..b8 (t1..t4 hold the
 * ALPHA_R products of the first two).  Stores go through YY; X and Y
 * already point past the group, so they are not touched here.
 */
.LL52:
	FADD	b1, t1, c1
	FMUL	ALPHA_I, a2, t1
	ADD2	b2, t2, c2
	FMUL	ALPHA_I, a1, t2

	FADD	b3, t3, c3
	FMUL	ALPHA_I, a4, t3
	ADD2	b4, t4, c4
	FMUL	ALPHA_I, a3, t4

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2
	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	/* Elements 0 and 1. */
	STF	c1, [YY +  0 * SIZE]
	STF	c2, [YY +  1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	STF	c4, [YY +  1 * SIZE]
	add	YY, INCY, YY

	FADD	b5, t1, c1
	FMUL	ALPHA_I, a6, t1
	ADD2	b6, t2, c2
	FMUL	ALPHA_I, a5, t2
	FADD	b7, t3, c3
	FMUL	ALPHA_I, a8, t3
	ADD2	b8, t4, c4
	FMUL	ALPHA_I, a7, t4

	ADD1	c1, t1, c1
	FADD	c2, t2, c2
	ADD1	c3, t3, c3
	FADD	c4, t4, c4

	/* Elements 2 and 3. */
	STF	c1, [YY +  0 * SIZE]
	STF	c2, [YY +  1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	STF	c4, [YY +  1 * SIZE]
	add	YY, INCY, YY
559
/*
 * Strided remainder: handle the N & 3 complex elements that are left
 * after the 4-way unrolled loop, one element per iteration.  Loads and
 * stores both go through Y here; YY is not used.
 */
.LL55:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	/* One complex element of X and of Y. */
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  1 * SIZE], b2

	/* All four partial products of alpha and the X element. */
	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_I, a2, t3
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_I, a1, t4

	/* Accumulate into the Y element, with the loop bookkeeping
	   interleaved; the flags set by cmp are consumed by the branch
	   below (nothing in between modifies %icc). */
	FADD	b1, t1, b1
	add	I, -1, I
	ADD2	b2, t2, b2
	cmp	I, 0
	ADD1	b1, t3, c1
	FADD	b2, t4, c2

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]

	/* X is advanced in the branch delay slot. */
	add	Y, INCY, Y
	bg,pt	%icc, .LL56
	add	X, INCX, X
589
/*
 * Exit of the strided path.  The delay-slot instruction of `return`
 * executes after the register window has been restored, so clearing %o0
 * there sets the value seen by the caller to 0.
 */
.LL59:
	return	%i7 + 8
	clr	%o0

	EPILOGUE
595