1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Prefetch distance, counted in elements: the prefetch instructions in the
 * main loops address [A1 + PREFETCHSIZE * SIZE] and [A2 + PREFETCHSIZE * SIZE].
 * The DOUBLE build uses half the element count of the other build.
 */
42#ifdef DOUBLE
43#define PREFETCHSIZE 44
44#else
45#define PREFETCHSIZE 88
46#endif
47
/*
 * Arguments kept in the "in" registers of the register window.
 * M and N are used exactly as they arrive.  LDA, X and INCX are (re)loaded
 * from the stack on every build path of the prologue.  A is loaded from the
 * stack only in the 32-bit DOUBLE build and otherwise stays in %i5.
 */
48#define M	%i0
49#define N	%i1
50#define A	%i5
51#define LDA	%i2
52#define X	%i3
53#define INCX	%i4
54
/* Arguments that are always fetched from the stack into local registers. */
55#define Y	%l0
56#define INCY	%l1
57#define BUFFER	%l2
58
/* Loop counters: I counts row blocks (inner loops), J counts outer passes. */
59#define I	%l3
60#define J	%l5
61
/*
 * Running pointers into the matrix.  Only A1 and A2 are referenced by the
 * code visible in this file.  A3 and A4 are defined but not used here.
 */
62#define A1	%o0
63#define A2	%o1
64#define A3	%o2
65#define A4	%o3
66
/*
 * Y1 is the running pointer into the accumulation target.
 * YY is the start of that target: Y itself when INCY equals 2 * SIZE,
 * BUFFER otherwise (see the prologue).
 */
67#define Y1	%l4
68#define YY	%l6
69
/*
 * Floating-point register map.
 *   t1..t4   products waiting to be added into the accumulators
 *   y1..y8   accumulators holding elements of the y target
 *   a1..a16  elements loaded from the matrix (also reused as scratch
 *            for alpha * x and for BUFFER values in the copy-back code)
 *   x1..x4   the alpha-scaled x values for the current one or two columns
 *
 * FZERO, ALPHA_R and ALPHA_I share registers with a14, a15 and a16.
 * That is why the code reloads alpha from its stack slots at the start of
 * every column pass, and why FZERO is only used before the first matrix load.
 *
 * DOUBLE build: even-numbered registers %f0..%f62.
 */
70#ifdef DOUBLE
71#define t1	%f0
72#define	t2 	%f2
73#define t3	%f4
74#define	t4 	%f6
75
76#define y1	%f8
77#define y2	%f10
78#define y3	%f12
79#define y4	%f14
80#define y5	%f16
81#define y6	%f18
82#define y7	%f20
83#define y8	%f22
84
85#define a1	%f24
86#define a2	%f26
87#define a3	%f28
88#define a4	%f30
89#define a5	%f32
90#define a6	%f34
91#define a7	%f36
92#define a8	%f38
93
94#define a9	%f40
95#define a10	%f42
96#define a11	%f44
97#define a12	%f46
98#define a13	%f48
99#define a14	%f50
100#define a15	%f52
101#define a16	%f54
102
103#define x1	%f56
104#define x2	%f58
105#define x3	%f60
106#define x4	%f62
107
/* Aliases of a14 / a15 / a16 (see the note at the top of this map). */
108#define FZERO	%f50
109#define ALPHA_R	%f52
110#define ALPHA_I	%f54
111#else
/* Non-DOUBLE build: the same roles packed into %f0..%f31. */
112#define t1	%f0
113#define	t2 	%f1
114#define t3	%f2
115#define	t4 	%f3
116
117#define y1	%f4
118#define y2	%f5
119#define y3	%f6
120#define y4	%f7
121#define y5	%f8
122#define y6	%f9
123#define y7	%f10
124#define y8	%f11
125
126#define a1	%f12
127#define a2	%f13
128#define a3	%f14
129#define a4	%f15
130#define a5	%f16
131#define a6	%f17
132#define a7	%f18
133#define a8	%f19
134
135#define a9	%f20
136#define a10	%f21
137#define a11	%f22
138#define a12	%f23
139#define a13	%f24
140#define a14	%f25
141#define a15	%f26
142#define a16	%f27
143
144#define x1	%f28
145#define x2	%f29
146#define x3	%f30
147#define x4	%f31
148
/* Aliases of a14 / a15 / a16 (see the note at the top of this map). */
149#define FZERO	%f25
150#define ALPHA_R	%f26
151#define ALPHA_I	%f27
152#endif
153
/*
 * Stack slots from which alpha is reloaded with LDF at the top of every
 * column pass.  The prologue fills them.
 *   32-bit build: ALPHA_R at +16, ALPHA_I at +20 (non-DOUBLE) or +24 (DOUBLE)
 *   64-bit build: ALPHA_R at +32, ALPHA_I at +40
 * STACK_START is defined outside this file.
 */
154#ifndef __64BIT__
155#define STACK_ALPHA_R	[%sp + STACK_START + 16]
156#ifndef DOUBLE
157#define STACK_ALPHA_I	[%sp + STACK_START + 20]
158#else
159#define STACK_ALPHA_I	[%sp + STACK_START + 24]
160#endif
161#else
162#define STACK_ALPHA_R	[%sp + STACK_START + 32]
163#define STACK_ALPHA_I	[%sp + STACK_START + 40]
164#endif
165
/*
 * Sign selection for the terms that involve the odd-offset (second) part of
 * a matrix element.  Without CONJ: FSUBX subtracts and FADDX adds.
 * With CONJ the two are swapped, which matches using the conjugate of the
 * matrix element.
 */
166#ifndef CONJ
167#define	FSUBX	FSUB
168#define FADDX	FADD
169#else
170#define	FSUBX	FADD
171#define FADDX	FSUB
172#endif
173
/*
 * Kernel entry.  The symbol name is produced by the PROLOGUE macro (defined
 * outside this file) and is not visible here.
 *
 * What the visible code does: for each j in 0..N-1 it forms the complex
 * product alpha * x[j] (sign pattern chosen by XCONJ) and adds column j of
 * the matrix, multiplied by that product, to an M-element complex vector
 * (sign pattern chosen by CONJ through FSUBX / FADDX).  Elements are pairs
 * stored at offsets 0 * SIZE and 1 * SIZE.
 *
 * When INCY equals 2 * SIZE after scaling, the sums go straight into Y.
 * Otherwise they go into BUFFER (zeroed below) and BUFFER is added to the
 * strided Y at .LL990.
 *
 * Returns 0 in %o0.  Nothing is done when M <= 0 or N <= 0.
 */
174	PROLOGUE
175	SAVESP
176
/*
 * 32-bit build: alpha arrives in integer registers and is spilled to the
 * stack so that it can be reloaded into floating-point registers later.
 * The remaining arguments are fetched from the stack.
 */
177#ifndef __64BIT__
178#ifdef DOUBLE
/*
 * Only one word is written at +24.  The word at +28 is not stored here.
 * NOTE(review): presumably the caller already placed the second half of
 * ALPHA_I there - confirm against the calling convention.
 */
179	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
180	st	%i4, [%sp + STACK_START + 20]
181	st	%i5, [%sp + STACK_START + 24]   /* ALPHA_I */
182
183	ld	[%sp + STACK_START + 32], A
184	ld	[%sp + STACK_START + 36], LDA
185	ld	[%sp + STACK_START + 40], X
186	ld	[%sp + STACK_START + 44], INCX
187	ld	[%sp + STACK_START + 48], Y
188	ld	[%sp + STACK_START + 52], INCY
189	ld	[%sp + STACK_START + 56], BUFFER
190#else
/* Non-DOUBLE: A stays in %i5, everything from LDA onwards is on the stack. */
191	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
192	st	%i4, [%sp + STACK_START + 20]   /* ALPHA_I */
193
194	ld	[%sp + STACK_START + 28], LDA
195	ld	[%sp + STACK_START + 32], X
196	ld	[%sp + STACK_START + 36], INCX
197	ld	[%sp + STACK_START + 40], Y
198	ld	[%sp + STACK_START + 44], INCY
199	ld	[%sp + STACK_START + 48], BUFFER
200#endif
201#else
/*
 * 64-bit build: A stays in %i5, LDA..BUFFER come from the stack, and alpha
 * arrives in floating-point registers (%f6 / %f8 for DOUBLE, %f7 / %f9
 * otherwise) and is spilled to the STACK_ALPHA_R / STACK_ALPHA_I slots.
 */
202	ldx	[%sp + STACK_START + 56], LDA
203	ldx	[%sp + STACK_START + 64], X
204	ldx	[%sp + STACK_START + 72], INCX
205	ldx	[%sp + STACK_START + 80], Y
206	ldx	[%sp + STACK_START + 88], INCY
207	ldx	[%sp + STACK_START + 96], BUFFER
208
209#ifdef DOUBLE
210	std	%f6, STACK_ALPHA_R
211	std	%f8, STACK_ALPHA_I
212#else
213	st	%f7, STACK_ALPHA_R
214	st	%f9, STACK_ALPHA_I
215#endif
216#endif
217
/*
 * Shift LDA, INCX and INCY left by ZBASE_SHIFT.  ZBASE_SHIFT is defined
 * outside this file - presumably this turns element counts into byte strides.
 * The INCX and INCY shifts sit in branch delay slots, so they execute
 * whether or not the early exit is taken.
 */
218	sll	LDA, ZBASE_SHIFT, LDA
219
/* Early exit when M <= 0. */
220	cmp	M, 0
221	ble	%icc, .LL999
222	sll	INCX, ZBASE_SHIFT, INCX
223
/* Early exit when N <= 0. */
224	cmp	N, 0
225	ble	%icc, .LL999
226	sll	INCY, ZBASE_SHIFT, INCY
227
/*
 * If the scaled INCY equals 2 * SIZE, accumulate directly into Y:
 * the delay slot sets YY = Y and the BUFFER clear is skipped.
 */
228	cmp	INCY, 2 * SIZE
229	be	%icc, .LL20
230	mov	Y, YY
231
/*
 * FCLR is defined outside this file.  NOTE(review): presumably it zeroes
 * FZERO (%f50 in the DOUBLE build, %f25 otherwise) - confirm in common.h.
 */
232#ifdef DOUBLE
233	FCLR(19)
234#else
235	FCLR(25)
236#endif
237
/*
 * J = (M + 3) >> 2 passes of the clear loop, each writing 8 SIZE-sized
 * slots.  Both YY and Y1 are pointed at BUFFER.
 */
238	add	M, 3, J
239	sra	J, 2, J
240	mov	BUFFER, YY
241	mov	BUFFER, Y1
242
/* Store FZERO into 8 consecutive slots per pass.  Y1 advances in the delay slot. */
243.LL01:
244	STF	FZERO, [Y1 +  0 * SIZE]
245	nop
246	STF	FZERO, [Y1 +  1 * SIZE]
247	STF	FZERO, [Y1 +  2 * SIZE]
248	STF	FZERO, [Y1 +  3 * SIZE]
249	STF	FZERO, [Y1 +  4 * SIZE]
250	nop
251	STF	FZERO, [Y1 +  5 * SIZE]
252	deccc	J
253	STF	FZERO, [Y1 +  6 * SIZE]
254	nop
255	STF	FZERO, [Y1 +  7 * SIZE]
256	bg,pn	%icc, .LL01
257	add	Y1, 8 * SIZE, Y1
258
/*
 * Outer loop over pairs of columns: J = N >> 1 passes.
 * With fewer than two columns, go to the single-column code at .LL30.
 */
259.LL20:
260	sra	N, 1, J
261	cmp	J, 0
262	ble,pn	%icc, .LL30
263	nop
264
/*
 * Start of one column-pair pass.
 *   Y1 restarts at the beginning of the accumulation target (YY).
 *   A1 = A, A2 = A + LDA, and A is advanced by 2 * LDA for the next pass.
 * Alpha is reloaded from the stack because its registers are shared with
 * a15 / a16, which the matrix loads below overwrite.
 */
265.LL21:
266	mov	YY, Y1
267	mov	A,  A1
268	LDF	STACK_ALPHA_R, ALPHA_R
269	LDF	STACK_ALPHA_I, ALPHA_I
270
271	add	A,  LDA, A2
272	add	A2, LDA, A
273
/* Load two x elements: (x1, x2) then (x3, x4).  X advances by INCX after each. */
274	LDF	[X + 0 * SIZE], x1
275	LDF	[X + 1 * SIZE], x2
276	add	X, INCX, X
277	LDF	[X + 0 * SIZE], x3
278	LDF	[X + 1 * SIZE], x4
279	add	X, INCX, X
280
/* Partial products of alpha * x.  a1..a8 serve as scratch here. */
281	FMUL	ALPHA_R, x1, a1
282	FMUL	ALPHA_I, x2, a4
283	FMUL	ALPHA_I, x1, a2
284	FMUL	ALPHA_R, x2, a3
285
286	FMUL	ALPHA_R, x3, a5
287	FMUL	ALPHA_I, x4, a8
288	FMUL	ALPHA_I, x3, a6
289	FMUL	ALPHA_R, x4, a7
290
/*
 * Combine into the scaled x values, overwriting x1..x4.
 *   without XCONJ: x1 = ar*x1 - ai*x2,  x2 = ai*x1 + ar*x2
 *   with XCONJ:    x1 = ar*x1 + ai*x2,  x2 = ai*x1 - ar*x2
 * (ar = ALPHA_R, ai = ALPHA_I.  x3 / x4 follow the same pattern.)
 */
291#ifndef XCONJ
292	FSUB	a1, a4, x1
293	FADD	a2, a3, x2
294	FSUB	a5, a8, x3
295	FADD	a6, a7, x4
296#else
297	FADD	a1, a4, x1
298	FSUB	a2, a3, x2
299	FADD	a5, a8, x3
300	FSUB	a6, a7, x4
301#endif
302
/*
 * Two-column update, 4 rows (8 SIZE-sized values) per block.
 * I = M >> 2 blocks.  With none, go to the remainder code at .LL27.
 *
 * Per matrix element (p at the even offset, q at the odd offset) and the
 * matching scaled x pair (xr, xi) the update is
 *     y_even = y_even + p*xr   then  FSUBX  q*xi
 *     y_odd  = y_odd  + p*xi   then  FADDX  q*xr
 * applied once with (x1, x2) for the A1 column and once with (x3, x4)
 * for the A2 column.
 */
303	sra	M, 2, I
304	cmp	I, 0
305	ble,pn	%icc, .LL27
306	nop
307
/* Preload the first block: a1..a4 and a9..a12 from A1, a5..a8 and a13..a16 from A2. */
308	LDF	[A1 + 0 * SIZE], a1
309	LDF	[A1 + 1 * SIZE], a2
310	LDF	[A1 + 2 * SIZE], a3
311	LDF	[A1 + 3 * SIZE], a4
312
313	LDF	[A1 + 4 * SIZE], a9
314	LDF	[A1 + 5 * SIZE], a10
315	LDF	[A1 + 6 * SIZE], a11
316	LDF	[A1 + 7 * SIZE], a12
317
318	LDF	[A2 + 0 * SIZE], a5
319	LDF	[A2 + 1 * SIZE], a6
320	LDF	[A2 + 2 * SIZE], a7
321	LDF	[A2 + 3 * SIZE], a8
322
323	LDF	[A2 + 4 * SIZE], a13
324	LDF	[A2 + 5 * SIZE], a14
325	LDF	[A2 + 6 * SIZE], a15
326	LDF	[A2 + 7 * SIZE], a16
327
328	LDF	[Y1 + 0 * SIZE], y1
329	LDF	[Y1 + 1 * SIZE], y2
330	LDF	[Y1 + 2 * SIZE], y3
331
332
/*
 * First four products.  If this is the only block, finish it at .LL26.
 * NOTE(review): the loads of [A1 + 8 * SIZE] and [A1 + 10 * SIZE] (the
 * second one is in the delay slot) are issued even on the single-block
 * path.  On that path the loaded values are not consumed before a1 / a3
 * are overwritten.
 */
333	FMUL	a1, x1, t1
334	deccc	I
335	FMUL	a1, x2, t2
336	LDF	[A1 +  8 * SIZE], a1
337
338	FMUL	a3, x1, t3
339	FMUL	a3, x2, t4
340	ble,pn	%icc, .LL26
341	LDF	[A1 + 10 * SIZE], a3
342
/*
 * Pipeline fill: process the first half (y1..y4) of the current block.
 * Each FADD / FSUBX / FADDX consumes the t1..t4 produced by the FMUL group
 * before it.  Loads of y4..y8 and of the next block's matrix values
 * (offsets 8..14) are interleaved with the arithmetic.
 */
343	FADD	y1, t1, y1
344	LDF	[Y1 + 3 * SIZE], y4
345	FMUL	a2, x2, t1
346
347	FADD	y2, t2, y2
348	FMUL	a2, x1, t2
349	LDF	[A1 +  9 * SIZE], a2
350
351	FADD	y3, t3, y3
352	LDF	[Y1 + 4 * SIZE], y5
353	FMUL	a4, x2, t3
354
355	FADD	y4, t4, y4
356	FMUL	a4, x1, t4
357	LDF	[A1 + 11 * SIZE], a4
358
/* Odd-offset terms of the A1 column, then start on the A2 column. */
359	FSUBX	y1, t1, y1
360	LDF	[Y1 + 5 * SIZE], y6
361	FMUL	a5, x3, t1
362
363	FADDX	y2, t2, y2
364	FMUL	a5, x4, t2
365	LDF	[A2 +  8 * SIZE], a5
366
367	FSUBX	y3, t3, y3
368	LDF	[Y1 + 6 * SIZE], y7
369	FMUL	a7, x3, t3
370
371	FADDX	y4, t4, y4
372	FMUL	a7, x4, t4
373	LDF	[A2 + 10 * SIZE], a7
374
375	FADD	y1, t1, y1
376	LDF	[Y1 + 7 * SIZE], y8
377	FMUL	a6, x4, t1
378
379	FADD	y2, t2, y2
380	FMUL	a6, x3, t2
381	LDF	[A2 +  9 * SIZE], a6
382
383	FADD	y3, t3, y3
384	FMUL	a8, x4, t3
385
386	FADD	y4, t4, y4
387	FMUL	a8, x3, t4
388	LDF	[A2 + 11 * SIZE], a8
389
/*
 * y1..y4 receive their last terms here while the first products for the
 * second half (y5..y8) are started with a9 / a11.
 */
390	FSUBX	y1, t1, y1
391	FMUL	a9,  x1, t1
392
393	FADDX	y2, t2, y2
394	FMUL	a9,  x2, t2
395	LDF	[A1 + 12 * SIZE], a9
396
397	FSUBX	y3, t3, y3
398	deccc	I
399	FMUL	a11, x1, t3
400
/* If no further block follows the next one, drain at .LL23.  The a11 load is in the delay slot. */
401	FADDX	y4, t4, y4
402	FMUL	a11, x2, t4
403	ble,pn	%icc, .LL23
404	LDF	[A1 + 14 * SIZE], a11
405
/*
 * Steady-state loop of the two-column update.
 * Each pass finishes the second half (y5..y8) of the current block and
 * the first half (y1..y4) of the next one.  It stores the eight finished
 * values at [Y1 + 0..7 * SIZE] and then advances A1, A2 and Y1 by 8 * SIZE.
 * Matrix loads run one block ahead of the arithmetic.
 *
 * The counter is decremented by the deccc in the middle of the body.  The
 * bg at the bottom uses those condition codes.  The add instructions in
 * between are the non-cc form, so the codes stay intact.
 */
406.LL22:
/* Second half of the current block: A1 column terms. */
407	FADD	y5, t1, y5
408	prefetch  [A1 +  PREFETCHSIZE * SIZE], 1
409	FMUL	a10, x2, t1
410	LDF	[Y1 + 7 * SIZE], y8
411
412	FADD	y6, t2, y6
413	FMUL	a10, x1, t2
414	LDF	[A1 + 13 * SIZE], a10
415
/* Stores of the finished y1..y4 are spread through the following groups. */
416	FADD	y7, t3, y7
417	FMUL	a12, x2, t3
418	STF	y1, [Y1 +  0 * SIZE]
419
420	FADD	y8, t4, y8
421	FMUL	a12, x1, t4
422	LDF	[A1 + 15 * SIZE], a12
423
424	FSUBX	y5, t1, y5
425	FMUL	a13, x3, t1
426	STF	y2, [Y1 +  1 * SIZE]
427
428	FADDX	y6, t2, y6
429	FMUL	a13, x4, t2
430	LDF	[A2 + 12 * SIZE], a13
431
432	FSUBX	y7, t3, y7
433	FMUL	a15, x3, t3
434	STF	y3, [Y1 +  2 * SIZE]
435
436	FADDX	y8, t4, y8
437	FMUL	a15, x4, t4
438	LDF	[A2 + 14 * SIZE], a15
439
/* Second half: A2 column terms. */
440	FADD	y5, t1, y5
441	FMUL	a14, x4, t1
442	STF	y4, [Y1 +  3 * SIZE]
443
444	FADD	y6, t2, y6
445	FMUL	a14, x3, t2
446	LDF	[A2 + 13 * SIZE], a14
447
/* y1..y3 are reloaded for the next block (offsets 8..10). */
448	FADD	y7, t3, y7
449	FMUL	a16, x4, t3
450	LDF	[Y1 +  8 * SIZE], y1
451
452	FADD	y8, t4, y8
453	FMUL	a16, x3, t4
454	LDF	[A2 + 15 * SIZE], a16
455
/* Last terms of y5..y8, and the first products of the next block. */
456	FSUBX	y5, t1, y5
457	FMUL	a1, x1, t1
458	LDF	[Y1 +  9 * SIZE], y2
459
460	FADDX	y6, t2, y6
461	FMUL	a1, x2, t2
462	LDF	[A1 + 16 * SIZE], a1
463
464	FSUBX	y7, t3, y7
465	FMUL	a3, x1, t3
466	LDF	[Y1 + 10 * SIZE], y3
467
468	FADDX	y8, t4, y8
469	FMUL	a3, x2, t4
470	LDF	[A1 + 18 * SIZE], a3
471
/* First half of the next block.  The finished y5..y8 are stored along the way. */
472	FADD	y1, t1, y1
473	prefetch  [A2 +  PREFETCHSIZE * SIZE], 1
474	FMUL	a2, x2, t1
475	LDF	[Y1 + 11 * SIZE], y4
476
477	FADD	y2, t2, y2
478	FMUL	a2, x1, t2
479	LDF	[A1 + 17 * SIZE], a2
480
481	FADD	y3, t3, y3
482	FMUL	a4, x2, t3
483	STF	y5, [Y1 +  4 * SIZE]
484
485	FADD	y4, t4, y4
486	FMUL	a4, x1, t4
487	LDF	[A1 + 19 * SIZE], a4
488
489	FSUBX	y1, t1, y1
490	FMUL	a5, x3, t1
491	STF	y6, [Y1 +  5 * SIZE]
492
493	FADDX	y2, t2, y2
494	FMUL	a5, x4, t2
495	LDF	[A2 + 16 * SIZE], a5
496
497	FSUBX	y3, t3, y3
498	FMUL	a7, x3, t3
499	STF	y7, [Y1 +  6 * SIZE]
500
/* Loop counter update.  The result is tested by the bg at the bottom. */
501	FADDX	y4, t4, y4
502	deccc	I
503	FMUL	a7, x4, t4
504	LDF	[A2 + 18 * SIZE], a7
505
506	FADD	y1, t1, y1
507	FMUL	a6, x4, t1
508	STF	y8, [Y1 +  7 * SIZE]
509
510	FADD	y2, t2, y2
511	FMUL	a6, x3, t2
512	LDF	[A2 + 17 * SIZE], a6
513
/*
 * Pointer advance.  Loads placed after each add use offsets relative to
 * the already advanced pointer.
 */
514	FADD	y3, t3, y3
515	add	A1, 8 * SIZE, A1
516	FMUL	a8, x4, t3
517	LDF	[Y1 + 12 * SIZE], y5
518
519	FADD	y4, t4, y4
520	FMUL	a8, x3, t4
521	LDF	[A2 + 19 * SIZE], a8
522
523	FSUBX	y1, t1, y1
524	add	A2, 8 * SIZE, A2
525	FMUL	a9,  x1, t1
526	LDF	[Y1 + 13 * SIZE], y6
527
528	FADDX	y2, t2, y2
529	add	Y1, 8 * SIZE, Y1
530	FMUL	a9,  x2, t2
531	LDF	[A1 + 12 * SIZE], a9
532
533	FSUBX	y3, t3, y3
534	FMUL	a11, x1, t3
535	LDF	[Y1 +  6 * SIZE], y7
536
/* Loop back while blocks remain.  The a11 load is in the delay slot. */
537	FADDX	y4, t4, y4
538	FMUL	a11, x2, t4
539	bg,pn	%icc, .LL22
540	LDF	[A1 + 14 * SIZE], a11
541
/*
 * Pipeline drain of the two-column update.
 * It performs the same work as the first half of the .LL22 body: finish
 * y5..y8 of the current block, store y1..y4, and start the first products
 * of the last block.  It then advances A1, A2 and Y1 by 8 * SIZE and stores
 * y5..y8 at negative offsets from the advanced Y1.  Control falls through
 * to .LL26, which completes the last block.
 */
542.LL23:
543	FADD	y5, t1, y5
544	FMUL	a10, x2, t1
545	LDF	[Y1 + 7 * SIZE], y8
546
547	FADD	y6, t2, y6
548	FMUL	a10, x1, t2
549	LDF	[A1 + 13 * SIZE], a10
550
551	FADD	y7, t3, y7
552	FMUL	a12, x2, t3
553	STF	y1, [Y1 +  0 * SIZE]
554
555	FADD	y8, t4, y8
556	FMUL	a12, x1, t4
557	LDF	[A1 + 15 * SIZE], a12
558
559	FSUBX	y5, t1, y5
560	FMUL	a13, x3, t1
561	STF	y2, [Y1 +  1 * SIZE]
562
563	FADDX	y6, t2, y6
564	FMUL	a13, x4, t2
565	LDF	[A2 + 12 * SIZE], a13
566
567	FSUBX	y7, t3, y7
568	FMUL	a15, x3, t3
569	STF	y3, [Y1 +  2 * SIZE]
570	FADDX	y8, t4, y8
571	FMUL	a15, x4, t4
572	LDF	[A2 + 14 * SIZE], a15
573
574	FADD	y5, t1, y5
575	FMUL	a14, x4, t1
576	STF	y4, [Y1 +  3 * SIZE]
577	FADD	y6, t2, y6
578	FMUL	a14, x3, t2
579	LDF	[A2 + 13 * SIZE], a14
580
/* y1..y3 of the last block are loaded before Y1 is advanced (offsets 8..10). */
581	FADD	y7, t3, y7
582	FMUL	a16, x4, t3
583	LDF	[Y1 +  8 * SIZE], y1
584	FADD	y8, t4, y8
585	FMUL	a16, x3, t4
586	LDF	[A2 + 15 * SIZE], a16
587
/*
 * Pointer advance interleaved with the first products of the last block.
 * NOTE(review): the loads of [A1 + 8 * SIZE] and [A1 + 10 * SIZE] below
 * are issued after A1 has been advanced.  Their values are not consumed
 * by .LL26 and are overwritten by later loads.
 */
588	FSUBX	y5, t1, y5
589	add	A1, 8 * SIZE, A1
590	FMUL	a1, x1, t1
591	LDF	[Y1 +  9 * SIZE], y2
592
593	FADDX	y6, t2, y6
594	add	A2, 8 * SIZE, A2
595	FMUL	a1, x2, t2
596	LDF	[A1 +  8 * SIZE], a1
597
598	FSUBX	y7, t3, y7
599	FMUL	a3, x1, t3
600	LDF	[Y1 + 10 * SIZE], y3
601
602	FADDX	y8, t4, y8
603	add	Y1, 8 * SIZE, Y1
604	FMUL	a3, x2, t4
605	LDF	[A1 + 10 * SIZE], a3
606
/* Y1 already points at the last block, so the previous block's second half sits at -4..-1. */
607	STF	y5, [Y1 -  4 * SIZE]
608	STF	y6, [Y1 -  3 * SIZE]
609	STF	y7, [Y1 -  2 * SIZE]
610	STF	y8, [Y1 -  1 * SIZE]
611
/*
 * Last 4-row block of the two-column update.
 * Reached either straight from the preload (single block) or by falling
 * through from .LL23.  On entry t1..t4 hold the first four products and
 * y1..y3 are loaded.  y4..y8 are loaded here.  No matrix values are loaded
 * in this section: a2, a4..a16 were fetched earlier.
 */
612.LL26:
/* First half (y1..y4): A1 column terms. */
613	FADD	y1, t1, y1
614	LDF	[Y1 +  3 * SIZE], y4
615	FMUL	a2, x2, t1
616	FADD	y2, t2, y2
617	FMUL	a2, x1, t2
618
619	FADD	y3, t3, y3
620	LDF	[Y1 +  4 * SIZE], y5
621	FMUL	a4, x2, t3
622	FADD	y4, t4, y4
623	FMUL	a4, x1, t4
624
625	FSUBX	y1, t1, y1
626	LDF	[Y1 +  5 * SIZE], y6
627	FMUL	a5, x3, t1
628	FADDX	y2, t2, y2
629	FMUL	a5, x4, t2
630
/*
 * t3 and t4 are consumed by the FSUBX / FADDX before they are rewritten
 * with the a7 products, so the order of the two a7 multiplies relative to
 * the surrounding adds does not affect the values.
 */
631	FSUBX	y3, t3, y3
632	LDF	[Y1 +  6 * SIZE], y7
633	FADDX	y4, t4, y4
634	FMUL	a7, x4, t4
635
/* First half: A2 column terms. */
636	FADD	y1, t1, y1
637	LDF	[Y1 +  7 * SIZE], y8
638	FMUL	a7, x3, t3
639	FMUL	a6, x4, t1
640	FADD	y2, t2, y2
641	FMUL	a6, x3, t2
642
643	FADD	y3, t3, y3
644	FMUL	a8, x4, t3
645	FADD	y4, t4, y4
646	FMUL	a8, x3, t4
647
/* Last terms of y1..y4, and the first products of the second half. */
648	FSUBX	y1, t1, y1
649	FMUL	a9,  x1, t1
650	FADDX	y2, t2, y2
651	FMUL	a9,  x2, t2
652
653	FSUBX	y3, t3, y3
654	FMUL	a11, x1, t3
655	FADDX	y4, t4, y4
656	FMUL	a11, x2, t4
657
/* Second half (y5..y8): A1 column terms. */
658	FADD	y5, t1, y5
659	FMUL	a10, x2, t1
660	FADD	y6, t2, y6
661	FMUL	a10, x1, t2
662
663	FADD	y7, t3, y7
664	FMUL	a12, x2, t3
665	FADD	y8, t4, y8
666	FMUL	a12, x1, t4
667
668	FSUBX	y5, t1, y5
669	FMUL	a13, x3, t1
670	FADDX	y6, t2, y6
671	FMUL	a13, x4, t2
672
673	FSUBX	y7, t3, y7
674	FMUL	a15, x3, t3
675	FADDX	y8, t4, y8
676	FMUL	a15, x4, t4
677
/* Second half: A2 column terms. */
678	FADD	y5, t1, y5
679	FMUL	a14, x4, t1
680	FADD	y6, t2, y6
681	FMUL	a14, x3, t2
682
683	FADD	y7, t3, y7
684	FMUL	a16, x4, t3
685	FADD	y8, t4, y8
686	FMUL	a16, x3, t4
687
/* Store y1..y4 while the last terms of y5..y8 are applied. */
688	STF	y1, [Y1 + 0 * SIZE]
689	FSUBX	y5, t1, y5
690	STF	y2, [Y1 + 1 * SIZE]
691	FADDX	y6, t2, y6
692	STF	y3, [Y1 + 2 * SIZE]
693	FSUBX	y7, t3, y7
694	STF	y4, [Y1 + 3 * SIZE]
695	FADDX	y8, t4, y8
696
/* Store y5..y8 and step A1, A2 and Y1 past this block. */
697	STF	y5, [Y1 + 4 * SIZE]
698	add	A1, 8 * SIZE, A1
699	STF	y6, [Y1 + 5 * SIZE]
700	add	A2, 8 * SIZE, A2
701	STF	y7, [Y1 + 6 * SIZE]
702	STF	y8, [Y1 + 7 * SIZE]
703	add	Y1, 8 * SIZE, Y1
704
/*
 * Two-row remainder of the two-column update, taken when bit 1 of M is set.
 * andcc leaves a non-negative result, so ble branches exactly when the
 * masked value is zero.
 */
705.LL27:
706	andcc	M, 2, I
707	ble,pn	%icc, .LL28
708	nop
709
/* Two elements of the A1 column. */
710	LDF	[A1 + 0 * SIZE], a1
711	LDF	[A1 + 1 * SIZE], a2
712	LDF	[A1 + 2 * SIZE], a3
713	LDF	[A1 + 3 * SIZE], a4
714
715	LDF	[Y1 + 0 * SIZE], y1
716	LDF	[Y1 + 1 * SIZE], y2
717	LDF	[Y1 + 2 * SIZE], y3
718	LDF	[Y1 + 3 * SIZE], y4
719
/* First products.  The A2 column values (a5..a8) are loaded alongside. */
720	FMUL	a1, x1, t1
721	LDF	[A2 + 0 * SIZE], a5
722	FMUL	a1, x2, t2
723	LDF	[A2 + 1 * SIZE], a6
724	FMUL	a3, x1, t3
725	LDF	[A2 + 2 * SIZE], a7
726	FMUL	a3, x2, t4
727	LDF	[A2 + 3 * SIZE], a8
728
/* A1 column: even-offset terms, then odd-offset terms with FSUBX / FADDX. */
729	FADD	y1, t1, y1
730	FMUL	a2, x2, t1
731	FADD	y2, t2, y2
732	FMUL	a2, x1, t2
733
734	FADD	y3, t3, y3
735	FMUL	a4, x2, t3
736	FADD	y4, t4, y4
737	FMUL	a4, x1, t4
738
739	FSUBX	y1, t1, y1
740	FMUL	a5, x3, t1
741	FADDX	y2, t2, y2
742	FMUL	a5, x4, t2
743
744	FSUBX	y3, t3, y3
745	FMUL	a7, x3, t3
746	FADDX	y4, t4, y4
747	FMUL	a7, x4, t4
748
/* A2 column with (x3, x4). */
749	FADD	y1, t1, y1
750	FMUL	a6, x4, t1
751	FADD	y2, t2, y2
752	FMUL	a6, x3, t2
753
754	FADD	y3, t3, y3
755	FMUL	a8, x4, t3
756	FADD	y4, t4, y4
757	FMUL	a8, x3, t4
758
759	FSUBX	y1, t1, y1
760	FADDX	y2, t2, y2
761	FSUBX	y3, t3, y3
762	FADDX	y4, t4, y4
763
/* Store the two results and step A1, A2 and Y1 by 4 * SIZE. */
764	STF	y1, [Y1 + 0 * SIZE]
765	add	A1, 4 * SIZE, A1
766	STF	y2, [Y1 + 1 * SIZE]
767	add	A2, 4 * SIZE, A2
768	STF	y3, [Y1 + 2 * SIZE]
769	nop
770	STF	y4, [Y1 + 3 * SIZE]
771	add	Y1, 4 * SIZE, Y1
772
/*
 * One-row remainder of the two-column update, taken when bit 0 of M is set.
 * Here a1 / a2 hold the element from the A1 column and a3 / a4 the element
 * from the A2 column.  This register use differs from the blocked code above.
 */
773.LL28:
774	andcc	M, 1, I
775	ble,pn	%icc, .LL29
776	nop
777
778	LDF	[A1 + 0 * SIZE], a1
779	LDF	[A1 + 1 * SIZE], a2
780	LDF	[A2 + 0 * SIZE], a3
781	LDF	[A2 + 1 * SIZE], a4
782
783	LDF	[Y1 + 0 * SIZE], y1
784	LDF	[Y1 + 1 * SIZE], y2
785
/* All four products of the A1 column element with (x1, x2). */
786	FMUL	a1, x1, t1
787	FMUL	a1, x2, t2
788	FMUL	a2, x2, t3
789	FMUL	a2, x1, t4
790
/* Accumulate them while the A2 column products with (x3, x4) are formed. */
791	FADD	y1, t1, y1
792	FMUL	a3, x3, t1
793	FADD	y2, t2, y2
794	FMUL	a3, x4, t2
795
796	FSUBX	y1, t3, y1
797	FMUL	a4, x4, t3
798	FADDX	y2, t4, y2
799	FMUL	a4, x3, t4
800
801	FADD	y1, t1, y1
802	FADD	y2, t2, y2
803	FSUBX	y1, t3, y1
804	FADDX	y2, t4, y2
805
/* No pointer update is needed: .LL21 and .LL31 reset A1 and Y1. */
806	STF	y1, [Y1 + 0 * SIZE]
807	STF	y2, [Y1 + 1 * SIZE]
808
/* End of one column-pair pass.  Repeat while pairs remain. */
809.LL29:
810	deccc	J
811	bg	%icc, .LL21
812	nop
813
814
/*
 * Single leftover column, processed when bit 0 of N is set.
 * Otherwise go straight to the copy-back / exit code at .LL990.
 */
815.LL30:
816	andcc	N, 1, J
817	ble,pn	%icc, .LL990
818	nop
819
/* Y1 restarts at the beginning of the accumulation target.  A1 = current A. */
820.LL31:
821	mov	YY, Y1
822	mov	A,  A1
823
/* Alpha is reloaded because a15 / a16 share its registers. */
824	LDF	STACK_ALPHA_R, ALPHA_R
825	LDF	STACK_ALPHA_I, ALPHA_I
826
/* X is not advanced afterwards.  No further x element is read after this one. */
827	LDF	[X + 0 * SIZE], x1
828	LDF	[X + 1 * SIZE], x2
829
/* Partial products of alpha * x.  a1..a4 serve as scratch here. */
830	FMUL	ALPHA_R, x1, a1		/* AC */
831	FMUL	ALPHA_I, x1, a2		/* AD */
832	FMUL	ALPHA_R, x2, a3		/* BC */
833	FMUL	ALPHA_I, x2, a4		/* BD */
834
/*
 * without XCONJ: x1 = AC - BD,  x2 = AD + BC
 * with XCONJ:    x1 = AC + BD,  x2 = AD - BC
 */
835#ifndef XCONJ
836	FSUB	a1, a4, x1
837	FADD	a2, a3, x2
838#else
839	FADD	a1, a4, x1
840	FSUB	a2, a3, x2
841#endif
842
/* I = M >> 2 blocks of 4 rows.  With none, go to the remainder code at .LL37. */
843	sra	M, 2, I
844	cmp	I, 0
845	ble,pn	%icc, .LL37
846	nop
847
/* Preload the first block: a1..a4 and a9..a12 from A1, y1..y8 from Y1. */
848	LDF	[A1 + 0 * SIZE], a1
849	LDF	[A1 + 1 * SIZE], a2
850	LDF	[A1 + 2 * SIZE], a3
851	LDF	[A1 + 3 * SIZE], a4
852
853	LDF	[A1 + 4 * SIZE], a9
854	LDF	[A1 + 5 * SIZE], a10
855	LDF	[A1 + 6 * SIZE], a11
856	LDF	[A1 + 7 * SIZE], a12
857
858	LDF	[Y1 + 0 * SIZE], y1
859	LDF	[Y1 + 1 * SIZE], y2
860	LDF	[Y1 + 2 * SIZE], y3
861	LDF	[Y1 + 3 * SIZE], y4
862
863	LDF	[Y1 + 4 * SIZE], y5
864	LDF	[Y1 + 5 * SIZE], y6
865	LDF	[Y1 + 6 * SIZE], y7
866	LDF	[Y1 + 7 * SIZE], y8
867
/*
 * First four products.  If this is the only block, finish it at .LL33.
 * NOTE(review): the loads of [A1 + 8 * SIZE] and [A1 + 10 * SIZE] (the
 * second one is in the delay slot) are issued even on the single-block
 * path, where .LL33 does not consume them.
 */
868	FMUL	a1, x1, t1
869	deccc	I
870	FMUL	a1, x2, t2
871	LDF	[A1 +  8 * SIZE], a1
872	FMUL	a3, x1, t3
873	FMUL	a3, x2, t4
874	ble,pn	%icc, .LL33
875	LDF	[A1 + 10 * SIZE], a3
876
/*
 * Steady-state loop of the single-column update: one 4-row block per pass.
 * On entry t1..t4 hold the first four products of the block and y1..y8
 * are loaded.  On exit the same holds for the next block.
 * The matrix values of the next block are loaded as each register is freed.
 */
877.LL32:
/* First half (y1..y4): even-offset terms, then odd-offset terms. */
878	FADD	y1, t1, y1
879	prefetch  [A1 +  PREFETCHSIZE * SIZE], 1
880	FMUL	a2, x2, t1
881	FADD	y2, t2, y2
882	FMUL	a2, x1, t2
883	LDF	[A1 +  9 * SIZE], a2
884
885	FADD	y3, t3, y3
886	FMUL	a4, x2, t3
887	FADD	y4, t4, y4
888	FMUL	a4, x1, t4
889	LDF	[A1 + 11 * SIZE], a4
890
891	FSUBX	y1, t1, y1
892	FMUL	a9,  x1, t1
893	FADDX	y2, t2, y2
894	FMUL	a9,  x2, t2
895	LDF	[A1 + 12 * SIZE], a9
896
897	FSUBX	y3, t3, y3
898	FMUL	a11, x1, t3
899	FADDX	y4, t4, y4
900	FMUL	a11, x2, t4
901	LDF	[A1 + 14 * SIZE], a11
902
903	STF	y1, [Y1 + 0 * SIZE]
904	STF	y2, [Y1 + 1 * SIZE]
905	STF	y3, [Y1 + 2 * SIZE]
906	STF	y4, [Y1 + 3 * SIZE]
907
/* Second half (y5..y8).  y1..y4 are reloaded for the next block (offsets 8..11). */
908	FADD	y5, t1, y5
909	FMUL	a10, x2, t1
910	LDF	[Y1 +  8 * SIZE], y1
911	FADD	y6, t2, y6
912	FMUL	a10, x1, t2
913	LDF	[A1 + 13 * SIZE], a10
914
/* Loop counter update.  Tested by the bg at the bottom; the add instructions in between are the non-cc form. */
915	FADD	y7, t3, y7
916	deccc	I
917	FMUL	a12, x2, t3
918	LDF	[Y1 +  9 * SIZE], y2
919	FADD	y8, t4, y8
920	FMUL	a12, x1, t4
921	LDF	[A1 + 15 * SIZE], a12
922
/* A1 advances here.  The loads that follow use offsets relative to the new A1. */
923	FSUBX	y5, t1, y5
924	add	A1, 8 * SIZE, A1
925	FMUL	a1, x1, t1
926	LDF	[Y1 + 10 * SIZE], y3
927	FADDX	y6, t2, y6
928	FMUL	a1, x2, t2
929	LDF	[A1 +  8 * SIZE], a1
930
931	FSUBX	y7, t3, y7
932	FMUL	a3, x1, t3
933	LDF	[Y1 + 11 * SIZE], y4
934	FADDX	y8, t4, y8
935	FMUL	a3, x2, t4
936	LDF	[A1 + 10 * SIZE], a3
937
938	STF	y5, [Y1 + 4 * SIZE]
939	STF	y6, [Y1 + 5 * SIZE]
940	STF	y7, [Y1 + 6 * SIZE]
941	STF	y8, [Y1 + 7 * SIZE]
942
/*
 * Reload y5..y8 for the next block.  y5..y7 use offsets 12..14 before Y1
 * is advanced.  y8 is loaded in the delay slot at offset 7 from the
 * advanced Y1.
 */
943	LDF	[Y1 + 12 * SIZE], y5
944	LDF	[Y1 + 13 * SIZE], y6
945	LDF	[Y1 + 14 * SIZE], y7
946	add	Y1, 8 * SIZE, Y1
947	bg,pn	%icc, .LL32
948	LDF	[Y1 +  7 * SIZE], y8
949
/*
 * Last 4-row block of the single-column update.
 * On entry t1..t4 hold the first four products and y1..y8 are loaded.
 * Nothing further is loaded in this section.
 */
950.LL33:
/* First half (y1..y4). */
951	FADD	y1, t1, y1
952	FMUL	a2, x2, t1
953	FADD	y2, t2, y2
954	FMUL	a2, x1, t2
955
956	FADD	y3, t3, y3
957	FMUL	a4, x2, t3
958	FADD	y4, t4, y4
959	FMUL	a4, x1, t4
960
961	FSUBX	y1, t1, y1
962	FMUL	a9,  x1, t1
963	FADDX	y2, t2, y2
964	FMUL	a9,  x2, t2
965
966	FSUBX	y3, t3, y3
967	FMUL	a11, x1, t3
968	FADDX	y4, t4, y4
969	FMUL	a11, x2, t4
970
/* Second half (y5..y8). */
971	FADD	y5, t1, y5
972	FMUL	a10, x2, t1
973	FADD	y6, t2, y6
974	FMUL	a10, x1, t2
975
976	FADD	y7, t3, y7
977	FMUL	a12, x2, t3
978	FADD	y8, t4, y8
979	FMUL	a12, x1, t4
980
981	FSUBX	y5, t1, y5
982	FADDX	y6, t2, y6
983	FSUBX	y7, t3, y7
984	FADDX	y8, t4, y8
985
/* Store all eight results and step past the block. */
986	STF	y1, [Y1 + 0 * SIZE]
987	STF	y2, [Y1 + 1 * SIZE]
988	STF	y3, [Y1 + 2 * SIZE]
989	STF	y4, [Y1 + 3 * SIZE]
990
991	STF	y5, [Y1 + 4 * SIZE]
992	STF	y6, [Y1 + 5 * SIZE]
993	STF	y7, [Y1 + 6 * SIZE]
994	STF	y8, [Y1 + 7 * SIZE]
995
996	add	A1, 8 * SIZE, A1
997	add	Y1, 8 * SIZE, Y1
998
999
/*
 * Two-row remainder of the single-column update, taken when bit 1 of M is set.
 * andcc leaves a non-negative result, so ble branches exactly when the
 * masked value is zero.
 */
1000.LL37:
1001	andcc	M, 2, I
1002	ble,pn	%icc, .LL38
1003	nop
1004
1005	LDF	[A1 + 0 * SIZE], a1
1006	LDF	[A1 + 1 * SIZE], a2
1007	LDF	[A1 + 2 * SIZE], a3
1008	LDF	[A1 + 3 * SIZE], a4
1009
/* y loads interleaved with the even-offset products. */
1010	LDF	[Y1 + 0 * SIZE], y1
1011	FMUL	a1, x1, t1
1012	LDF	[Y1 + 1 * SIZE], y2
1013	FMUL	a1, x2, t2
1014	LDF	[Y1 + 2 * SIZE], y3
1015	FMUL	a3, x1, t3
1016	LDF	[Y1 + 3 * SIZE], y4
1017	FMUL	a3, x2, t4
1018
1019	FADD	y1, t1, y1
1020	FMUL	a2, x2, t1
1021	FADD	y2, t2, y2
1022	FMUL	a2, x1, t2
1023	FADD	y3, t3, y3
1024	FMUL	a4, x2, t3
1025	FADD	y4, t4, y4
1026	FMUL	a4, x1, t4
1027
/* Odd-offset terms with the CONJ-dependent signs. */
1028	FSUBX	y1, t1, y1
1029	FADDX	y2, t2, y2
1030	FSUBX	y3, t3, y3
1031	FADDX	y4, t4, y4
1032
1033	STF	y1, [Y1 + 0 * SIZE]
1034	STF	y2, [Y1 + 1 * SIZE]
1035	STF	y3, [Y1 + 2 * SIZE]
1036	STF	y4, [Y1 + 3 * SIZE]
1037
1038	add	A1, 4 * SIZE, A1
1039	add	Y1, 4 * SIZE, Y1
1040
/* One-row remainder of the single-column update, taken when bit 0 of M is set. */
1041.LL38:
1042	andcc	M, 1, I
1043	ble,pn	%icc, .LL990
1044	nop
1045
1046	LDF	[A1 + 0 * SIZE], a1
1047	LDF	[A1 + 1 * SIZE], a2
1048	LDF	[Y1 + 0 * SIZE], y1
1049	LDF	[Y1 + 1 * SIZE], y2
1050
1051	FMUL	a1, x1, t1
1052	FMUL	a1, x2, t2
1053	FMUL	a2, x2, t3
1054	FMUL	a2, x1, t4
1055
1056	FADD	y1, t1, y1
1057	FADD	y2, t2, y2
1058	FSUBX	y1, t3, y1
1059	FADDX	y2, t4, y2
1060
1061	STF	y1, [Y1 + 0 * SIZE]
1062	STF	y2, [Y1 + 1 * SIZE]
1063
/*
 * Copy-back stage.
 * If the scaled INCY equals 2 * SIZE the sums were written straight into Y
 * and the routine is finished.  Otherwise BUFFER holds the sums and they
 * are added to the strided vector.  Y is the read pointer and Y1 (set to
 * the original Y in the delay slot) is the write pointer.  Both step by
 * INCY per element.
 */
1064.LL990:
1065	cmp	INCY, 2 * SIZE
1066	be	%icc, .LL999
1067	mov	Y, Y1
1068
/* I = M >> 2 groups of four elements.  With none, go to the remainders at .LL995. */
1069	sra	M, 2, I
1070	cmp	I, 0
1071	ble,pn	%icc, .LL995
1072	nop
1073
/* Four elements per pass: a1..a8 from BUFFER (contiguous), y1..y8 from Y (strided). */
1074.LL991:
1075	LDF	[BUFFER +  0 * SIZE], a1
1076	LDF	[BUFFER +  1 * SIZE], a2
1077	LDF	[Y + 0 * SIZE], y1
1078	LDF	[Y + 1 * SIZE], y2
1079	add	Y, INCY, Y
1080
1081	LDF	[BUFFER +  2 * SIZE], a3
1082	LDF	[BUFFER +  3 * SIZE], a4
1083	LDF	[Y + 0 * SIZE], y3
1084	LDF	[Y + 1 * SIZE], y4
1085	add	Y, INCY, Y
1086
1087	LDF	[BUFFER +  4 * SIZE], a5
1088	LDF	[BUFFER +  5 * SIZE], a6
1089	LDF	[Y + 0 * SIZE], y5
1090	LDF	[Y + 1 * SIZE], y6
1091	add	Y, INCY, Y
1092
1093	LDF	[BUFFER +  6 * SIZE], a7
1094	LDF	[BUFFER +  7 * SIZE], a8
1095	LDF	[Y + 0 * SIZE], y7
1096	LDF	[Y + 1 * SIZE], y8
1097	add	Y, INCY, Y
1098
/* y = y + buffered sum, component by component. */
1099	FADD	y1, a1, y1
1100	FADD	y2, a2, y2
1101	FADD	y3, a3, y3
1102	FADD	y4, a4, y4
1103	FADD	y5, a5, y5
1104	FADD	y6, a6, y6
1105	FADD	y7, a7, y7
1106	FADD	y8, a8, y8
1107
/* Write the results back through Y1, stepping by INCY. */
1108	STF	y1, [Y1 + 0 * SIZE]
1109	STF	y2, [Y1 + 1 * SIZE]
1110	add	Y1, INCY, Y1
1111	STF	y3, [Y1 + 0 * SIZE]
1112	STF	y4, [Y1 + 1 * SIZE]
1113	add	Y1, INCY, Y1
1114	STF	y5, [Y1 + 0 * SIZE]
1115	STF	y6, [Y1 + 1 * SIZE]
1116	add	Y1, INCY, Y1
1117	STF	y7, [Y1 + 0 * SIZE]
1118	STF	y8, [Y1 + 1 * SIZE]
1119	add	Y1, INCY, Y1
1120
/* BUFFER advances in the delay slot, on the final pass as well. */
1121	deccc	I
1122	bg,pn	%icc, .LL991
1123	add	BUFFER, 8 * SIZE, BUFFER
1124
/*
 * Copy-back remainder: two elements, taken when bit 1 of M is set.
 * andcc leaves a non-negative result, so ble branches exactly when the
 * masked value is zero.
 */
1125.LL995:
1126	andcc	M, 2, I
1127	ble,pn	%icc, .LL996
1128	nop
1129
1130	LDF	[BUFFER +  0 * SIZE], a1
1131	LDF	[BUFFER +  1 * SIZE], a2
1132	LDF	[Y + 0 * SIZE], y1
1133	LDF	[Y + 1 * SIZE], y2
1134	add	Y, INCY, Y
1135
1136	LDF	[BUFFER +  2 * SIZE], a3
1137	LDF	[BUFFER +  3 * SIZE], a4
1138	LDF	[Y + 0 * SIZE], y3
1139	LDF	[Y + 1 * SIZE], y4
1140	add	Y, INCY, Y
1141
1142	FADD	y1, a1, y1
1143	FADD	y2, a2, y2
1144	FADD	y3, a3, y3
1145	FADD	y4, a4, y4
1146
1147	STF	y1, [Y1 + 0 * SIZE]
1148	STF	y2, [Y1 + 1 * SIZE]
1149	add	Y1, INCY, Y1
1150	STF	y3, [Y1 + 0 * SIZE]
1151	STF	y4, [Y1 + 1 * SIZE]
1152	add	Y1, INCY, Y1
1153
1154	add	BUFFER, 4 * SIZE, BUFFER
1155
/* Copy-back remainder: one element, taken when bit 0 of M is set. */
1156.LL996:
1157	andcc	M, 1, I
1158	ble,pn	%icc, .LL999
1159	nop
1160
1161	LDF	[BUFFER +  0 * SIZE], a1
1162	LDF	[BUFFER +  1 * SIZE], a2
1163	LDF	[Y + 0 * SIZE], y1
1164	LDF	[Y + 1 * SIZE], y2
1165
1166	FADD	y1, a1, y1
1167	FADD	y2, a2, y2
1168
1169	STF	y1, [Y1 + 0 * SIZE]
1170	STF	y2, [Y1 + 1 * SIZE]
1171
/*
 * Common exit.  The clr in the delay slot of the return zeroes %o0, so the
 * routine returns 0.
 */
1172.LL999:
1173	return	%i7 + 8
1174	clr	%o0
1175
1176	EPILOGUE
1177