1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Upper bound on the number of x elements handled per outer pass       */
/* (the code at .LL10 computes MIN_M = min(M - IS, P)).                  */
42#define P 4000
43
/* Integer register aliases: values kept in the %i registers.           */
/* A, LDA, X and INCX are (re)loaded from the stack in some builds, see  */
/* the argument-fetch code after PROLOGUE.                               */
44#define M	%i0
45#define N	%i1
46#define A	%i5
47#define LDA	%i2
48#define X	%i3
49#define INCX	%i4
50
/* Values loaded from the stack frame into %l registers.                */
51#define Y	%l0
52#define INCY	%l1
53#define BUFFER	%l2
54
/* Loop state: I and J are counters, IS is the current starting offset  */
/* of the outer pass, MIN_M the element count of the pass, XP the base   */
/* of the x data used by the pass (X itself or BUFFER).                  */
55#define I	%l3
56#define IS	%l4
57#define J	%l5
58#define MIN_M	%l6
59#define XP	%l7
60
/* Walking pointers into up to four consecutive columns of A.           */
61#define A1	%o0
62#define A2	%o1
63#define A3	%o2
64#define A4	%o3
65
/* X1 walks x; Y1 reads y (and is the copy destination in .LL11/.LL16); */
/* Y2 writes y; PNLDA is the per-pass adjustment added to A at .LL400.   */
66#define X1	%o4
67#define Y1	%o5
68#define PNLDA	%g1
69#define Y2	%o7	/* Danger? */
70
/* Floating-point register aliases.                                     */
/*   t1..t4  : products of the most recent multiply group               */
/*   c1..c16 : running sums (c9..c16 also hold x in the 1-column loop)   */
/*   a1..a8  : elements of A, later elements of y                       */
/*   b1..b4  : elements of x, later ALPHA_R / ALPHA_I                   */
/* DOUBLE uses even-numbered registers, single precision uses %f0..%f31. */
71#ifdef DOUBLE
72#define t1	%f0
73#define	t2 	%f2
74#define t3	%f4
75#define	t4 	%f6
76
77#define c1	%f8
78#define c2	%f10
79#define c3	%f12
80#define c4	%f14
81#define c5	%f16
82#define c6	%f18
83#define c7	%f20
84#define c8	%f22
85#define c9	%f24
86#define c10	%f26
87#define c11	%f28
88#define c12	%f30
89#define c13	%f32
90#define c14	%f34
91#define c15	%f36
92#define c16	%f38
93
94#define a1	%f40
95#define a2	%f42
96#define a3	%f44
97#define a4	%f46
98#define a5	%f48
99#define a6	%f50
100#define a7	%f52
101#define a8	%f54
102
103#define b1	%f56
104#define b2	%f58
105#define b3	%f60
106#define b4	%f62
107#else
108#define t1	%f0
109#define	t2 	%f1
110#define t3	%f2
111#define	t4 	%f3
112
113#define c1	%f4
114#define c2	%f5
115#define c3	%f6
116#define c4	%f7
117#define c5	%f8
118#define c6	%f9
119#define c7	%f10
120#define c8	%f11
121#define c9	%f12
122#define c10	%f13
123#define c11	%f14
124#define c12	%f15
125#define c13	%f16
126#define c14	%f17
127#define c15	%f18
128#define c16	%f19
129
130#define a1	%f20
131#define a2	%f21
132#define a3	%f22
133#define a4	%f23
134#define a5	%f24
135#define a6	%f25
136#define a7	%f26
137#define a8	%f27
138
139#define b1	%f28
140#define b2	%f29
141#define b3	%f30
142#define b4	%f31
143#endif
144
/* Stack slots where the real and imaginary parts of alpha are kept so  */
/* they can be loaded into FP registers with LDF. STACK_START is defined */
/* outside this file.                                                    */
145#ifndef __64BIT__
146#define ALPHA_R	[%sp + STACK_START + 16]
147#ifndef DOUBLE
148#define ALPHA_I	[%sp + STACK_START + 20]
149#else
150#define ALPHA_I	[%sp + STACK_START + 24]
151#endif
152#else
153#define ALPHA_R	[%sp + STACK_START + 32]
154#define ALPHA_I	[%sp + STACK_START + 40]
155#endif
156
/* Prefetch distance, used as an offset of PREFETCHSIZE * SIZE bytes    */
/* ahead of the A pointers in the unrolled loops.                        */
157#ifdef DOUBLE
158#define PREFETCHSIZE 18
159#else
160#define PREFETCHSIZE 36
161#endif
162
/* Function entry. PROLOGUE and SAVESP are macros defined outside this  */
/* file (presumably the symbol header and the register-window save).     */
/* This section gathers the arguments, parks alpha on the stack, and     */
/* scales LDA / INCX / INCY by ZBASE_SHIFT.                              */
163	PROLOGUE
164	SAVESP
165	nop
166
167#ifndef __64BIT__
168
/* 32-bit ABI: alpha arrives in integer registers and is stored to the  */
/* ALPHA_R / ALPHA_I stack slots; the remaining arguments are loaded     */
/* from the stack.                                                       */
169#ifdef DOUBLE
/* Only the word at +24 of ALPHA_I is written here; the word at +28 is  */
/* presumably already in place from the caller - TODO confirm.           */
170	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
171	st	%i4, [%sp + STACK_START + 20]
172	st	%i5, [%sp + STACK_START + 24]   /* ALPHA_I */
173
174	ld	[%sp + STACK_START + 32], A
175	ld	[%sp + STACK_START + 36], LDA
176	ld	[%sp + STACK_START + 40], X
177	ld	[%sp + STACK_START + 44], INCX
178	ld	[%sp + STACK_START + 48], Y
179	ld	[%sp + STACK_START + 52], INCY
180	ld	[%sp + STACK_START + 56], BUFFER
181#else
/* Single precision: A is not loaded here, it stays in %i5.             */
182	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
183	st	%i4, [%sp + STACK_START + 20]   /* ALPHA_I */
184
185	ld	[%sp + STACK_START + 28], LDA
186	ld	[%sp + STACK_START + 32], X
187	ld	[%sp + STACK_START + 36], INCX
188	ld	[%sp + STACK_START + 40], Y
189	ld	[%sp + STACK_START + 44], INCY
190	ld	[%sp + STACK_START + 48], BUFFER
191#endif
192#else
/* 64-bit ABI: stack arguments are loaded with ldx; alpha is taken from */
/* FP registers and stored to the ALPHA_R / ALPHA_I slots.               */
193	ldx	[%sp + STACK_START + 56], LDA
194	ldx	[%sp + STACK_START + 64], X
195	ldx	[%sp + STACK_START + 72], INCX
196	ldx	[%sp + STACK_START + 80], Y
197	ldx	[%sp + STACK_START + 88], INCY
198	ldx	[%sp + STACK_START + 96], BUFFER
199#ifdef DOUBLE
200	std	%f6, ALPHA_R
201	std	%f8, ALPHA_I
202#else
203	st	%f7, ALPHA_R
204	st	%f9, ALPHA_I
205#endif
206#endif
207
/* IS = 0.  LDA, INCX and INCY are shifted left by ZBASE_SHIFT          */
/* (presumably converting complex-element counts to byte offsets).       */
/* PNLDA = (P << ZBASE_SHIFT) - LDA * N; it is added to A after each     */
/* outer pass at .LL400.                                                 */
208	clr	IS
209	mov	P, I
210	sll	LDA, ZBASE_SHIFT, LDA
211	sll	I, ZBASE_SHIFT, I
212	smul	LDA, N, PNLDA
213	sll	INCX, ZBASE_SHIFT, INCX
214	sll	INCY, ZBASE_SHIFT, INCY
215	sub	I, PNLDA, PNLDA
216
/* Top of the outer pass. Computes MIN_M = min(M - IS, P) and selects    */
/* the x source for this pass:                                           */
/*   - INCX == 2 * SIZE: use x in place, XP = X + (IS << ZBASE_SHIFT)    */
/*     (the add sits in the branch delay slot and always executes);      */
/*   - otherwise: copy MIN_M elements of x into BUFFER and use XP =      */
/*     BUFFER. X itself is advanced by the copy.                         */
217.LL10:
218	sll	IS, ZBASE_SHIFT, I
219	sub	M, IS, MIN_M
220	mov	P, J
221
222	cmp	MIN_M, J
223	nop
224	movg	%icc, J, MIN_M
225	nop
226	cmp	INCX, 2 * SIZE
227	beq	.LL100
228	add	X, I, XP
229
/* Strided x: I = MIN_M / 4 unrolled copy iterations, Y1 is the write   */
/* cursor into BUFFER (set in the delay slot).                           */
230	sra	MIN_M, 2, I
231	mov	BUFFER, XP
232	cmp	I, 0
233	ble,pn	%icc, .LL15
234	mov	BUFFER, Y1
235
/* Copy four complex elements (eight values) of x per iteration.        */
236.LL11:
237	LDF	[X + 0 * SIZE], a1
238	LDF	[X + 1 * SIZE], a2
239	add	X, INCX, X
240	LDF	[X + 0 * SIZE], a3
241	LDF	[X + 1 * SIZE], a4
242	add	X, INCX, X
243	LDF	[X + 0 * SIZE], a5
244	LDF	[X + 1 * SIZE], a6
245	add	X, INCX, X
246	LDF	[X + 0 * SIZE], a7
247	LDF	[X + 1 * SIZE], a8
248	add	X, INCX, X
249
250	STF	a1, [Y1 + 0 * SIZE]
251	add	I, -1, I
252	STF	a2, [Y1 + 1 * SIZE]
253	cmp	I, 0
254	STF	a3, [Y1 + 2 * SIZE]
255	STF	a4, [Y1 + 3 * SIZE]
256	STF	a5, [Y1 + 4 * SIZE]
257	STF	a6, [Y1 + 5 * SIZE]
258	STF	a7, [Y1 + 6 * SIZE]
259	STF	a8, [Y1 + 7 * SIZE]
260	bg,pn	%icc, .LL11
261	add	Y1, 8 * SIZE, Y1
262
/* Remaining MIN_M & 3 elements, copied one at a time.                  */
263.LL15:
264	and	MIN_M, 3, I
265	cmp	I, 0
266	ble,pn	%icc, .LL100
267	nop
268
269.LL16:
270	LDF	[X + 0 * SIZE], a1
271	LDF	[X + 1 * SIZE], a2
272	add	X, INCX, X
273	add	I, -1, I
274	cmp	I, 0
275	nop
276	STF	a1, [Y1 + 0 * SIZE]
277	STF	a2, [Y1 + 1 * SIZE]
278	bg,pn	%icc, .LL16
279	add	Y1, 2 * SIZE, Y1
280
/* Four-columns-at-a-time section. J = N / 4 column groups; Y1 is reset */
/* to Y in the delay slot (also when branching to .LL200).               */
281.LL100:
282	sra	N, 2, J
283	cmp	J, 0
284	ble	%icc, .LL200
285	mov	Y, Y1
286
/* Per column group: zero t1..t4 and c1..c16 (FCLR is a macro defined   */
/* outside this file; presumably it clears %f0, i.e. t1), set            */
/* A1..A4 = A, A+LDA, A+2*LDA, A+3*LDA, advance A by 4*LDA, set          */
/* I = MIN_M / 4 and X1 = XP.                                            */
287.LL110:
288	FCLR(0)
289
290	FMOV	t1, c1
291	sra	MIN_M, 2, I
292	FMOV	t1, c2
293	add	A,  LDA, A2
294	FMOV	t1, c3
295	mov	A,  A1
296	FMOV	t1, c4
297	add	A2, LDA, A3
298
299	FMOV	t1, c5
300	FMOV	t1, c6
301	FMOV	t1, c7
302	FMOV	t1, c8
303	FMOV	t1, c9
304	FMOV	t1, c10
305	FMOV	t1, c11
306	FMOV	t1, c12
307	FMOV	t1, c13
308	FMOV	t1, c14
309	FMOV	t1, c15
310	FMOV	t1, c16
311
312	add	A3, LDA, A4
313	FMOV	t1, t2
314	mov	XP, X1
315	FMOV	t1, t3
316	add	A4, LDA, A
317	cmp	I, 0
318	ble	%icc, .LL115
319	FMOV	t1, t4
320
/* Preload the first element of each of the four columns (a1..a8) and   */
/* the first three x values (b1..b3); b4 is loaded later from            */
/* [X1 - 1 * SIZE].                                                      */
321	LDF	[A1 + 0 * SIZE], a1
322	nop
323	LDF	[A1 + 1 * SIZE], a2
324	add	A1, 2 * SIZE, A1
325	LDF	[A2 + 0 * SIZE], a3
326	LDF	[A2 + 1 * SIZE], a4
327	add	A2, 2 * SIZE, A2
328	LDF	[A3 + 0 * SIZE], a5
329	LDF	[A3 + 1 * SIZE], a6
330	add	A3, 2 * SIZE, A3
331	LDF	[A4 + 0 * SIZE], a7
332	LDF	[A4 + 1 * SIZE], a8
333	add	A4, 2 * SIZE, A4
334
335	LDF	[X1 + 0 * SIZE], b1
336	nop
337	LDF	[X1 + 1 * SIZE], b2
338	nop
339	LDF	[X1 + 2 * SIZE], b3
340	add	X1, 4 * SIZE, X1
341
/* If only one unrolled group remains, go straight to the drain code.   */
342	deccc	 I
343	ble	 .LL112
344	prefetch [Y1 + 7 * SIZE], 2
345
/* FADDX accumulates the (A first value) * (x second value) products:    */
/* added normally, subtracted when XCONJ is defined.                     */
346#ifndef XCONJ
347#define FADDX	FADD
348#else
349#define FADDX	FSUB
350#endif
351
/* Main unrolled loop: four columns times four x elements per iteration. */
/* Each group of four FADD/FMUL pairs forms the four partial products of */
/* one complex multiply into t1..t4; each FADD folds in the product      */
/* written to the same t register by the previous group (software        */
/* pipelining), so e.g. c13..c16 receive the A4-column products.         */
/* Loads for the following groups are interleaved. Even x elements use   */
/* b1/b2, odd x elements use b3/b4.                                      */
/* The condition codes set by "deccc I" in the loop body stay live until */
/* the closing "bg"; nothing in between modifies %icc.                   */
352.LL111:
/* --- x element 0 of the group (b1, b2) --- */
353	FADD	c13, t1, c13
354	prefetch [A1 + PREFETCHSIZE * SIZE], 1
355	FMUL	a1, b1, t1
356	nop
357
358	FADDX	c14, t2, c14
359	nop
360	FMUL	a1, b2, t2
361	LDF	[A1 + 0 * SIZE], a1
362
363	FADD	c15, t3, c15
364	nop
365	FMUL	a2, b1, t3
366	LDF	[X1 - 1 * SIZE], b4
367
368	FADD	c16, t4, c16
369	nop
370	FMUL	a2, b2, t4
371	LDF	[A1 + 1 * SIZE], a2
372
373	FADD	c1, t1, c1
374	nop
375	FMUL	a3, b1, t1
376	nop
377
378	FADDX	c2, t2, c2
379	nop
380	FMUL	a3, b2, t2
381	LDF	[A2 + 0 * SIZE], a3
382
383	FADD	c3, t3, c3
384	nop
385	FMUL	a4, b1, t3
386	nop
387
388	FADD	c4, t4, c4
389	nop
390	FMUL	a4, b2, t4
391	LDF	[A2 + 1 * SIZE], a4
392
393	FADD	c5, t1, c5
394	nop
395	FMUL	a5, b1, t1
396	nop
397
398	FADDX	c6, t2, c6
399	nop
400	FMUL	a5, b2, t2
401	LDF	[A3 + 0 * SIZE], a5
402
403	FADD	c7, t3, c7
404	nop
405	FMUL	a6, b1, t3
406	nop
407
408	FADD	c8, t4, c8
409	nop
410	FMUL	a6, b2, t4
411	LDF	[A3 + 1 * SIZE], a6
412
413	FADD	c9, t1, c9
414	nop
415	FMUL	a7, b1, t1
416	nop
417
418	FADDX	c10, t2, c10
419	nop
420	FMUL	a7, b2, t2
421	LDF	[A4 + 0 * SIZE], a7
422
423	FADD	c11, t3, c11
424	nop
425	FMUL	a8, b1, t3
426	LDF	[X1 + 0 * SIZE], b1
427
428	FADD	c12, t4, c12
429	nop
430	FMUL	a8, b2, t4
431	LDF	[A4 + 1 * SIZE], a8
432
/* --- x element 1 of the group (b3, b4) --- */
433	FADD	c13, t1, c13
434	nop
435	FMUL	a1, b3, t1
436	prefetch [A2 + PREFETCHSIZE * SIZE], 1
437
438	FADDX	c14, t2, c14
439	nop
440	FMUL	a1, b4, t2
441	LDF	[A1 + 2 * SIZE], a1
442
443	FADD	c15, t3, c15
444	nop
445	FMUL	a2, b3, t3
446	LDF	[X1 + 1 * SIZE], b2
447
448	FADD	c16, t4, c16
449	nop
450	FMUL	a2, b4, t4
451	LDF	[A1 + 3 * SIZE], a2
452
453	FADD	c1, t1, c1
454	nop
455	FMUL	a3, b3, t1
456	nop
457
458	FADDX	c2, t2, c2
459	nop
460	FMUL	a3, b4, t2
461	LDF	[A2 + 2 * SIZE], a3
462
463	FADD	c3, t3, c3
464	nop
465	FMUL	a4, b3, t3
466	nop
467
468	FADD	c4, t4, c4
469	nop
470	FMUL	a4, b4, t4
471	LDF	[A2 + 3 * SIZE], a4
472
473	FADD	c5, t1, c5
474	nop
475	FMUL	a5, b3, t1
476	nop
477
478	FADDX	c6, t2, c6
479	nop
480	FMUL	a5, b4, t2
481	LDF	[A3 + 2 * SIZE], a5
482
483	FADD	c7, t3, c7
484	nop
485	FMUL	a6, b3, t3
486	nop
487
488	FADD	c8, t4, c8
489	nop
490	FMUL	a6, b4, t4
491	LDF	[A3 + 3 * SIZE], a6
492
493	FADD	c9, t1, c9
494	nop
495	FMUL	a7, b3, t1
496	nop
497
498	FADDX	c10, t2, c10
499	nop
500	FMUL	a7, b4, t2
501	LDF	[A4 + 2 * SIZE], a7
502
503	FADD	c11, t3, c11
504	nop
505	FMUL	a8, b3, t3
506	LDF	[X1 + 2 * SIZE], b3
507
508	FADD	c12, t4, c12
509	nop
510	FMUL	a8, b4, t4
511	LDF	[A4 + 3 * SIZE], a8
512
/* --- x element 2 of the group (b1, b2) --- */
513	FADD	c13, t1, c13
514	prefetch [A3 + PREFETCHSIZE * SIZE], 1
515	FMUL	a1, b1, t1
516	nop
517
518	FADDX	c14, t2, c14
519	nop
520	FMUL	a1, b2, t2
521	LDF	[A1 + 4 * SIZE], a1
522
523	FADD	c15, t3, c15
524	nop
525	FMUL	a2, b1, t3
526	LDF	[X1 + 3 * SIZE], b4
527
528	FADD	c16, t4, c16
529	nop
530	FMUL	a2, b2, t4
531	LDF	[A1 + 5 * SIZE], a2
532
533	FADD	c1, t1, c1
534	nop
535	FMUL	a3, b1, t1
536	nop
537
538	FADDX	c2, t2, c2
539	nop
540	FMUL	a3, b2, t2
541	LDF	[A2 + 4 * SIZE], a3
542
543	FADD	c3, t3, c3
544	nop
545	FMUL	a4, b1, t3
546	nop
547
548	FADD	c4, t4, c4
549	nop
550	FMUL	a4, b2, t4
551	LDF	[A2 + 5 * SIZE], a4
552
553	FADD	c5, t1, c5
554	nop
555	FMUL	a5, b1, t1
556	nop
557
558	FADDX	c6, t2, c6
559	nop
560	FMUL	a5, b2, t2
561	LDF	[A3 + 4 * SIZE], a5
562
/* Loop counter decrement; its flags are consumed by the final "bg".    */
563	FADD	c7, t3, c7
564	deccc	I
565	FMUL	a6, b1, t3
566	nop
567
568	FADD	c8, t4, c8
569	nop
570	FMUL	a6, b2, t4
571	LDF	[A3 + 5 * SIZE], a6
572
573	FADD	c9, t1, c9
574	nop
575	FMUL	a7, b1, t1
576	nop
577
578	FADDX	c10, t2, c10
579	nop
580	FMUL	a7, b2, t2
581	LDF	[A4 + 4 * SIZE], a7
582
583	FADD	c11, t3, c11
584	nop
585	FMUL	a8, b1, t3
586	LDF	[X1 + 4 * SIZE], b1
587
588	FADD	c12, t4, c12
589	nop
590	FMUL	a8, b2, t4
591	LDF	[A4 + 5 * SIZE], a8
592
/* --- x element 3 of the group (b3, b4); pointers are advanced here,   */
/* so later loads in this part use negative offsets where the pointer    */
/* has already been bumped. ---                                          */
593	FADD	c13, t1, c13
594	prefetch [A4 + PREFETCHSIZE * SIZE], 1
595	FMUL	a1, b3, t1
596	nop
597
598	FADDX	c14, t2, c14
599	nop
600	FMUL	a1, b4, t2
601	LDF	[A1 + 6 * SIZE], a1
602
603	FADD	c15, t3, c15
604	nop
605	FMUL	a2, b3, t3
606	LDF	[X1 + 5 * SIZE], b2
607
608	FADD	c16, t4, c16
609	nop
610	FMUL	a2, b4, t4
611	LDF	[A1 + 7 * SIZE], a2
612
613	FADD	c1, t1, c1
614	add	A1, 8 * SIZE, A1
615	FMUL	a3, b3, t1
616	nop
617
618	FADDX	c2, t2, c2
619	nop
620	FMUL	a3, b4, t2
621	LDF	[A2 + 6 * SIZE], a3
622
623	FADD	c3, t3, c3
624	nop
625	FMUL	a4, b3, t3
626	nop
627
628	FADD	c4, t4, c4
629	nop
630	FMUL	a4, b4, t4
631	LDF	[A2 + 7 * SIZE], a4
632
633	FADD	c5, t1, c5
634	add	A2, 8 * SIZE, A2
635	FMUL	a5, b3, t1
636	nop
637
638	FADDX	c6, t2, c6
639	nop
640	FMUL	a5, b4, t2
641	LDF	[A3 + 6 * SIZE], a5
642
643	FADD	c7, t3, c7
644	add	A4, 8 * SIZE, A4
645	FMUL	a6, b3, t3
646	nop
647
648	FADD	c8, t4, c8
649	nop
650	FMUL	a6, b4, t4
651	LDF	[A3 + 7 * SIZE], a6
652
653	FADD	c9, t1, c9
654	add	A3, 8 * SIZE, A3
655	FMUL	a7, b3, t1
656	nop
657
658	FADDX	c10, t2, c10
659	add	X1, 8 * SIZE, X1
660	FMUL	a7, b4, t2
661	LDF	[A4 - 2 * SIZE], a7
662
663	FADD	c11, t3, c11
664	nop
665	FMUL	a8, b3, t3
666	LDF	[X1 - 2 * SIZE], b3
667
/* Loop back while I > 0; the last A4 load sits in the delay slot.      */
668	FADD	c12, t4, c12
669	FMUL	a8, b4, t4
670	bg,pn	%icc, .LL111
671	LDF	[A4 - 1 * SIZE], a8
672
/* Drain of the four-column unrolled loop: processes the last group of   */
/* four x elements without looping back. It still loads the three        */
/* remaining elements per column and the remaining x values, advances    */
/* A1..A4 by 6 * SIZE and X1 by 4 * SIZE, and performs all multiplies.   */
/* The final four products are left in t1..t4; they are added to         */
/* c13..c16 by the code that follows (.L116 or .LL119).                  */
/* Note: b4 is loaded from [X1 - 1 * SIZE] twice below (same address,    */
/* same destination); the second load is redundant but harmless.         */
673.LL112:
/* --- x element 0 of the group (b1, b2) --- */
674	FADD	c13, t1, c13
675	nop
676	FMUL	a1, b1, t1
677	LDF	[X1 - 1 * SIZE], b4
678
679	FADDX	c14, t2, c14
680	nop
681	FMUL	a1, b2, t2
682	LDF	[A1 + 0 * SIZE], a1
683
684	FADD	c15, t3, c15
685	nop
686	FMUL	a2, b1, t3
687	LDF	[X1 - 1 * SIZE], b4
688
689	FADD	c16, t4, c16
690	nop
691	FMUL	a2, b2, t4
692	LDF	[A1 + 1 * SIZE], a2
693
694	FADD	c1, t1, c1
695	nop
696	FMUL	a3, b1, t1
697	nop
698
699	FADDX	c2, t2, c2
700	nop
701	FMUL	a3, b2, t2
702	LDF	[A2 + 0 * SIZE], a3
703
704	FADD	c3, t3, c3
705	nop
706	FMUL	a4, b1, t3
707	nop
708
709	FADD	c4, t4, c4
710	nop
711	FMUL	a4, b2, t4
712	LDF	[A2 + 1 * SIZE], a4
713
714	FADD	c5, t1, c5
715	nop
716	FMUL	a5, b1, t1
717	nop
718
719	FADDX	c6, t2, c6
720	nop
721	FMUL	a5, b2, t2
722	LDF	[A3 + 0 * SIZE], a5
723
724	FADD	c7, t3, c7
725	nop
726	FMUL	a6, b1, t3
727	nop
728
729	FADD	c8, t4, c8
730	nop
731	FMUL	a6, b2, t4
732	LDF	[A3 + 1 * SIZE], a6
733
734	FADD	c9, t1, c9
735	nop
736	FMUL	a7, b1, t1
737	nop
738
739	FADDX	c10, t2, c10
740	nop
741	FMUL	a7, b2, t2
742	LDF	[A4 + 0 * SIZE], a7
743
744	FADD	c11, t3, c11
745	nop
746	FMUL	a8, b1, t3
747	LDF	[X1 + 0 * SIZE], b1
748
749	FADD	c12, t4, c12
750	nop
751	FMUL	a8, b2, t4
752	LDF	[A4 + 1 * SIZE], a8
753
/* --- x element 1 of the group (b3, b4) --- */
754	FADD	c13, t1, c13
755	nop
756	FMUL	a1, b3, t1
757	LDF	[X1 + 1 * SIZE], b2
758
759	FADDX	c14, t2, c14
760	nop
761	FMUL	a1, b4, t2
762	LDF	[A1 + 2 * SIZE], a1
763
764	FADD	c15, t3, c15
765	nop
766	FMUL	a2, b3, t3
767	nop
768
769	FADD	c16, t4, c16
770	nop
771	FMUL	a2, b4, t4
772	LDF	[A1 + 3 * SIZE], a2
773
774	FADD	c1, t1, c1
775	nop
776	FMUL	a3, b3, t1
777	nop
778
779	FADDX	c2, t2, c2
780	nop
781	FMUL	a3, b4, t2
782	LDF	[A2 + 2 * SIZE], a3
783
784	FADD	c3, t3, c3
785	nop
786	FMUL	a4, b3, t3
787	nop
788
789	FADD	c4, t4, c4
790	nop
791	FMUL	a4, b4, t4
792	LDF	[A2 + 3 * SIZE], a4
793
794	FADD	c5, t1, c5
795	nop
796	FMUL	a5, b3, t1
797	nop
798
799	FADDX	c6, t2, c6
800	nop
801	FMUL	a5, b4, t2
802	LDF	[A3 + 2 * SIZE], a5
803
804	FADD	c7, t3, c7
805	nop
806	FMUL	a6, b3, t3
807	nop
808
809	FADD	c8, t4, c8
810	nop
811	FMUL	a6, b4, t4
812	LDF	[A3 + 3 * SIZE], a6
813
814	FADD	c9, t1, c9
815	nop
816	FMUL	a7, b3, t1
817	nop
818
819	FADDX	c10, t2, c10
820	nop
821	FMUL	a7, b4, t2
822	LDF	[A4 + 2 * SIZE], a7
823
824	FADD	c11, t3, c11
825	nop
826	FMUL	a8, b3, t3
827	LDF	[X1 + 2 * SIZE], b3
828
829	FADD	c12, t4, c12
830	nop
831	FMUL	a8, b4, t4
832	LDF	[A4 + 3 * SIZE], a8
833
/* --- x element 2 of the group (b1, b2); last loads, pointers bumped --- */
834	FADD	c13, t1, c13
835	nop
836	FMUL	a1, b1, t1
837	LDF	[X1 + 3 * SIZE], b4
838
839	FADDX	c14, t2, c14
840	add	X1, 4 * SIZE, X1
841	FMUL	a1, b2, t2
842	LDF	[A1 + 4 * SIZE], a1
843
844	FADD	c15, t3, c15
845	nop
846	FMUL	a2, b1, t3
847	nop
848
849	FADD	c16, t4, c16
850	nop
851	FMUL	a2, b2, t4
852	LDF	[A1 + 5 * SIZE], a2
853
854	FADD	c1, t1, c1
855	add	A1, 6 * SIZE, A1
856	FMUL	a3, b1, t1
857	nop
858
859	FADDX	c2, t2, c2
860	nop
861	FMUL	a3, b2, t2
862	LDF	[A2 + 4 * SIZE], a3
863
864	FADD	c3, t3, c3
865	nop
866	FMUL	a4, b1, t3
867	nop
868
869	FADD	c4, t4, c4
870	nop
871	FMUL	a4, b2, t4
872	LDF	[A2 + 5 * SIZE], a4
873
874	FADD	c5, t1, c5
875	add	A2, 6 * SIZE, A2
876	FMUL	a5, b1, t1
877	nop
878
879	FADDX	c6, t2, c6
880	nop
881	FMUL	a5, b2, t2
882	LDF	[A3 + 4 * SIZE], a5
883
884	FADD	c7, t3, c7
885	nop
886	FMUL	a6, b1, t3
887	nop
888
889	FADD	c8, t4, c8
890	nop
891	FMUL	a6, b2, t4
892	LDF	[A3 + 5 * SIZE], a6
893
894	FADD	c9, t1, c9
895	add	A3, 6 * SIZE, A3
896	FMUL	a7, b1, t1
897	nop
898
899	FADDX	c10, t2, c10
900	nop
901	FMUL	a7, b2, t2
902	LDF	[A4 + 4 * SIZE], a7
903
904	FADD	c11, t3, c11
905	nop
906	FMUL	a8, b1, t3
907	nop
908
909	FADD	c12, t4, c12
910	nop
911	FMUL	a8, b2, t4
912	LDF	[A4 + 5 * SIZE], a8
913
/* --- x element 3 of the group (b3, b4); arithmetic only, no loads --- */
914	FADD	c13, t1, c13
915	add	A4, 6 * SIZE, A4
916	FMUL	a1, b3, t1
917	nop
918
919	FADDX	c14, t2, c14
920	nop
921	FMUL	a1, b4, t2
922	nop
923
924	FADD	c15, t3, c15
925	FMUL	a2, b3, t3
926	FADD	c16, t4, c16
927	FMUL	a2, b4, t4
928
929	FADD	c1, t1, c1
930	FMUL	a3, b3, t1
931	FADDX	c2, t2, c2
932	FMUL	a3, b4, t2
933	FADD	c3, t3, c3
934	FMUL	a4, b3, t3
935	FADD	c4, t4, c4
936	FMUL	a4, b4, t4
937
938	FADD	c5, t1, c5
939	FMUL	a5, b3, t1
940	FADDX	c6, t2, c6
941	FMUL	a5, b4, t2
942	FADD	c7, t3, c7
943	FMUL	a6, b3, t3
944	FADD	c8, t4, c8
945	FMUL	a6, b4, t4
946
947	FADD	c9, t1, c9
948	FMUL	a7, b3, t1
949	FADDX	c10, t2, c10
950	FMUL	a7, b4, t2
951	FADD	c11, t3, c11
952	FMUL	a8, b3, t3
953	FADD	c12, t4, c12
954	FMUL	a8, b4, t4
955
/* Row remainder for the four-column case: I = MIN_M & 3.               */
/* b3 / b4 are reloaded with ALPHA_R / ALPHA_I (b4 in the delay slot,    */
/* so both loads happen on either path) and Y2 is set to Y1 as the       */
/* store cursor. When I == 0 control goes directly to .LL119.            */
956.LL115:
957	andcc	MIN_M, 3, I
958	LDF	ALPHA_R, b3
959	mov	Y1, Y2
960	ble,pn	%icc, .LL119
961	LDF	ALPHA_I, b4
962
/* One x element per iteration against all four columns. The FADDs at   */
/* the top of each group fold in the products left in t1..t4 by the      */
/* previous group (or by the code that ran before this loop).            */
963.L116:
964	LDF	[A1 + 0 * SIZE], a1
965	LDF	[A1 + 1 * SIZE], a2
966	add	A1, 2 * SIZE, A1
967	LDF	[X1 + 0 * SIZE], b1
968	LDF	[X1 + 1 * SIZE], b2
969	add	X1, 2 * SIZE, X1
970	LDF	[A2 + 0 * SIZE], a3
971	LDF	[A2 + 1 * SIZE], a4
972	add	A2, 2 * SIZE, A2
973	LDF	[A3 + 0 * SIZE], a5
974	LDF	[A3 + 1 * SIZE], a6
975	add	A3, 2 * SIZE, A3
976	LDF	[A4 + 0 * SIZE], a7
977	LDF	[A4 + 1 * SIZE], a8
978	add	A4, 2 * SIZE, A4
979
980	FADD	c13, t1, c13
981	FMUL	a1, b1, t1
982	FADDX	c14, t2, c14
983	FMUL	a1, b2, t2
984	FADD	c15, t3, c15
985	FMUL	a2, b1, t3
986	FADD	c16, t4, c16
987	FMUL	a2, b2, t4
988
989	FADD	c1, t1, c1
990	FMUL	a3, b1, t1
991	FADDX	c2, t2, c2
992	FMUL	a3, b2, t2
993	FADD	c3, t3, c3
994	FMUL	a4, b1, t3
995	FADD	c4, t4, c4
996	FMUL	a4, b2, t4
997
998	FADD	c5, t1, c5
999	FMUL	a5, b1, t1
1000	FADDX	c6, t2, c6
1001	FMUL	a5, b2, t2
1002	FADD	c7, t3, c7
1003	FMUL	a6, b1, t3
1004	FADD	c8, t4, c8
1005	FMUL	a6, b2, t4
1006
1007	FADD	c9, t1, c9
1008	FMUL	a7, b1, t1
1009	FADDX	c10, t2, c10
1010	FMUL	a7, b2, t2
1011	FADD	c11, t3, c11
1012	FMUL	a8, b1, t3
1013	FADD	c12, t4, c12
1014	FMUL	a8, b2, t4
1015
1016	deccc	I
1017	bg	%icc, .L116
1018	nop
1019
/* Finish one group of four columns:                                     */
/*  1. fold the last pending products (t1..t4) into c13..c16;            */
/*  2. load four y elements through Y1 (stepping by INCY) into a1..a8;   */
/*  3. combine the four partial sums of each column into one complex     */
/*     value, with signs selected by CONJ / XCONJ;                       */
/*  4. multiply by alpha (b3 = ALPHA_R, b4 = ALPHA_I), add to y and      */
/*     store through Y2;                                                 */
/*  5. decrement J and loop to .LL110 while J > 0.                       */
/* Column sums live in (c1,c2), (c5,c6), (c9,c10), (c13,c14).            */
1020.LL119:
1021	FADD	c13, t1, c13
1022	LDF	[Y1 + 0 * SIZE], a1
1023	FADDX	c14, t2, c14
1024	LDF	[Y1 + 1 * SIZE] ,a2
1025	add	Y1, INCY, Y1
1026	FADD	c15, t3, c15
1027	LDF	[Y1 + 0 * SIZE], a3
1028	FADD	c16, t4, c16
1029	LDF	[Y1 + 1 * SIZE] ,a4
1030	add	Y1, INCY, Y1
1031
/* First component of each column sum: c1 -/+ c4 (and likewise for the  */
/* other columns).                                                       */
1032#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
1033	FSUB	c1,  c4,  c1
1034	LDF	[Y1 + 0 * SIZE], a5
1035	FSUB	c5,  c8,  c5
1036	LDF	[Y1 + 1 * SIZE] ,a6
1037	add	Y1, INCY, Y1
1038	FSUB	c9,  c12, c9
1039	LDF	[Y1 + 0 * SIZE], a7
1040	FSUB	c13, c16, c13
1041	LDF	[Y1 + 1 * SIZE] ,a8
1042	add	Y1, INCY, Y1
1043#else
1044	FADD	c1,  c4,  c1
1045	LDF	[Y1 + 0 * SIZE], a5
1046	FADD	c5,  c8,  c5
1047	LDF	[Y1 + 1 * SIZE] ,a6
1048	add	Y1, INCY, Y1
1049	FADD	c9,  c12, c9
1050	LDF	[Y1 + 0 * SIZE], a7
1051	FADD	c13, c16, c13
1052	LDF	[Y1 + 1 * SIZE] ,a8
1053	add	Y1, INCY, Y1
1054#endif
1055
/* Second component: c2 +/- c3. FCLR(0) is issued here so that t1 can   */
/* be copied into the other temporaries and accumulators below.          */
1056#ifndef CONJ
1057	FADD	c2,  c3,  c2
1058	FCLR(0)
1059	FADD	c6,  c7,  c6
1060	FADD	c10, c11, c10
1061	FADD	c14, c15, c14
1062#else
1063	FSUB	c2,  c3,  c2
1064	FCLR(0)
1065	FSUB	c6,  c7,  c6
1066	FSUB	c10, c11, c10
1067	FSUB	c14, c15, c14
1068#endif
1069
/* Complex multiply by alpha: for column sum (c1, c2) this forms        */
/*   c1 = b3*c1 - b4*c2,  c2 = b3*c2 + b4*c1   (using the old c1, c2),  */
/* and the same for the other three columns.                             */
1070	FMUL	b3, c1, c3
1071	FMOV	t1, t2
1072	FMUL	b4, c1, c4
1073	FMOV	t1, t3
1074	FMUL	b4, c2, c1
1075	FMOV	t1, t4
1076	FMUL	b3, c2, c2
1077
1078	FMUL	b3, c5, c7
1079	FMUL	b4, c5, c8
1080	FMUL	b4, c6, c5
1081	FMUL	b3, c6, c6
1082
1083	FMUL	b3, c9,  c11
1084	FMUL	b4, c9,  c12
1085	FMUL	b4, c10, c9
1086	FMUL	b3, c10, c10
1087
1088	FMUL	b3, c13, c15
1089	FSUB	c3,  c1,  c1
1090	FMUL	b4, c13, c16
1091	FADD	c2,  c4,  c2
1092	FMUL	b4, c14, c13
1093	FSUB	c7,  c5,  c5
1094	FMUL	b3, c14, c14
1095	FADD	c6,  c8,  c6
1096
1097	FSUB	c11, c9,  c9
1098	FADD	c10, c12, c10
1099	FSUB	c15, c13, c13
1100	FADD	c14, c16, c14
1101
/* y += alpha * sum, then store the four updated y elements via Y2.     */
1102	FADD	a1, c1, a1
1103	FADD	a2, c2, a2
1104	FADD	a3, c5, a3
1105	FADD	a4, c6, a4
1106
1107	STF	a1, [Y2 + 0 * SIZE]
1108	FADD	a5, c9,  a5
1109	STF	a2, [Y2 + 1 * SIZE]
1110	FADD	a6, c10, a6
1111	add	Y2, INCY, Y2
1112	STF	a3, [Y2 + 0 * SIZE]
1113	FADD	a7, c13, a7
1114	STF	a4, [Y2 + 1 * SIZE]
1115	FADD	a8, c14, a8
1116	add	Y2, INCY, Y2
1117
1118	STF	a5, [Y2 + 0 * SIZE]
1119	FMOV	t1, c1
1120	add	J, -1, J
1121	STF	a6, [Y2 + 1 * SIZE]
1122	FMOV	t1, c2
1123	cmp	J, 0
1124	add	Y2, INCY, Y2
1125	STF	a7, [Y2 + 0 * SIZE]
1126	FMOV	t1, c3
1127	STF	a8, [Y2 + 1 * SIZE]
1128	FMOV	t1, c4
1129	add	Y2, INCY, Y2
1130
/* Next column group while J > 0 (flags from the cmp above).            */
1131	FMOV	t1, c5
1132	bg	%icc, .LL110
1133	FMOV	t1, c6
1134
/* Two-column section, executed when bit 1 of N is set (N & 2).          */
/* Zeroes t1..t4 and c1..c8, sets A1 = A, A2 = A + LDA, advances A by    */
/* 2 * LDA, I = MIN_M / 4 and X1 = XP. Instructions in branch delay      */
/* slots (FMOV t1, c2 and mov XP, X1) execute on both paths.             */
1135.LL200:
1136	FCLR(0)
1137
1138	and	N, 2, J
1139	cmp	J, 0
1140	FMOV	t1, c1
1141	ble	%icc, .LL300
1142
1143	FMOV	t1, c2
1144	sra	MIN_M, 2, I
1145	FMOV	t1, t2
1146	add	A,  LDA, A2
1147	FMOV	t1, c3
1148	mov	A,  A1
1149	FMOV	t1, t3
1150	cmp	I, 0
1151	FMOV	t1, c4
1152
1153	FMOV	t1, c5
1154	FMOV	t1, c6
1155	FMOV	t1, c7
1156	FMOV	t1, c8
1157
/* The branch below tests the flags of "cmp I, 0" above; the add and    */
/* FMOV in between do not modify the condition codes.                    */
1158	add	A2, LDA, A
1159	FMOV	t1, t4
1160	ble	%icc, .LL215
1161	mov	XP, X1
1162
/* Preload two elements from each column (a1,a2,a5,a6 from A1 and       */
/* a3,a4,a7,a8 from A2) and two x elements (b1..b4).                     */
1163	LDF	[A1 + 0 * SIZE], a1
1164	LDF	[A1 + 1 * SIZE], a2
1165	LDF	[A1 + 2 * SIZE], a5
1166	LDF	[A1 + 3 * SIZE], a6
1167	add	A1, 4 * SIZE, A1
1168
1169	LDF	[A2 + 0 * SIZE], a3
1170	LDF	[A2 + 1 * SIZE], a4
1171	LDF	[A2 + 2 * SIZE], a7
1172	LDF	[A2 + 3 * SIZE], a8
1173	add	A2, 4 * SIZE, A2
1174
1175	LDF	[X1 + 0 * SIZE], b1
1176	add	I, -1, I
1177	LDF	[X1 + 1 * SIZE], b2
1178	cmp	I, 0
1179	LDF	[X1 + 2 * SIZE], b3
1180	LDF	[X1 + 3 * SIZE], b4
1181	ble	%icc, .LL212
1182	add	X1, 4 * SIZE, X1
1183
/* Two-column unrolled loop: four x elements per iteration.              */
/* a1/a2 and a5/a6 alternate as the A1-column element, a3/a4 and a7/a8   */
/* as the A2-column element; b1/b2 and b3/b4 alternate as the x element. */
/* Each FADD folds in the product written by the previous group, so      */
/* c1..c4 collect the A1-column products and c5..c8 the A2-column ones.  */
/* The loop counter is updated mid-body (add/cmp) and tested by the      */
/* closing "bg"; the last x load sits in its delay slot.                 */
1184.LL211:
1185	prefetch [A1 + PREFETCHSIZE * SIZE], 1
1186
1187	FADD	c5, t1, c5
1188	FMUL	a1, b1, t1
1189	FADDX	c6, t2, c6
1190	FMUL	a1, b2, t2
1191	LDF	[A1 + 0 * SIZE], a1
1192	FADD	c7, t3, c7
1193	FMUL	a2, b1, t3
1194	FADD	c8, t4, c8
1195	FMUL	a2, b2, t4
1196	LDF	[A1 + 1 * SIZE], a2
1197
1198	FADD	c1, t1, c1
1199	FMUL	a3, b1, t1
1200	FADDX	c2, t2, c2
1201	FMUL	a3, b2, t2
1202	LDF	[A2 + 0 * SIZE], a3
1203	FADD	c3, t3, c3
1204	FMUL	a4, b1, t3
1205	LDF	[X1 + 0 * SIZE], b1
1206	FADD	c4, t4, c4
1207	FMUL	a4, b2, t4
1208	LDF	[A2 + 1 * SIZE], a4
1209
1210	FADD	c5, t1, c5
1211	LDF	[X1 + 1 * SIZE], b2
1212	FMUL	a5, b3, t1
1213	FADDX	c6, t2, c6
1214	FMUL	a5, b4, t2
1215	LDF	[A1 + 2 * SIZE], a5
1216	FADD	c7, t3, c7
1217	add	I, -1, I
1218	FMUL	a6, b3, t3
1219	FADD	c8, t4, c8
1220	cmp	I, 0
1221	FMUL	a6, b4, t4
1222	LDF	[A1 + 3 * SIZE], a6
1223
1224	FADD	c1, t1, c1
1225	FMUL	a7, b3, t1
1226	FADDX	c2, t2, c2
1227	FMUL	a7, b4, t2
1228	LDF	[A2 + 2 * SIZE], a7
1229	FADD	c3, t3, c3
1230	FMUL	a8, b3, t3
1231	LDF	[X1 + 2 * SIZE], b3
1232	FADD	c4, t4, c4
1233	FMUL	a8, b4, t4
1234	LDF	[A2 + 3 * SIZE], a8
1235
1236	prefetch [A2 + PREFETCHSIZE * SIZE], 1
1237	FADD	c5, t1, c5
1238	LDF	[X1 + 3 * SIZE], b4
1239	FMUL	a1, b1, t1
1240	FADDX	c6, t2, c6
1241	FMUL	a1, b2, t2
1242	LDF	[A1 + 4 * SIZE], a1
1243	FADD	c7, t3, c7
1244	FMUL	a2, b1, t3
1245	FADD	c8, t4, c8
1246	FMUL	a2, b2, t4
1247	LDF	[A1 + 5 * SIZE], a2
1248
1249	FADD	c1, t1, c1
1250	FMUL	a3, b1, t1
1251	FADDX	c2, t2, c2
1252	FMUL	a3, b2, t2
1253	LDF	[A2 + 4 * SIZE], a3
1254	FADD	c3, t3, c3
1255	FMUL	a4, b1, t3
1256	LDF	[X1 + 4 * SIZE], b1
1257	FADD	c4, t4, c4
1258	FMUL	a4, b2, t4
1259	LDF	[A2 + 5 * SIZE], a4
1260
1261	FADD	c5, t1, c5
1262	LDF	[X1 + 5 * SIZE], b2
1263	FMUL	a5, b3, t1
1264	FADDX	c6, t2, c6
1265	FMUL	a5, b4, t2
1266	LDF	[A1 + 6 * SIZE], a5
1267	FADD	c7, t3, c7
1268	FMUL	a6, b3, t3
1269	FADD	c8, t4, c8
1270	FMUL	a6, b4, t4
1271	LDF	[A1 + 7 * SIZE], a6
1272	add	A1, 8 * SIZE, A1
1273
1274	FADD	c1, t1, c1
1275	FMUL	a7, b3, t1
1276	FADDX	c2, t2, c2
1277	FMUL	a7, b4, t2
1278	LDF	[A2 + 6 * SIZE], a7
1279	FADD	c3, t3, c3
1280	FMUL	a8, b3, t3
1281	LDF	[X1 + 6 * SIZE], b3
1282	FADD	c4, t4, c4
1283	add	X1, 8 * SIZE, X1
1284	FMUL	a8, b4, t4
1285	LDF	[A2 + 7 * SIZE], a8
1286	add	A2, 8 * SIZE, A2
1287	bg,pn	%icc, .LL211
1288	LDF	[X1 - 1 * SIZE], b4
1289
/* Drain of the two-column unrolled loop: handles the last group of four */
/* x elements. The first half still loads the remaining two elements per */
/* column and two x elements (advancing A1, A2 and X1 by 4 * SIZE); the  */
/* second half is arithmetic only. The last four products remain in      */
/* t1..t4 for the code that follows.                                     */
1290.LL212:
1291	FADD	c5, t1, c5
1292	FMUL	a1, b1, t1
1293	FADDX	c6, t2, c6
1294	FMUL	a1, b2, t2
1295	LDF	[A1 + 0 * SIZE], a1
1296	FADD	c7, t3, c7
1297	FMUL	a2, b1, t3
1298	FADD	c8, t4, c8
1299	FMUL	a2, b2, t4
1300	LDF	[A1 + 1 * SIZE], a2
1301
1302	FADD	c1, t1, c1
1303	FMUL	a3, b1, t1
1304	FADDX	c2, t2, c2
1305	FMUL	a3, b2, t2
1306	LDF	[A2 + 0 * SIZE], a3
1307	FADD	c3, t3, c3
1308	FMUL	a4, b1, t3
1309	LDF	[X1 + 0 * SIZE], b1
1310	FADD	c4, t4, c4
1311	FMUL	a4, b2, t4
1312	LDF	[A2 + 1 * SIZE], a4
1313
1314	FADD	c5, t1, c5
1315	LDF	[X1 + 1 * SIZE], b2
1316	FMUL	a5, b3, t1
1317	FADDX	c6, t2, c6
1318	FMUL	a5, b4, t2
1319	LDF	[A1 + 2 * SIZE], a5
1320	FADD	c7, t3, c7
1321	FMUL	a6, b3, t3
1322	FADD	c8, t4, c8
1323	FMUL	a6, b4, t4
1324	LDF	[A1 + 3 * SIZE], a6
1325	add	A1, 4 * SIZE, A1
1326
1327	FADD	c1, t1, c1
1328	FMUL	a7, b3, t1
1329	FADDX	c2, t2, c2
1330	FMUL	a7, b4, t2
1331	LDF	[A2 + 2 * SIZE], a7
1332	FADD	c3, t3, c3
1333	FMUL	a8, b3, t3
1334	LDF	[X1 + 2 * SIZE], b3
1335	FADD	c4, t4, c4
1336	FMUL	a8, b4, t4
1337	LDF	[A2 + 3 * SIZE], a8
1338	add	A2, 4 * SIZE, A2
1339
/* Last x load; from here on only arithmetic on already-loaded data.    */
1340	FADD	c5, t1, c5
1341	LDF	[X1 + 3 * SIZE], b4
1342	add	X1, 4 * SIZE, X1
1343	FMUL	a1, b1, t1
1344	FADDX	c6, t2, c6
1345	FMUL	a1, b2, t2
1346	FADD	c7, t3, c7
1347	FMUL	a2, b1, t3
1348	FADD	c8, t4, c8
1349	FMUL	a2, b2, t4
1350
1351	FADD	c1, t1, c1
1352	FMUL	a3, b1, t1
1353	FADDX	c2, t2, c2
1354	FMUL	a3, b2, t2
1355	FADD	c3, t3, c3
1356	FMUL	a4, b1, t3
1357	FADD	c4, t4, c4
1358	FMUL	a4, b2, t4
1359
1360	FADD	c5, t1, c5
1361	FMUL	a5, b3, t1
1362	FADDX	c6, t2, c6
1363	FMUL	a5, b4, t2
1364	FADD	c7, t3, c7
1365	FMUL	a6, b3, t3
1366	FADD	c8, t4, c8
1367	FMUL	a6, b4, t4
1368
1369	FADD	c1, t1, c1
1370	FMUL	a7, b3, t1
1371	FADDX	c2, t2, c2
1372	FMUL	a7, b4, t2
1373	FADD	c3, t3, c3
1374	FMUL	a8, b3, t3
1375	FADD	c4, t4, c4
1376	FMUL	a8, b4, t4
1377
/* Row remainder for the two-column case: I = MIN_M & 3.                */
/* b3 / b4 receive ALPHA_R / ALPHA_I (b4 in the delay slot, executed on  */
/* both paths) and Y2 = Y1. When I == 0 control goes to .LL219.          */
1378.LL215:
1379	andcc	MIN_M, 3, I
1380	LDF	ALPHA_R, b3
1381	mov	Y1, Y2
1382	ble	%icc, .LL219
1383	LDF	ALPHA_I, b4
1384
/* Preload one element per column and one x element; if that was the    */
/* only remaining element skip the loop and drain at .LL217.             */
1385	LDF	[A1 + 0 * SIZE], a1
1386	add	I, -1, I
1387	LDF	[A1 + 1 * SIZE], a2
1388	cmp	I, 0
1389	add	A1, 2 * SIZE, A1
1390
1391	LDF	[A2 + 0 * SIZE], a3
1392	LDF	[A2 + 1 * SIZE], a4
1393	add	A2, 2 * SIZE, A2
1394
1395	LDF	[X1 + 0 * SIZE], b1
1396	LDF	[X1 + 1 * SIZE], b2
1397	ble	%icc, .LL217
1398	add	X1, 2 * SIZE, X1
1399
/* One x element per iteration, with the next element's loads           */
/* interleaved; the second x value is loaded in the branch delay slot.   */
1400.LL216:
1401	FADD	c5, t1, c5
1402	FMUL	a1, b1, t1
1403	FADDX	c6, t2, c6
1404	FMUL	a1, b2, t2
1405	LDF	[A1 + 0 * SIZE], a1
1406	FADD	c7, t3, c7
1407	add	I, -1, I
1408	FMUL	a2, b1, t3
1409	FADD	c8, t4, c8
1410	cmp	I, 0
1411	FMUL	a2, b2, t4
1412	LDF	[A1 + 1 * SIZE], a2
1413	add	A1, 2 * SIZE, A1
1414
1415	FADD	c1, t1, c1
1416	FMUL	a3, b1, t1
1417	FADDX	c2, t2, c2
1418	FMUL	a3, b2, t2
1419	LDF	[A2 + 0 * SIZE], a3
1420	FADD	c3, t3, c3
1421	FMUL	a4, b1, t3
1422	LDF	[X1 + 0 * SIZE], b1
1423	FADD	c4, t4, c4
1424	add	X1, 2 * SIZE, X1
1425	FMUL	a4, b2, t4
1426	LDF	[A2 + 1 * SIZE], a4
1427	add	A2, 2 * SIZE, A2
1428	bg,pn	%icc, .LL216
1429	LDF	[X1 - 1 * SIZE], b2
1430
/* Drain: multiply the last preloaded element, no further loads.        */
1431.LL217:
1432	FADD	c5, t1, c5
1433	FMUL	a1, b1, t1
1434	FADDX	c6, t2, c6
1435	FMUL	a1, b2, t2
1436	FADD	c7, t3, c7
1437	FMUL	a2, b1, t3
1438	FADD	c8, t4, c8
1439	FMUL	a2, b2, t4
1440
1441	FADD	c1, t1, c1
1442	FMUL	a3, b1, t1
1443	FADDX	c2, t2, c2
1444	FMUL	a3, b2, t2
1445	FADD	c3, t3, c3
1446	FMUL	a4, b1, t3
1447	FADD	c4, t4, c4
1448	FMUL	a4, b2, t4
1449
/* Finish the two-column group: fold the pending products t1..t4 into    */
/* c5..c8, load two y elements via Y1 (stepping by INCY), combine the    */
/* partial sums with CONJ / XCONJ dependent signs, multiply by alpha     */
/* (b3 = ALPHA_R, b4 = ALPHA_I), add to y and store via Y2.              */
/* Column sums live in (c1,c2) and (c5,c6). Y2 is not advanced after the */
/* second store; Y1 already points past both elements.                   */
1450.LL219:
1451	FADD	c5, t1, c5
1452	LDF	[Y1 + 0 * SIZE], a1
1453	FADDX	c6, t2, c6
1454	LDF	[Y1 + 1 * SIZE] ,a2
1455	add	Y1, INCY, Y1
1456	FADD	c7, t3, c7
1457	LDF	[Y1 + 0 * SIZE], a3
1458	FADD	c8, t4, c8
1459	LDF	[Y1 + 1 * SIZE] ,a4
1460	add	Y1, INCY, Y1
1461
1462#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
1463	FSUB	c1, c4, c1
1464	FSUB	c5, c8, c5
1465#else
1466	FADD	c1, c4, c1
1467	FADD	c5, c8, c5
1468#endif
1469
1470#ifndef CONJ
1471	FADD	c2, c3, c2
1472	FADD	c6, c7, c6
1473#else
1474	FSUB	c2, c3, c2
1475	FSUB	c6, c7, c6
1476#endif
1477
/* Complex multiply by alpha:                                           */
/*   c1 = b3*c1 - b4*c2,  c2 = b3*c2 + b4*c1  (old c1, c2); same for    */
/*   (c5, c6).                                                           */
1478	FMUL	b3, c1, c3
1479	FMUL	b4, c1, c4
1480	FMUL	b4, c2, c1
1481	FMUL	b3, c2, c2
1482
1483	FMUL	b3, c5, c7
1484	FMUL	b4, c5, c8
1485	FMUL	b4, c6, c5
1486	FMUL	b3, c6, c6
1487
1488	FSUB	c3, c1, c1
1489	FADD	c2, c4, c2
1490	FSUB	c7, c5, c5
1491	FADD	c6, c8, c6
1492
1493	FADD	a1, c1, a1
1494	FADD	a2, c2, a2
1495	FADD	a3, c5, a3
1496	FADD	a4, c6, a4
1497
1498	STF	a1, [Y2 + 0 * SIZE]
1499	STF	a2, [Y2 + 1 * SIZE]
1500	add	Y2, INCY, Y2
1501	STF	a3, [Y2 + 0 * SIZE]
1502	STF	a4, [Y2 + 1 * SIZE]
1503
/* Single-column section, executed when bit 0 of N is set (N & 1).       */
/* FCLR(0) and the delay-slot FMOV zero t1 and c1 on both paths.         */
1504.LL300:
1505	andcc	N, 1, J
1506	FCLR(0)
1507	ble	%icc, .LL400
1508	FMOV	t1, c1
1509
/* Zero c2..c4 and t2..t4, set A1 = A, advance A by LDA, I = MIN_M / 4, */
/* X1 = XP (delay slot, executed on both paths).                         */
1510.LL310:
1511	sra	MIN_M, 2, I
1512	FMOV	t1, c2
1513	FMOV	t1, c3
1514	FMOV	t1, c4
1515	mov	A, A1
1516	FMOV	t1, t2
1517	add	A, LDA, A
1518	FMOV	t1, t3
1519	cmp	I, 0
1520	FMOV	t1, t4
1521	ble	%icc, .LL315
1522	mov	XP, X1
1523
/* Preload four column elements into a1..a8 and four x elements into    */
/* c9..c16 (these accumulator registers are reused as x operands here).  */
1524	LDF	[A1 + 0 * SIZE], a1
1525	LDF	[A1 + 1 * SIZE], a2
1526	LDF	[A1 + 2 * SIZE], a3
1527	LDF	[A1 + 3 * SIZE], a4
1528	LDF	[A1 + 4 * SIZE], a5
1529	LDF	[A1 + 5 * SIZE], a6
1530	LDF	[A1 + 6 * SIZE], a7
1531	LDF	[A1 + 7 * SIZE], a8
1532	add	A1, 8 * SIZE, A1
1533
1534	LDF	[X1 + 0 * SIZE], c9
1535	add	I, -1, I
1536	LDF	[X1 + 1 * SIZE], c10
1537	cmp	I, 0
1538	LDF	[X1 + 2 * SIZE], c11
1539	LDF	[X1 + 3 * SIZE], c12
1540	LDF	[X1 + 4 * SIZE], c13
1541	LDF	[X1 + 5 * SIZE], c14
1542	LDF	[X1 + 6 * SIZE], c15
1543	LDF	[X1 + 7 * SIZE], c16
1544	ble	%icc, .LL312
1545	add	X1, 8 * SIZE, X1
1546
/* Single-column unrolled loop: four x elements per iteration.           */
/* (a1,a2)..(a7,a8) hold four consecutive column elements and            */
/* (c9,c10)..(c15,c16) the matching x elements; every operand is         */
/* reloaded for the next iteration right after its last use. Partial     */
/* sums accumulate in c1..c4 with the product-then-add pipelining        */
/* through t1..t4. The loop counter is updated mid-body and tested by    */
/* the closing "bg"; the last x load sits in its delay slot.             */
1547.LL311:
1548	prefetch [A1 + PREFETCHSIZE * SIZE], 1
1549
1550	FADD	c1, t1, c1
1551	FMUL	a1, c9,  t1
1552	FADDX	c2, t2, c2
1553	FMUL	a1, c10, t2
1554	LDF	[A1 + 0 * SIZE], a1
1555	FADD	c3, t3, c3
1556	FMUL	a2, c9,  t3
1557	LDF	[X1 + 0 * SIZE], c9
1558	FADD	c4, t4, c4
1559	FMUL	a2, c10, t4
1560	LDF	[A1 + 1 * SIZE], a2
1561	LDF	[X1 + 1 * SIZE], c10
1562
1563	FADD	c1, t1, c1
1564	FMUL	a3, c11, t1
1565	FADDX	c2, t2, c2
1566	FMUL	a3, c12, t2
1567	LDF	[A1 + 2 * SIZE], a3
1568	FADD	c3, t3, c3
1569	add	I, -1, I
1570	FMUL	a4, c11, t3
1571	LDF	[X1 + 2 * SIZE], c11
1572	FADD	c4, t4, c4
1573	cmp	I, 0
1574	FMUL	a4, c12, t4
1575	LDF	[A1 + 3 * SIZE], a4
1576	LDF	[X1 + 3 * SIZE], c12
1577
1578	FADD	c1, t1, c1
1579	FMUL	a5, c13, t1
1580	FADDX	c2, t2, c2
1581	FMUL	a5, c14, t2
1582	LDF	[A1 + 4 * SIZE], a5
1583	FADD	c3, t3, c3
1584	FMUL	a6, c13, t3
1585	LDF	[X1 + 4 * SIZE], c13
1586	FADD	c4, t4, c4
1587	FMUL	a6, c14, t4
1588	LDF	[A1 + 5 * SIZE], a6
1589	LDF	[X1 + 5 * SIZE], c14
1590
1591	FADD	c1, t1, c1
1592	FMUL	a7, c15, t1
1593	FADDX	c2, t2, c2
1594	FMUL	a7, c16, t2
1595	LDF	[A1 + 6 * SIZE], a7
1596
1597	FADD	c3, t3, c3
1598	FMUL	a8, c15, t3
1599	LDF	[X1 + 6 * SIZE], c15
1600	FADD	c4, t4, c4
1601	add	X1, 8 * SIZE, X1
1602	FMUL	a8, c16, t4
1603	LDF	[A1 + 7 * SIZE], a8
1604	add	A1, 8 * SIZE, A1
1605	bg,pn	%icc, .LL311
1606	LDF	[X1 - 1 * SIZE], c16
1607
/* Drain of the single-column unrolled loop: multiplies the four         */
/* already-loaded element pairs without issuing further loads. The last  */
/* products stay in t1..t4 for the code that follows.                    */
1608.LL312:
1609	FADD	c1, t1, c1
1610	FMUL	a1, c9,  t1
1611	FADDX	c2, t2, c2
1612	FMUL	a1, c10, t2
1613	FADD	c3, t3, c3
1614	FMUL	a2, c9,  t3
1615	FADD	c4, t4, c4
1616	FMUL	a2, c10, t4
1617
1618	FADD	c1, t1, c1
1619	FMUL	a3, c11, t1
1620	FADDX	c2, t2, c2
1621	FMUL	a3, c12, t2
1622	FADD	c3, t3, c3
1623	FMUL	a4, c11, t3
1624	FADD	c4, t4, c4
1625	FMUL	a4, c12, t4
1626
1627	FADD	c1, t1, c1
1628	FMUL	a5, c13, t1
1629	FADDX	c2, t2, c2
1630	FMUL	a5, c14, t2
1631	FADD	c3, t3, c3
1632	FMUL	a6, c13, t3
1633	FADD	c4, t4, c4
1634	FMUL	a6, c14, t4
1635
1636	FADD	c1, t1, c1
1637	FMUL	a7, c15, t1
1638	FADDX	c2, t2, c2
1639	FMUL	a7, c16, t2
1640	FADD	c3, t3, c3
1641	FMUL	a8, c15, t3
1642	FADD	c4, t4, c4
1643	FMUL	a8, c16, t4
1644
/* Row remainder for the single-column case: I = MIN_M & 3.             */
/* b3 / b4 receive ALPHA_R / ALPHA_I (b4 in the delay slot, executed on  */
/* both paths) and Y2 = Y1. When I == 0 control goes to .LL319.          */
1645.LL315:
1646	andcc	MIN_M, 3, I
1647	LDF	ALPHA_R, b3
1648	mov	Y1, Y2
1649	ble	%icc, .LL319
1650	LDF	ALPHA_I, b4
1651
/* Preload one column element and one x element; skip the loop if it    */
/* was the only one left.                                                */
1652	LDF	[A1 + 0 * SIZE], a1
1653	add	I, -1, I
1654	LDF	[A1 + 1 * SIZE], a2
1655	add	A1, 2 * SIZE, A1
1656	LDF	[X1 + 0 * SIZE], b1
1657	cmp	I, 0
1658	LDF	[X1 + 1 * SIZE], b2
1659	ble	%icc, .LL317
1660	add	X1, 2 * SIZE, X1
1661
/* One element per iteration with the next element's loads interleaved. */
1662.LL316:
1663	FADD	c1, t1, c1
1664	add	I, -1, I
1665	FMUL	a1, b1, t1
1666	FADDX	c2, t2, c2
1667	FMUL	a1, b2, t2
1668	LDF	[A1 + 0 * SIZE], a1
1669	FADD	c3, t3, c3
1670	cmp	I, 0
1671	FMUL	a2, b1, t3
1672	LDF	[X1 + 0 * SIZE], b1
1673	FADD	c4, t4, c4
1674	add	X1, 2 * SIZE, X1
1675	FMUL	a2, b2, t4
1676	LDF	[A1 + 1 * SIZE], a2
1677	add	A1, 2 * SIZE, A1
1678
1679	bg,pn	%icc, .LL316
1680	LDF	[X1 - 1 * SIZE], b2
1681
/* Drain: multiply the last preloaded element, no further loads.        */
1682.LL317:
1683	FADD	c1, t1, c1
1684	FMUL	a1, b1, t1
1685	FADDX	c2, t2, c2
1686	FMUL	a1, b2, t2
1687	FADD	c3, t3, c3
1688	FMUL	a2, b1, t3
1689	FADD	c4, t4, c4
1690	FMUL	a2, b2, t4
1691
/* Finish the single column: fold the pending products t1..t4 into       */
/* c1..c4, load one y element via Y1, combine the partial sums with      */
/* CONJ / XCONJ dependent signs, multiply by alpha (b3 = ALPHA_R,        */
/* b4 = ALPHA_I), add to y and store via Y2.                             */
1692.LL319:
1693	FADD	c1, t1, c1
1694	LDF	[Y1 + 0 * SIZE], a1
1695	FADDX	c2, t2, c2
1696	LDF	[Y1 + 1 * SIZE] ,a2
1697	add	Y1, INCY, Y1
1698	FADD	c3, t3, c3
1699	FADD	c4, t4, c4
1700
1701#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
1702	FSUB	c1, c4, c1
1703#else
1704	FADD	c1, c4, c1
1705#endif
1706
1707#ifndef CONJ
1708	FADD	c2, c3, c2
1709#else
1710	FSUB	c2, c3, c2
1711#endif
1712
/* c1 = b3*c1 - b4*c2,  c2 = b3*c2 + b4*c1  (using the old c1, c2).     */
1713	FMUL	b3, c1, c3
1714	FMUL	b4, c1, c4
1715	FMUL	b4, c2, c1
1716	FMUL	b3, c2, c2
1717
1718	FSUB	c3, c1, c1
1719	FADD	c2, c4, c2
1720	FADD	a1, c1, a1
1721	FADD	a2, c2, a2
1722
1723	STF	a1, [Y2 + 0 * SIZE]
1724	STF	a2, [Y2 + 1 * SIZE]
1725
/* End of one outer pass: IS += P and, while IS < M, start the next pass */
/* at .LL10. The delay-slot add (executed on both paths) applies PNLDA   */
/* to A; since A was advanced by LDA per processed column during the     */
/* pass, this leaves A at its pass-start value plus P << ZBASE_SHIFT.    */
1726.LL400:
1727	mov	P, I
1728	add	IS, I, IS
1729	cmp	IS, M
1730	bl	%icc, .LL10
1731	add	A, PNLDA, A
1732
/* Return to the caller; the delay slot clears %o0 (presumably the      */
/* function's return value of 0 - TODO confirm against the C prototype). */
/* EPILOGUE is a macro defined outside this file.                        */
1733.LL999:
1734	return	%i7 + 8
1735	clr	%o0
1736
1737	EPILOGUE
1738