1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Prefetch distance in elements; every use below multiplies it by SIZE */
/* when forming the PRE1/PRE2 prefetch addresses.                       */
42#define PREFETCHSIZE 64 * 8
43
/* Inputs as read by the code below:                                    */
/*   N     element count (tested against zero at entry)                 */
/*   X, Y  running addresses of the two vectors; advanced by the        */
/*         post-incrementing loads                                      */
/*   INCX, INCY  strides; shifted left by BASE_SHIFT at entry so they   */
/*         can be used directly as address increments                   */
44#define N	r32
45#define X	r36
46#define INCX	r37
47#define Y	r38
48#define INCY	r39
49
/* Addresses handed to lfetch, kept ahead of X (PRE1) and Y (PRE2). */
50#define PRE1	r2
51#define PRE2	r3
52
/* I   loop trip count minus one, copied into ar.lc                     */
/* J   remainder of N (N & 15 or N & 31 depending on the path)          */
/* Y1, Y2  store pointers into the Y vector                             */
/* X1, X2  not referenced in the visible part of this file              */
/*         (NOTE(review): possibly unused - confirm against the rest)   */
/* INCX16, INCY16  16 * INCX and 16 * INCY (prefetch strides in .L112)  */
/* YYY  copy of Y taken before the peeled element is loaded; the peeled */
/*      result is stored through it                                     */
/* YY   extra store pointer used by the .L35/.L45 remainder code        */
/* XA, XB  scratch: (address >> 2) & 0x3f of X and Y, then their        */
/*      difference, used to choose a loop variant                       */
/* PR   saved copy of the predicate registers                           */
/* ARLC saved copy of ar.lc                                             */
53#define I	r14
54#define J	r15
55#define Y1	r16
56#define Y2	r17
57#define X1	r18
58#define X2	r19
59#define INCX16	r20
60#define INCY16	r21
61#define YYY	r25
62#define YY	r27
63#define XA	r28
64#define XB	r29
65#define PR	r30
66#define ARLC	r31
67
/* ALPHA is the scalar multiplier; ALPHA_P receives fpack(f8, f8) and   */
/* is the multiplier operand of the fpma instructions.                  */
68#define ALPHA	f8
69#define ALPHA_P	f9
70
/* Entry section: validates the arguments, peels one leading element     */
/* when bit BASE_SHIFT of the X address is set, then selects one of the  */
/* loop variants (.L12, .L20, .L30, .L100) and falls into the .L12 loop  */
/* setup when none of the branches is taken.                             */
71	PROLOGUE
72	.prologue
73	PROFCODE
74
/* Scale INCX by 2^BASE_SHIFT (it is used below as a load post-increment), */
/* save ar.lc, and set p10 when bit BASE_SHIFT of the X address is set.    */
75	{ .mii
76	shladd	INCX = INCX, BASE_SHIFT, r0
77	.save ar.lc, ARLC
78	mov	ARLC = ar.lc
79	tbit.nz	p10, p0 = X, BASE_SHIFT
80	}
/* p6 = !(0 < N): return at once when N <= 0.                            */
/* p7 = (ALPHA == 0); it is acted on in the next bundle pair.            */
81	{ .mfb
82	cmp.lt	p0, p6 = r0, N
83	fcmp.eq	p7, p0 = ALPHA, f0
84	(p6) br.ret.sptk.many b0
85	}
86	;;
87	.body
/* Peel: under p10 load one X element; scale INCY; save the predicates. */
88	{ .mmi
89	(p10) LDFD	f32 = [X], INCX
90	shladd	INCY = INCY, BASE_SHIFT, r0
91	mov	PR = pr
92	}
/* Under p10 the count drops by one. YYY keeps the Y address from before */
/* the peeled load advances it. Return when ALPHA compared equal to f0.  */
93	{ .mib
94	(p10) adds N = -1, N
95	mov  YYY = Y
96	(p7) br.ret.sptk.many b0
97	}
98	;;
/* Peeled Y load; p13 = (INCX != SIZE); XA = X >> 2.                     */
99	{ .mmi
100	(p10) LDFD	f33 = [Y], INCY
101	cmp.ne	p13, p0 = SIZE, INCX
102	shr    XA = X, 2
103	}
/* INCX16 = 16 * INCX, INCY16 = 16 * INCY.                               */
104	{ .mmi
105	shladd	INCX16 = INCX, 4, r0
106	shladd	INCY16 = INCY, 4, r0
107	nop.i 0
108	}
109	;;
/* Y1 = current Y (after any peeled load); p11 = bit BASE_SHIFT of the   */
/* Y address; XB = Y >> 2.                                               */
110	{ .mii
111	mov	Y1 = Y
112	tbit.nz	p11, p0 = Y, BASE_SHIFT
113	shr    XB = Y, 2
114	}
115	;;
/* Keep the low 6 bits of both shifted addresses. The FMA combines the   */
/* peeled X and Y elements with ALPHA (presumably ALPHA * x + y; the FMA */
/* macro comes from common.h - confirm there).                           */
116	{ .mmf
117	and	XA = 0x3f, XA
118	and	XB = 0x3f, XB
119	(p10) FMA f32 = ALPHA, f32, f33
120	}
121	;;
/* XA = XB - XA; Y2 = Y + 4 * INCY. Writing 0x10000 to pr.rot leaves p16 */
/* as the only rotating predicate that is set.                           */
122	{ .mmi
123	sub	XA = XB, XA
124	shladd	Y2 = INCY, 2, Y
125	mov	pr.rot = 0x10000
126	}
/* p14 = (INCY != SIZE). If either stride differs from SIZE, use the     */
/* strided code at .L100.                                                */
127	{ .mbb
128	cmp.ne p14, p0 = SIZE, INCY
129	(p13) br.cond.dpnt .L100
130	(p14) br.cond.dpnt .L100
131	}
132	;;
/* p14 = (XA < 0). J = N & 15 and I = N >> 4 are the values .L30 reads.  */
133	{ .mmi
134	cmp.gt	p14, p0 =  r0, XA
135	;;
136	and	J =  15, N
137	shr	I =  N, 4
138	}
/* A negative XA gets 64 added, so XA ends up in 0..63. ALPHA_P is       */
/* fpack(f8, f8). With p11 set (Y address bit), go to .L30.              */
139	{ .mfb
140	(p14) adds XA = 64, XA
141	fpack	ALPHA_P = f8, f8
142	(p11) br.cond.dpnt .L30
143	}
144	;;
/* p14 = (XA < 32), p15 = (XA > 58); epilogue count 3 for the br.ctop    */
/* loops at .L12/.L22.                                                   */
145	{ .mmi
146	cmp.gt	p14, p0 =  32, XA
147	cmp.lt	p15, p0 =  58, XA
148	mov	ar.ec = 3
149	}
/* This path consumes 32 elements per loop iteration (16 ldf8 loads of   */
/* 2 * SIZE from each vector), so J = N & 31 and I = N >> 5.             */
150	{ .mmi
151	and	J =  31, N
152	cmp.eq	p16, p0 = r0, r0
153	shr	I =  N, 5
154	}
155	;;
/* p9 = (J == 0), p7 = (I == 0) evaluated on the old I, then I = I - 1.  */
156	{ .mmi
157	cmp.eq	p9, p0  =   r0, J
158	cmp.eq	p7 ,p0  = 0, I
159	adds	I = -1, I
160	}
/* XA < 32 or XA > 58 selects the variant at .L20.                       */
161	{ .mbb
162	nop.m 0
163	(p14) br.cond.dpnt .L20
164	(p15) br.cond.dpnt .L20
165	}
166	;;
/* Setup for .L12: store the peeled result through YYY, start the X      */
/* prefetch pointer PREFETCHSIZE elements ahead, load the loop count.    */
167	{ .mmi
168	(p10) STFD [YYY] = f32
169	adds	PRE1 = PREFETCHSIZE * SIZE, X
170	mov	ar.lc = I
171	}
/* Y prefetch pointer; p11 = bit 4 of N (a 16-element remainder group);  */
/* skip the loop entirely when I was zero.                               */
172	{ .mib
173	adds	PRE2 = (PREFETCHSIZE - 24) * SIZE, Y
174	tbit.z	p0, p11 = N, 4
175	(p7) br.cond.dpnt  .L15
176	}
177	;;
178	.align 32
179
180.L12:
181/* 0 */
182	{ .mmf
183	(p18) stf8	[Y1] = f6,   2 * SIZE
184	(p16) lfetch.nt1 [PRE1], 32 * SIZE
185	(p18) fpma	f12  = ALPHA_P, f46, f94
186	}
187	{ .mmi
188	(p16) ldf8	f32 = [X], 2 * SIZE
189	(p16) ldf8	f80 = [Y], 2 * SIZE
190	}
191	;;
192/* 1 */
193	{ .mmf
194	(p18) stf8	[Y1] = f7,  2 * SIZE
195	(p16) lfetch.excl.nt1	[PRE2], 32 * SIZE
196	(p18) fpma	f13  = ALPHA_P, f49, f97
197	}
198	{ .mmi
199	(p16) ldf8	f35 = [X], 2 * SIZE
200	(p16) ldf8	f83 = [Y], 2 * SIZE
201	}
202	;;
203/* 2 */
204	{ .mmf
205	(p18) stf8	[Y1] = f10,  2 * SIZE
206	(p18) fpma	f14  = ALPHA_P, f52, f100
207	}
208	{ .mmi
209	(p16) ldf8	f38 = [X], 2 * SIZE
210	(p16) ldf8	f86 = [Y], 2 * SIZE
211	}
212	;;
213/* 3 */
214	{ .mmf
215	(p18) stf8	[Y1] = f11, 2 * SIZE
216	(p18) fpma	f15  = ALPHA_P, f55, f103
217	}
218	{ .mmi
219	(p16) ldf8	f41 = [X], 2 * SIZE
220	(p16) ldf8	f89 = [Y], 2 * SIZE
221	}
222	;;
223/* 4 */
224	{ .mmf
225	(p18) stf8	[Y1] = f12,  2 * SIZE
226	(p18) fpma	f6   = ALPHA_P, f58, f106
227	}
228	{ .mmi
229	(p16) ldf8	f44  = [X], 2 * SIZE
230	(p16) ldf8	f92  = [Y], 2 * SIZE
231	}
232	;;
233/* 5 */
234	{ .mmf
235	(p18) stf8	[Y1] = f13,  2 * SIZE
236	(p18) fpma	f7   = ALPHA_P, f61, f109
237	}
238	{ .mmi
239	(p16) ldf8	f47  = [X], 2 * SIZE
240	(p16) ldf8	f95  = [Y], 2 * SIZE
241	}
242	;;
243/* 6 */
244	{ .mmf
245	(p18) stf8	[Y1] = f14,  2 * SIZE
246	(p18) fpma	f10  = ALPHA_P, f64, f112
247	}
248	{ .mmi
249	(p16) ldf8	f50  = [X], 2 * SIZE
250	(p16) ldf8	f98  = [Y], 2 * SIZE
251	}
252	;;
253/* 7 */
254	{ .mmf
255	(p18) stf8	[Y1] = f15, 2 * SIZE
256	(p18) fpma	f11  = ALPHA_P, f67, f115
257	}
258	{ .mmi
259	(p16) ldf8	f53  = [X], 2 * SIZE
260	(p16) ldf8	f101 = [Y], 2 * SIZE
261	}
262	;;
263/* 8 */
264	{ .mmf
265	(p18) stf8	[Y1] = f6,   2 * SIZE
266	(p18) fpma	f12  = ALPHA_P, f70, f118
267	}
268	{ .mmi
269	(p16) ldf8	f56 = [X], 2 * SIZE
270	(p16) ldf8	f104 = [Y], 2 * SIZE
271	}
272	;;
273/* 9 */
274	{ .mmf
275	(p18) stf8	[Y1] = f7,  2 * SIZE
276	(p18) fpma	f13  = ALPHA_P, f73, f121
277	}
278	{ .mmi
279	(p16) ldf8	f59 = [X], 2 * SIZE
280	(p16) ldf8	f107 = [Y], 2 * SIZE
281	}
282	;;
283/* 10 */
284	{ .mmf
285	(p18) stf8	[Y1] = f10,  2 * SIZE
286	(p18) fpma	f14 = ALPHA_P, f76, f124
287	}
288	{ .mmi
289	(p16) ldf8	f62 = [X], 2 * SIZE
290	(p16) ldf8	f110 = [Y], 2 * SIZE
291	}
292	;;
293/* 11 */
294	{ .mmf
295	(p18) stf8	[Y1] = f11, 2 * SIZE
296	(p18) fpma	f15  = ALPHA_P, f79, f127
297	}
298	{ .mmi
299	(p16) ldf8	f65 = [X], 2 * SIZE
300	(p16) ldf8	f113 = [Y], 2 * SIZE
301	}
302	;;
303/* 12 */
304	{ .mmf
305	(p18) stf8	[Y1] = f12,  2 * SIZE
306	(p17) fpma	f6   = ALPHA_P, f33, f81
307	}
308	{ .mmi
309	(p16) ldf8	f68  = [X], 2 * SIZE
310	(p16) ldf8	f116 = [Y], 2 * SIZE
311	}
312	;;
313/* 13 */
314	{ .mmf
315	(p18) stf8	[Y1] = f13,  2 * SIZE
316	(p17) fpma	f7   = ALPHA_P, f36, f84
317	}
318	{ .mmi
319	(p16) ldf8	f71  = [X], 2 * SIZE
320	(p16) ldf8	f119 = [Y], 2 * SIZE
321	}
322	;;
323/* 14 */
324	{ .mmf
325	(p18) stf8	[Y1] = f14,  2 * SIZE
326	(p17) fpma	f10  = ALPHA_P, f39, f87
327	}
328	{ .mmi
329	(p16) ldf8	f74  = [X], 2 * SIZE
330	(p16) ldf8	f122 = [Y], 2 * SIZE
331	}
332	;;
333/*15 */
334	{ .mmf
335	(p18) stf8	[Y1] = f15, 2 * SIZE
336	(p17) fpma	f11  = ALPHA_P, f42, f90
337	}
338	{ .mmb
339	(p16) ldf8	f77  = [X], 2 * SIZE
340	(p16) ldf8	f125 = [Y], 2 * SIZE
341	br.ctop.sptk.few .L12
342	}
343	;;
344	.align 32
345
/* .L15: remainder handling after the .L12 loop (also entered directly   */
/* when the loop count was zero). The remaining N & 31 elements are      */
/* processed in groups selected by the bits of N:                        */
/*   p11 (bit 4, set before entry)  8 ldf8/fpma/stf8 pairs = 16 elements */
/*   p12 (bit 3)                    4 pairs = 8 elements                 */
/*   p13 (bit 2)                    2 pairs = 4 elements                 */
/*   p14 (bit 1)                    1 pair  = 2 elements                 */
/*   p15 (bit 0)                    1 scalar element (ldfs/FMA/stfs)     */
/* Results are stored through Y1. p9 (remainder is zero) is set by the   */
/* code that precedes this section.                                      */
346.L15:
/* Restore predicates from PR. The mask -65474 has bits 1-5 and 16 and   */
/* above set, so p6-p15 keep the values this section still depends on.   */
347	{ .mmi
348	(p11) ldf8	f32 = [X], 2 * SIZE
349	(p11) ldf8	f33 = [Y], 2 * SIZE
350	mov	pr = PR, -65474
351	}
352	;;
/* Restore the caller's ar.lc. */
353	{ .mmi
354	(p11) ldf8	f34 = [X], 2 * SIZE
355	(p11) ldf8	f35 = [Y], 2 * SIZE
356	mov	ar.lc  = ARLC
357	}
358	;;
/* Nothing left to do when the remainder J was zero. */
359	{ .mmb
360	(p11) ldf8	f36 = [X], 2 * SIZE
361	(p11) ldf8	f37 = [Y], 2 * SIZE
362	(p9) br.ret.sptk.many b0
363	}
364	;;
/* Derive p12/p13/p14 from bits 3/2/1 of N while the p11 loads continue. */
365	{ .mmi
366	(p11) ldf8	f38 = [X], 2 * SIZE
367	(p11) ldf8	f39 = [Y], 2 * SIZE
368	tbit.z	p0, p12 = N, 3
369	}
370	;;
371	{ .mmi
372	(p11) ldf8	f40 = [X], 2 * SIZE
373	(p11) ldf8	f41 = [Y], 2 * SIZE
374	tbit.z	p0, p13 = N, 2
375	}
376	;;
377	{ .mmi
378	(p11) ldf8	f42 = [X], 2 * SIZE
379	(p11) ldf8	f43 = [Y], 2 * SIZE
380	tbit.z	p0, p14 = N, 1
381	}
382	;;
/* From here the p11 fpma results (f6, f7, f10-f15) are produced and     */
/* stored while the loads for the smaller groups are issued.             */
383	{ .mmf
384	(p11) ldf8	f44 = [X], 2 * SIZE
385	(p11) ldf8	f45 = [Y], 2 * SIZE
386	(p11) fpma	f6  = ALPHA_P, f32, f33
387	}
388	;;
389	{ .mmf
390	(p11) ldf8	f46 = [X], 2 * SIZE
391	(p11) ldf8	f47 = [Y], 2 * SIZE
392	(p11) fpma	f7  = ALPHA_P, f34, f35
393	}
394	;;
395	{ .mmf
396	(p12) ldf8	f48 = [X], 2 * SIZE
397	(p12) ldf8	f49 = [Y], 2 * SIZE
398	(p11) fpma	f10 = ALPHA_P, f36, f37
399	}
400	;;
/* p15 = bit 0 of N (one trailing scalar element). */
401	{ .mmi
402	(p11) stf8	[Y1] = f6,   2 * SIZE
403	nop.m 0
404	tbit.z	p0, p15 = N, 0
405	}
406	{ .mmf
407	(p12) ldf8	f50 = [X], 2 * SIZE
408	(p12) ldf8	f51 = [Y], 2 * SIZE
409	(p11) fpma	f11 = ALPHA_P, f38, f39
410	}
411	;;
412	{ .mmi
413	(p11) stf8	[Y1] = f7,   2 * SIZE
414	nop.m 0
415	nop.i 0
416	}
417	{ .mmf
418	(p12) ldf8	f52 = [X], 2 * SIZE
419	(p12) ldf8	f53 = [Y], 2 * SIZE
420	}
421	;;
422	{ .mmi
423	(p11) stf8	[Y1] = f10,  2 * SIZE
424	nop.m 0
425	nop.i 0
426	}
427	{ .mmf
428	(p12) ldf8	f54 = [X], 2 * SIZE
429	(p12) ldf8	f55 = [Y], 2 * SIZE
430	(p11) fpma	f12 = ALPHA_P, f40, f41
431	}
432	;;
433	{ .mmi
434	(p11) stf8	[Y1] = f11,  2 * SIZE
435	nop.m 0
436	nop.i 0
437	}
438	{ .mmf
439	(p13) ldf8	f56 = [X], 2 * SIZE
440	(p13) ldf8	f57 = [Y], 2 * SIZE
441	(p11) fpma	f13 = ALPHA_P, f42, f43
442	}
443	;;
444	{ .mmi
445	(p11) stf8	[Y1] = f12,  2 * SIZE
446	nop.m 0
447	nop.i 0
448	}
449	{ .mmf
450	(p13) ldf8	f58 = [X], 2 * SIZE
451	(p13) ldf8	f59 = [Y], 2 * SIZE
452	(p11) fpma	f14 = ALPHA_P, f44, f45
453	}
454	;;
455	{ .mmi
456	(p11) stf8	[Y1] = f13,  2 * SIZE
457	nop.m 0
458	nop.i 0
459	}
460	{ .mmf
461	(p14) ldf8	f60 = [X], 2 * SIZE
462	(p14) ldf8	f61 = [Y], 2 * SIZE
463	(p11) fpma	f15 = ALPHA_P, f46, f47
464	}
465	;;
466	{ .mmi
467	(p11) stf8	[Y1] = f14,  2 * SIZE
468	nop.m 0
469	nop.i 0
470	}
/* The last odd element is loaded as a single value, without             */
/* post-increment.                                                       */
471	{ .mmf
472	(p15) ldfs	f62  = [X]
473	(p15) ldfs	f63  = [Y]
474	(p12) fpma	f6  = ALPHA_P, f48, f49
475	}
476	;;
/* f6, f7 and f10-f15 are reused for the p12/p13/p14/p15 groups; each is */
/* rewritten only after its previous value has been stored.              */
477	(p12) fpma	f7  = ALPHA_P, f50, f51
478	(p12) fpma	f10 = ALPHA_P, f52, f53
479	;;
480	(p11) stf8	[Y1] = f15,  2 * SIZE
481	(p12) fpma	f11 = ALPHA_P, f54, f55
482	;;
483	(p12) stf8	[Y1] = f6,   2 * SIZE
484	(p13) fpma	f12 = ALPHA_P, f56, f57
485	;;
486	(p12) stf8	[Y1] = f7,   2 * SIZE
487	(p13) fpma	f13 = ALPHA_P, f58, f59
488	;;
489	(p12) stf8	[Y1] = f10,  2 * SIZE
490	(p14) fpma	f14 = ALPHA_P, f60, f61
491	;;
492	(p12) stf8	[Y1] = f11,  2 * SIZE
/* Scalar FMA with the unpacked ALPHA for the single trailing element. */
493	(p15) FMA	f15 = ALPHA,   f62, f63
494	;;
495	(p13) stf8	[Y1] = f12,  2 * SIZE
496	;;
497	(p13) stf8	[Y1] = f13,  2 * SIZE
498	;;
499	(p14) stf8	[Y1] = f14,  2 * SIZE
500	;;
501	(p15) stfs	[Y1] = f15
502	br.ret.sptk.many b0
503	;;
504	.align 32
505
506/* X is aligned; case 2 */
507
508.L20:
509	{ .mmi
510	(p10) STFD [YYY] = f32
511	adds	PRE1 = (PREFETCHSIZE - 28) *  SIZE, X
512	mov	ar.lc = I
513	}
514	{ .mib
515	adds	PRE2 = (PREFETCHSIZE +  4) * SIZE, Y
516	tbit.z	p0, p11 = N, 4
517	(p7) br.cond.dpnt  .L25
518	}
519	;;
520	.align 32
521
522.L22:
523/* 0 */
524	{ .mmf
525	(p18) stf8	[Y1] = f6,   2 * SIZE
526	(p16) lfetch.nt1	[PRE1], 32 * SIZE
527	(p18) fpma	f12  = ALPHA_P, f46, f94
528	}
529	{ .mmi
530	(p17) ldf8	f60  = [X], 2 * SIZE
531	(p16) ldf8	f80  = [Y], 2 * SIZE
532	}
533	;;
534/* 1 */
535	{ .mmf
536	(p18) stf8	[Y1] = f7,  2 * SIZE
537	(p16) lfetch.excl.nt1	[PRE2], 32 * SIZE
538	(p18) fpma	f13  = ALPHA_P, f49, f97
539	}
540	{ .mmi
541	(p17) ldf8	f63  = [X], 2 * SIZE
542	(p16) ldf8	f83  = [Y], 2 * SIZE
543	}
544	;;
545/* 2 */
546	{ .mmf
547	(p18) stf8	[Y1] = f10,  2 * SIZE
548	(p18) fpma	f14  = ALPHA_P, f52, f100
549	}
550	{ .mmi
551	(p17) ldf8	f66  = [X], 2 * SIZE
552	(p16) ldf8	f86  = [Y], 2 * SIZE
553	}
554	;;
555/* 3 */
556	{ .mmf
557	(p18) stf8	[Y1] = f11, 2 * SIZE
558	(p18) fpma	f15  = ALPHA_P, f55, f103
559	}
560	{ .mmi
561	(p17) ldf8	f69  = [X], 2 * SIZE
562	(p16) ldf8	f89  = [Y], 2 * SIZE
563	}
564	;;
565/* 4 */
566	{ .mmf
567	(p18) stf8	[Y1] = f12,  2 * SIZE
568	(p18) fpma	f6   = ALPHA_P, f58, f106
569	}
570	{ .mmi
571	(p17) ldf8	f72  = [X], 2 * SIZE
572	(p16) ldf8	f92  = [Y], 2 * SIZE
573	}
574	;;
575/* 5 */
576	{ .mmf
577	(p18) stf8	[Y1] = f13,  2 * SIZE
578	(p18) fpma	f7   = ALPHA_P, f61, f109
579	}
580	{ .mmi
581	(p17) ldf8	f75  = [X], 2 * SIZE
582	(p16) ldf8	f95  = [Y], 2 * SIZE
583	}
584	;;
585/* 6 */
586	{ .mmf
587	(p18) stf8	[Y1] = f14,  2 * SIZE
588	(p18) fpma	f10  = ALPHA_P, f64, f112
589	}
590	{ .mmi
591	(p17) ldf8	f78  = [X], 2 * SIZE
592	(p16) ldf8	f98  = [Y], 2 * SIZE
593	}
594	;;
595/* 7 */
596	{ .mmf
597	(p18) stf8	[Y1] = f15, 2 * SIZE
598	(p18) fpma	f11  = ALPHA_P, f67, f115
599	}
600	{ .mmi
601	(p16) ldf8	f32  = [X], 2 * SIZE
602	(p16) ldf8	f101 = [Y], 2 * SIZE
603	}
604	;;
605/* 8 */
606	{ .mmf
607	(p18) stf8	[Y1] = f6,   2 * SIZE
608	(p18) fpma	f12  = ALPHA_P, f70, f118
609	}
610	{ .mmi
611	(p16) ldf8	f35 = [X], 2 * SIZE
612	(p16) ldf8	f104 = [Y], 2 * SIZE
613	}
614	;;
615/* 9 */
616	{ .mmf
617	(p18) stf8	[Y1] = f7,  2 * SIZE
618	(p18) fpma	f13  = ALPHA_P, f73, f121
619	}
620	{ .mmi
621	(p16) ldf8	f38 = [X], 2 * SIZE
622	(p16) ldf8	f107 = [Y], 2 * SIZE
623	}
624	;;
625/* 10 */
626	{ .mmf
627	(p18) stf8	[Y1] = f10,  2 * SIZE
628	(p18) fpma	f14 = ALPHA_P, f76, f124
629	}
630	{ .mmi
631	(p16) ldf8	f41 = [X], 2 * SIZE
632	(p16) ldf8	f110 = [Y], 2 * SIZE
633	}
634	;;
635/* 11 */
636	{ .mmf
637	(p18) stf8	[Y1] = f11, 2 * SIZE
638	(p18) fpma	f15  = ALPHA_P, f79, f127
639	}
640	{ .mmi
641	(p16) ldf8	f44  = [X], 2 * SIZE
642	(p16) ldf8	f113 = [Y], 2 * SIZE
643	}
644	;;
645/* 12 */
646	{ .mmf
647	(p18) stf8	[Y1] = f12,  2 * SIZE
648	(p17) fpma	f6   = ALPHA_P, f33, f81
649	}
650	{ .mmi
651	(p16) ldf8	f47  = [X], 2 * SIZE
652	(p16) ldf8	f116 = [Y], 2 * SIZE
653	}
654	;;
655/* 13 */
656	{ .mmf
657	(p18) stf8	[Y1] = f13,  2 * SIZE
658	(p17) fpma	f7   = ALPHA_P, f36, f84
659	}
660	{ .mmi
661	(p16) ldf8	f50  = [X], 2 * SIZE
662	(p16) ldf8	f119 = [Y], 2 * SIZE
663	}
664	;;
665/* 14 */
666	{ .mmf
667	(p18) stf8	[Y1] = f14,  2 * SIZE
668	(p17) fpma	f10  = ALPHA_P, f39, f87
669	}
670	{ .mmi
671	(p16) ldf8	f53  = [X], 2 * SIZE
672	(p16) ldf8	f122 = [Y], 2 * SIZE
673	}
674	;;
675/*15 */
676	{ .mmf
677	(p18) stf8	[Y1] = f15, 2 * SIZE
678	(p17) fpma	f11  = ALPHA_P, f42, f90
679	}
680	{ .mmb
681	(p16) ldf8	f56 = [X], 2 * SIZE
682	(p16) ldf8	f125 = [Y], 2 * SIZE
683	br.ctop.sptk.few .L22
684	}
685	;;
686	.align 32
687
688.L25:
689	{ .mmi
690	(p11) ldf8	f32 = [X], 2 * SIZE
691	(p11) ldf8	f33 = [Y], 2 * SIZE
692	mov	pr = PR, -65474
693	}
694	;;
695	{ .mmi
696	(p11) ldf8	f34 = [X], 2 * SIZE
697	(p11) ldf8	f35 = [Y], 2 * SIZE
698	mov	ar.lc  = ARLC
699	}
700	;;
701	{ .mmb
702	(p11) ldf8	f36 = [X], 2 * SIZE
703	(p11) ldf8	f37 = [Y], 2 * SIZE
704	(p9) br.ret.sptk.many b0
705	}
706	;;
707	{ .mmi
708	(p11) ldf8	f38 = [X], 2 * SIZE
709	(p11) ldf8	f39 = [Y], 2 * SIZE
710	tbit.z	p0, p12 = N, 3
711	}
712	;;
713	{ .mmi
714	(p11) ldf8	f40 = [X], 2 * SIZE
715	(p11) ldf8	f41 = [Y], 2 * SIZE
716	tbit.z	p0, p13 = N, 2
717	}
718	;;
719	{ .mmi
720	(p11) ldf8	f42 = [X], 2 * SIZE
721	(p11) ldf8	f43 = [Y], 2 * SIZE
722	tbit.z	p0, p14 = N, 1
723	}
724	;;
725	{ .mmf
726	(p11) ldf8	f44 = [X], 2 * SIZE
727	(p11) ldf8	f45 = [Y], 2 * SIZE
728	(p11) fpma	f6  = ALPHA_P, f32, f33
729	}
730	;;
731	{ .mmf
732	(p11) ldf8	f46 = [X], 2 * SIZE
733	(p11) ldf8	f47 = [Y], 2 * SIZE
734	(p11) fpma	f7  = ALPHA_P, f34, f35
735	}
736	;;
737	{ .mmf
738	(p12) ldf8	f48 = [X], 2 * SIZE
739	(p12) ldf8	f49 = [Y], 2 * SIZE
740	(p11) fpma	f10 = ALPHA_P, f36, f37
741	}
742	;;
743	{ .mmi
744	(p11) stf8	[Y1] = f6,   2 * SIZE
745	nop.m 0
746	tbit.z	p0, p15 = N, 0
747	}
748	{ .mmf
749	(p12) ldf8	f50 = [X], 2 * SIZE
750	(p12) ldf8	f51 = [Y], 2 * SIZE
751	(p11) fpma	f11 = ALPHA_P, f38, f39
752	}
753	;;
754	{ .mmi
755	(p11) stf8	[Y1] = f7,   2 * SIZE
756	nop.m 0
757	nop.i 0
758	}
759	{ .mmf
760	(p12) ldf8	f52 = [X], 2 * SIZE
761	(p12) ldf8	f53 = [Y], 2 * SIZE
762	}
763	;;
764	{ .mmi
765	(p11) stf8	[Y1] = f10,  2 * SIZE
766	nop.m 0
767	nop.i 0
768	}
769	{ .mmf
770	(p12) ldf8	f54 = [X], 2 * SIZE
771	(p12) ldf8	f55 = [Y], 2 * SIZE
772	(p11) fpma	f12 = ALPHA_P, f40, f41
773	}
774	;;
775	{ .mmi
776	(p11) stf8	[Y1] = f11,  2 * SIZE
777	nop.m 0
778	nop.i 0
779	}
780	{ .mmf
781	(p13) ldf8	f56 = [X], 2 * SIZE
782	(p13) ldf8	f57 = [Y], 2 * SIZE
783	(p11) fpma	f13 = ALPHA_P, f42, f43
784	}
785	;;
786	{ .mmi
787	(p11) stf8	[Y1] = f12,  2 * SIZE
788	nop.m 0
789	nop.i 0
790	}
791	{ .mmf
792	(p13) ldf8	f58 = [X], 2 * SIZE
793	(p13) ldf8	f59 = [Y], 2 * SIZE
794	(p11) fpma	f14 = ALPHA_P, f44, f45
795	}
796	;;
797	{ .mmi
798	(p11) stf8	[Y1] = f13,  2 * SIZE
799	nop.m 0
800	nop.i 0
801	}
802	{ .mmf
803	(p14) ldf8	f60 = [X], 2 * SIZE
804	(p14) ldf8	f61 = [Y], 2 * SIZE
805	(p11) fpma	f15 = ALPHA_P, f46, f47
806	}
807	;;
808	{ .mmi
809	(p11) stf8	[Y1] = f14,  2 * SIZE
810	nop.m 0
811	nop.i 0
812	}
813	{ .mmf
814	(p15) ldfs	f62  = [X]
815	(p15) ldfs	f63  = [Y]
816	(p12) fpma	f6  = ALPHA_P, f48, f49
817	}
818	;;
819	(p12) fpma	f7  = ALPHA_P, f50, f51
820	(p12) fpma	f10 = ALPHA_P, f52, f53
821	;;
822	(p11) stf8	[Y1] = f15,  2 * SIZE
823	(p12) fpma	f11 = ALPHA_P, f54, f55
824	;;
825	(p12) stf8	[Y1] = f6,   2 * SIZE
826	(p13) fpma	f12 = ALPHA_P, f56, f57
827	;;
828	(p12) stf8	[Y1] = f7,   2 * SIZE
829	(p13) fpma	f13 = ALPHA_P, f58, f59
830	;;
831	(p12) stf8	[Y1] = f10,  2 * SIZE
832	(p14) fpma	f14 = ALPHA_P, f60, f61
833	;;
834	(p12) stf8	[Y1] = f11,  2 * SIZE
835	(p15) FMA	f15 = ALPHA,   f62, f63
836	;;
837	(p13) stf8	[Y1] = f12,  2 * SIZE
838	;;
839	(p13) stf8	[Y1] = f13,  2 * SIZE
840	;;
841	(p14) stf8	[Y1] = f14,  2 * SIZE
842	;;
843	(p15) stfs	[Y1] = f15
844	br.ret.sptk.many b0
845	;;
846	.align 32
847
/* .L30: reached when p11 is set (bit BASE_SHIFT of the Y address).      */
/* I = N >> 4 and J = N & 15 were computed before the branch here, so    */
/* the loops on this path consume 16 elements per iteration. XA (0..63)  */
/* selects between the loop at .L32 (fall through) and the one at .L40.  */
848.L30:
/* p9 = (J == 0), p7 = (I == 0); epilogue count 4 for .L32/.L42. */
849	{ .mmi
850	cmp.eq	p9, p0  =   r0, J
851	cmp.eq	p7 ,p0  = 0, I
852	mov	ar.ec = 4
853	}
/* p12 = (XA > 33); I becomes the trip count minus one for ar.lc. */
854	{ .mmi
855	cmp.lt	p12, p0 = 33, XA
856	adds	I = -1, I
857	}
858	;;
/* p14 = (XA < 15), p15 = (XA > 60). The .unc compare leaves p13 set     */
/* only when p12 holds and XA < 53, i.e. 33 < XA < 53.                   */
859	{ .mmi
860	cmp.gt	p14, p0 =  15, XA
861	cmp.lt	p15, p0 =  60, XA
862	(p12) cmp.gt.unc p13, p0 = 53, XA
863	}
/* Any of the three ranges selects the variant at .L40. */
864	{ .bbb
865	(p13) br.cond.dpnt .L40
866	(p14) br.cond.dpnt .L40
867	(p15) br.cond.dpnt .L40
868	}
869	;;
/* Setup for .L32: store the peeled result (p10) through YYY, set the    */
/* prefetch pointers and load the loop count.                            */
870	{ .mmi
871	(p10) STFD [YYY] = f32
872	adds	PRE1 = (PREFETCHSIZE +  6) * SIZE, X
873	mov	ar.lc = I
874	}
/* p12 = bit 3 of N (an 8-element remainder group); skip the loop when   */
/* the trip count was zero.                                              */
875	{ .mib
876	adds	PRE2 = (PREFETCHSIZE +  0) * SIZE, Y
877	tbit.z	p0, p12 = N, 3
878	(p7) br.cond.dpnt  .L35
879	}
880	;;
881	.align 32
882
883.L32:
884	{ .mmf
885	(p19) STFD	[Y1] = f6,   1 * SIZE
886	(p19) STFD	[Y2] = f7,   1 * SIZE
887	(p18) FMA	f6  = ALPHA, f34, f82
888	}
889	{ .mmf
890	(p16) LDFPD	f32,  f35  = [X], 2 * SIZE
891	(p16) LDFD	f80 = [Y], 1 * SIZE
892	(p18) FMA	f7  = ALPHA, f46, f94
893	}
894	;;
895	{ .mmf
896	(p19) STFD	[Y1] = f10,  1 * SIZE
897	(p19) STFD	[Y2] = f11,  1 * SIZE
898	(p18) FMA	f10 = ALPHA, f37, f85
899	}
900	{ .mmf
901	(p16) LDFPD	f38,  f41  = [X], 2 * SIZE
902	(p16) LDFPD	f83,  f86  = [Y], 2 * SIZE
903	(p18) FMA	f11 = ALPHA, f49, f97
904	}
905	;;
906	{ .mmf
907	(p19) STFD	[Y1] = f12,  1 * SIZE
908	(p19) STFD	[Y2] = f13,  1 * SIZE
909	(p18) FMA	f12 = ALPHA, f40, f88
910	}
911	{ .mmf
912	(p16) LDFPD	f44,  f47  = [X], 2 * SIZE
913	(p16) LDFPD	f89,  f92  = [Y], 2 * SIZE
914	(p18) FMA	f13 = ALPHA, f52, f100
915	}
916	;;
917	{ .mmf
918	(p19) STFD	[Y1] = f14,  5 * SIZE
919	(p19) STFD	[Y2] = f15,  5 * SIZE
920	(p18) FMA	f14 = ALPHA, f43, f91
921	}
922	{ .mmf
923	(p16) LDFPD	f50,  f53  = [X], 2 * SIZE
924	(p16) LDFPD	f95,  f98  = [Y], 2 * SIZE
925	(p18) FMA	f15 = ALPHA, f55, f103
926	}
927	;;
928	{ .mmf
929	(p18) STFD	[Y1] = f6,   1 * SIZE
930	(p18) STFD	[Y2] = f7,   1 * SIZE
931	(p18) FMA	f6  = ALPHA, f58, f106
932	}
933	{ .mmf
934	(p16) LDFPD	f56,  f59  = [X], 2 * SIZE
935	(p16) LDFPD	f101, f104 = [Y], 2 * SIZE
936	(p18) FMA	f7  = ALPHA, f70, f118
937	}
938	;;
939	{ .mmf
940	(p18) STFD	[Y1] = f10,  1 * SIZE
941	(p18) STFD	[Y2] = f11,  1 * SIZE
942	(p18) FMA	f10 = ALPHA, f61, f109
943	}
944	{ .mmf
945	(p16) LDFPD	f62,  f65  = [X], 2 * SIZE
946	(p16) LDFPD	f107, f110 = [Y], 2 * SIZE
947	(p18) FMA	f11 = ALPHA, f73, f121
948	}
949	;;
950	{ .mmf
951	(p18) STFD	[Y1] = f12,  1 * SIZE
952	(p18) STFD	[Y2] = f13,  1 * SIZE
953	(p18) FMA	f12 = ALPHA, f64, f112
954	}
955	{ .mmf
956	(p16) LDFPD	f68,  f71  = [X], 2 * SIZE
957	(p16) LDFPD	f113, f116 = [Y], 2 * SIZE
958	(p18) FMA	f13 = ALPHA, f76, f124
959	}
960	;;
961	{ .mmf
962	(p18) STFD	[Y1] = f14,  5 * SIZE
963	(p18) STFD	[Y2] = f15,  5 * SIZE
964	(p18) FMA	f14 = ALPHA, f67, f115
965	}
966	{ .mmf
967	(p16) LDFPD	f74,  f77  = [X], 2 * SIZE
968	(p16) LDFPD	f119, f122 = [Y], 2 * SIZE
969	(p18) FMA	f15 = ALPHA, f79, f127
970	}
971	;;
972	{ .mmi
973	(p16) lfetch.nt1	[PRE1], 16 * SIZE
974	(p16) lfetch.excl.nt1	[PRE2], 16 * SIZE
975	nop.i 0
976	}
977	{ .mmb
978	(p16) LDFD	f125  = [Y], 1 * SIZE
979	nop.m 0
980	br.ctop.sptk.few .L32
981	}
982	;;
983	.align 32
984
/* .L35: remainder handling after the .L32 loop (also entered directly   */
/* when the loop count was zero). The remaining N & 15 elements are      */
/* processed in groups selected by the bits of N:                        */
/*   p12 (bit 3, set before entry)  8 elements                           */
/*   p13 (bit 2)                    4 elements                           */
/*   p14 (bit 1)                    2 elements                           */
/*   p15 (bit 0)                    1 element                            */
/* X is read with pair loads throughout; Y is read as one single         */
/* element, then pairs, then one single element per group. Stores are    */
/* element-wise: Y1 writes elements 0-3 and Y2 elements 4-7 of the       */
/* 8-element group (Y2 was set to Y + 4 * INCY before the dispatch).     */
/* p9 (remainder is zero) is set by the code that precedes this section. */
985.L35:
/* Restore predicates from PR. The mask -65474 has bits 1-5 and 16 and   */
/* above set, so p6-p15 keep the values this section still depends on.   */
986	{ .mmi
987	(p12) LDFPD	f32,  f33  = [X], 2 * SIZE
988	(p12) LDFD	f34 = [Y], 1 * SIZE;
989	mov	pr = PR, -65474
990	}
991	;;
/* Restore the caller's ar.lc. */
992	{ .mmi
993	(p12) LDFPD	f36,  f37  = [X], 2 * SIZE
994	(p12) LDFPD	f35,  f38  = [Y], 2 * SIZE
995	mov	ar.lc  = ARLC
996	}
997	;;
/* Nothing left to do when the remainder J was zero. */
998	{ .mmb
999	(p12) LDFPD	f40,  f41  = [X], 2 * SIZE
1000	(p12) LDFPD	f39,  f42  = [Y], 2 * SIZE
1001	(p9) br.ret.sptk.many b0
1002	}
1003	;;
/* p13/p14/p15 come from bits 2/1/0 of N. */
1004	{ .mmi
1005	(p12) LDFPD	f44,  f45  = [X], 2 * SIZE
1006	(p12) LDFPD	f43,  f46  = [Y], 2 * SIZE
1007	tbit.z	p0, p13 = N, 2
1008	}
1009	;;
1010	{ .mmi
1011	(p13) LDFPD	f48,  f49  = [X], 2 * SIZE
1012	(p12) LDFD	f47 = [Y], 1 * SIZE
1013	tbit.z	p0, p14 = N, 1
1014	}
1015	;;
1016	{ .mmi
1017	(p13) LDFPD	f52,  f53  = [X], 2 * SIZE
1018	(p13) LDFD	f50 = [Y], 1 * SIZE
1019	tbit.z	p0, p15 = N, 0
1020	}
1021	;;
/* YY starts at Y1 and is advanced below past the p12 and p13 groups;    */
/* the p14 and p15 results are stored through it.                        */
1022	{ .mmi
1023	(p14) LDFPD	f56,  f57  = [X], 2 * SIZE
1024	(p13) LDFPD	f51,  f54  = [Y], 2 * SIZE
1025	mov   YY = Y1;
1026	}
1027	;;
1028	(p15) LDFD	f60 = [X]
1029	(p13) LDFD	f55 = [Y], 1 * SIZE
1030	;;
1031	(p14) LDFD	f58 = [Y], 1 * SIZE
1032	(p12) FMA	f6  = ALPHA, f32, f34
1033	(p12) FMA	f7  = ALPHA, f40, f42
1034	;;
1035	(p14) LDFD	f59 = [Y], 1 * SIZE
/* YY += 8 * INCY when the 8-element group is present. */
1036	(p12) shladd YY = INCY, 3, YY
1037	(p12) FMA	f10 = ALPHA, f33, f35
1038	(p12) FMA	f11 = ALPHA, f41, f43
1039	;;
1040	(p15) LDFD	f61  = [Y]
/* YY += 4 * INCY when the 4-element group is present. */
1041	(p13) shladd YY = INCY, 2, YY
1042	(p12) FMA	f12 = ALPHA, f36, f38
1043	(p12) FMA	f13 = ALPHA, f44, f46
1044	;;
/* Store the p12 results. f6, f7 and f10-f13 are then reused for the     */
/* p13/p14/p15 groups, each rewritten after its previous value has been  */
/* stored.                                                               */
1045	(p12) STFD	[Y1] = f6,   1 * SIZE
1046	(p12) FMA	f14 = ALPHA, f37, f39
1047	(p12) STFD	[Y2] = f7,   1 * SIZE
1048	(p12) FMA	f15 = ALPHA, f45, f47
1049	;;
1050	(p12) STFD	[Y1] = f10,  1 * SIZE
1051	(p13) FMA	f6  = ALPHA, f48, f50
1052	(p12) STFD	[Y2] = f11,  1 * SIZE
1053	(p14) FMA	f7  = ALPHA, f56, f58
1054	;;
1055	(p12) STFD	[Y1] = f12,  1 * SIZE
1056	(p13) FMA	f10 = ALPHA, f49, f51
1057	(p12) STFD	[Y2] = f13,  1 * SIZE
1058	(p14) FMA	f11 = ALPHA, f57, f59
1059	;;
/* The 5 * SIZE post-increment moves Y1 from element 3 to element 8      */
/* (and Y2 from element 7 to element 12) of the group.                   */
1060	(p12) STFD	[Y1] = f14,  5 * SIZE
1061	(p13) FMA	f12 = ALPHA, f52, f54
1062	(p12) STFD	[Y2] = f15,  5 * SIZE
1063	(p15) FMA	f13 = ALPHA, f60, f61
1064	;;
/* The p13 group is stored through Y1; the p14 and p15 results go        */
/* through YY.                                                           */
1065	(p13) STFD	[Y1] = f6,   1 * SIZE
1066	(p14) STFD	[YY] = f7,   1 * SIZE
1067	(p13) FMA	f14 = ALPHA, f53, f55
1068	;;
1069	(p13) STFD	[Y1] = f10,  1 * SIZE
1070	(p14) STFD	[YY] = f11,  1 * SIZE
1071	;;
1072	(p13) STFD	[Y1] = f12,  1 * SIZE
1073	(p15) STFD	[YY] = f13
1074	;;
1075	(p13) STFD	[Y1] = f14
1076	br.ret.sptk.many b0
1077	;;
1078	.align 32
1079
1080.L40:
1081	{ .mmi
1082	(p10) STFD [YYY] = f32
1083	adds	PRE1 = (PREFETCHSIZE + 38) * SIZE, X
1084	mov	ar.lc = I
1085	}
1086	{ .mib
1087	adds	PRE2 = (PREFETCHSIZE + 14) * SIZE, Y
1088	tbit.z	p0, p12 = N, 3
1089	(p7) br.cond.dpnt  .L45
1090	}
1091	;;
1092	.align 32
1093
1094.L42:
1095	{ .mmf
1096	(p19) STFD	[Y1] = f6,   1 * SIZE
1097	(p19) STFD	[Y2] = f7,   1 * SIZE
1098	(p18) FMA	f6  = ALPHA, f34, f82
1099	}
1100	{ .mmf
1101	(p16) lfetch.nt1	[PRE1], 16 * SIZE
1102	(p17) LDFPD	f102, f105 = [Y], 2 * SIZE
1103	(p18) FMA	f7  = ALPHA, f46, f94
1104	}
1105	;;
1106	{ .mmf
1107	(p19) STFD	[Y1] = f10,  1 * SIZE
1108	(p19) STFD	[Y2] = f11,  1 * SIZE
1109	(p18) FMA	f10 = ALPHA, f37, f85
1110	}
1111	{ .mmf
1112	(p17) LDFPD	f33,  f36  = [X], 2 * SIZE
1113	(p17) LDFPD	f108, f111 = [Y], 2 * SIZE
1114	(p18) FMA	f11 = ALPHA, f49, f97
1115	}
1116	;;
1117	{ .mmf
1118	(p19) STFD	[Y1] = f12,  1 * SIZE
1119	(p19) STFD	[Y2] = f13,  1 * SIZE
1120	(p18) FMA	f12 = ALPHA, f40, f88
1121	}
1122	{ .mmf
1123	(p17) LDFPD	f39,  f42  = [X], 2 * SIZE
1124	(p17) LDFPD	f114, f117 = [Y], 2 * SIZE
1125	(p18) FMA	f13 = ALPHA, f52, f100
1126	}
1127	;;
1128	{ .mmf
1129	(p19) STFD	[Y1] = f14,  5 * SIZE
1130	(p19) STFD	[Y2] = f15,  5 * SIZE
1131	(p18) FMA	f14 = ALPHA, f43, f91
1132	}
1133	{ .mmf
1134	(p17) LDFPD	f45,  f48  = [X], 2 * SIZE
1135	(p17) LDFPD	f120, f123 = [Y], 2 * SIZE
1136	(p18) FMA	f15 = ALPHA, f55, f103
1137	}
1138	;;
1139	{ .mmf
1140	(p18) STFD	[Y1] = f6,   1 * SIZE
1141	(p18) STFD	[Y2] = f7,   1 * SIZE
1142	(p18) FMA	f6  = ALPHA, f58, f106
1143	}
1144	{ .mmf
1145	(p17) LDFPD	f51,  f54  = [X], 2 * SIZE
1146	(p17) LDFD	f126  = [Y], 1 * SIZE
1147	(p18) FMA	f7  = ALPHA, f70, f118
1148	}
1149	;;
1150	{ .mmf
1151	(p18) STFD	[Y1] = f10,  1 * SIZE
1152	(p18) STFD	[Y2] = f11,  1 * SIZE
1153	(p18) FMA	f10 = ALPHA, f61, f109
1154	}
1155	{ .mmf
1156	(p17) LDFPD	f57,  f60  = [X], 2 * SIZE
1157	(p16) LDFD	f80 = [Y], 1 * SIZE
1158	(p18) FMA	f11 = ALPHA, f73, f121
1159	}
1160	;;
1161	{ .mmf
1162	(p18) STFD	[Y1] = f12,  1 * SIZE
1163	(p18) STFD	[Y2] = f13,  1 * SIZE
1164	(p18) FMA	f12 = ALPHA, f64, f112
1165	}
1166	{ .mmf
1167	(p17) LDFPD	f63,  f66  = [X], 2 * SIZE
1168	(p16) LDFPD	f83,  f86  = [Y], 2 * SIZE
1169	(p18) FMA	f13 = ALPHA, f76, f124
1170	}
1171	;;
1172	{ .mmf
1173	(p18) STFD	[Y1] = f14,  5 * SIZE
1174	(p18) STFD	[Y2] = f15,  5 * SIZE
1175	(p18) FMA	f14 = ALPHA, f67, f115
1176	}
1177	{ .mmf
1178	(p17) LDFPD	f69,  f72  = [X], 2 * SIZE
1179	(p16) LDFPD	f89,  f92  = [Y], 2 * SIZE
1180	(p18) FMA	f15 = ALPHA, f79, f127
1181	}
1182	;;
1183#if 0
1184	(p16) lfetch.excl.nt1	[PRE2], 16 * SIZE
1185#endif
1186	{ .mmb
1187	(p17) LDFPD	f75,  f78  = [X], 2 * SIZE
1188	(p16) LDFPD	f95,  f98  = [Y], 2 * SIZE
1189	br.ctop.sptk.few .L42
1190	}
1191	;;
1192	{ .mmf
1193	(p19) STFD	[Y1] = f6,   1 * SIZE
1194	(p19) STFD	[Y2] = f7,   1 * SIZE
1195	}
1196	;;
1197	{ .mmf
1198	(p19) STFD	[Y1] = f10,  1 * SIZE
1199	(p19) STFD	[Y2] = f11,  1 * SIZE
1200	}
1201	;;
1202	{ .mmf
1203	(p19) STFD	[Y1] = f12,  1 * SIZE
1204	(p19) STFD	[Y2] = f13,  1 * SIZE
1205	}
1206	;;
1207	{ .mmf
1208	(p19) STFD	[Y1] = f14,  5 * SIZE
1209	(p19) STFD	[Y2] = f15,  5 * SIZE
1210	}
1211	;;
1212	.align 32
1213
1214.L45:
1215	{ .mmi
1216	(p12) LDFPD	f32,  f33  = [X], 2 * SIZE
1217	(p12) LDFD	f34 = [Y], 1 * SIZE;
1218	mov	pr = PR, -65474
1219	}
1220	;;
1221	{ .mmi
1222	(p12) LDFPD	f36,  f37  = [X], 2 * SIZE
1223	(p12) LDFPD	f35,  f38  = [Y], 2 * SIZE
1224	mov	ar.lc  = ARLC
1225	}
1226	;;
1227	{ .mmb
1228	(p12) LDFPD	f40,  f41  = [X], 2 * SIZE
1229	(p12) LDFPD	f39,  f42  = [Y], 2 * SIZE
1230	(p9) br.ret.sptk.many b0
1231	}
1232	;;
1233	{ .mmi
1234	(p12) LDFPD	f44,  f45  = [X], 2 * SIZE
1235	(p12) LDFPD	f43,  f46  = [Y], 2 * SIZE
1236	tbit.z	p0, p13 = N, 2
1237	}
1238	;;
1239	{ .mmi
1240	(p13) LDFPD	f48,  f49  = [X], 2 * SIZE
1241	(p12) LDFD	f47 = [Y], 1 * SIZE
1242	tbit.z	p0, p14 = N, 1
1243	}
1244	;;
1245	{ .mmi
1246	(p13) LDFPD	f52,  f53  = [X], 2 * SIZE
1247	(p13) LDFD	f50 = [Y], 1 * SIZE
1248	tbit.z	p0, p15 = N, 0
1249	}
1250	;;
1251	{ .mmi
1252	(p14) LDFPD	f56,  f57  = [X], 2 * SIZE
1253	(p13) LDFPD	f51,  f54  = [Y], 2 * SIZE
1254	mov   YY = Y1;
1255	}
1256	;;
1257	(p15) LDFD	f60 = [X]
1258	(p13) LDFD	f55 = [Y], 1 * SIZE
1259	;;
1260	(p14) LDFD	f58 = [Y], 1 * SIZE
1261	(p12) FMA	f6  = ALPHA, f32, f34
1262	(p12) FMA	f7  = ALPHA, f40, f42
1263	;;
1264	(p14) LDFD	f59 = [Y], 1 * SIZE
1265	(p12) shladd YY = INCY, 3, YY
1266	(p12) FMA	f10 = ALPHA, f33, f35
1267	(p12) FMA	f11 = ALPHA, f41, f43
1268	;;
1269	(p15) LDFD	f61  = [Y]
1270	(p13) shladd YY = INCY, 2, YY
1271	(p12) FMA	f12 = ALPHA, f36, f38
1272	(p12) FMA	f13 = ALPHA, f44, f46
1273	;;
1274	(p12) STFD	[Y1] = f6,   1 * SIZE
1275	(p12) FMA	f14 = ALPHA, f37, f39
1276	(p12) STFD	[Y2] = f7,   1 * SIZE
1277	(p12) FMA	f15 = ALPHA, f45, f47
1278	;;
1279	(p12) STFD	[Y1] = f10,  1 * SIZE
1280	(p13) FMA	f6  = ALPHA, f48, f50
1281	(p12) STFD	[Y2] = f11,  1 * SIZE
1282	(p14) FMA	f7  = ALPHA, f56, f58
1283	;;
1284	(p12) STFD	[Y1] = f12,  1 * SIZE
1285	(p13) FMA	f10 = ALPHA, f49, f51
1286	(p12) STFD	[Y2] = f13,  1 * SIZE
1287	(p14) FMA	f11 = ALPHA, f57, f59
1288	;;
1289	(p12) STFD	[Y1] = f14,  5 * SIZE
1290	(p13) FMA	f12 = ALPHA, f52, f54
1291	(p12) STFD	[Y2] = f15,  5 * SIZE
1292	(p15) FMA	f13 = ALPHA, f60, f61
1293	;;
1294	(p13) STFD	[Y1] = f6,   1 * SIZE
1295	(p14) STFD	[YY] = f7,   1 * SIZE
1296	(p13) FMA	f14 = ALPHA, f53, f55
1297	;;
1298	(p13) STFD	[Y1] = f10,  1 * SIZE
1299	(p14) STFD	[YY] = f11,  1 * SIZE
1300	;;
1301	(p13) STFD	[Y1] = f12,  1 * SIZE
1302	(p15) STFD	[YY] = f13
1303	;;
1304	(p13) STFD	[Y1] = f14
1305	br.ret.sptk.many b0
1306	;;
1307	.align 32
1308
/* .L100: reached when INCX or INCY (already shifted by BASE_SHIFT)      */
/* differs from SIZE. Sets up the strided loop at .L112, which loads 16  */
/* elements from each vector per iteration, one at a time, using INCX    */
/* and INCY as post-increments.                                          */
1309.L100:
/* J = N & 15 (remainder), I = N >> 4 (iterations); epilogue count 3. */
1310	{ .mii
1311	and	J =  15, N
1312	shr	I =  N, 4
1313	mov	ar.ec = 3
1314	}
1315	;;
/* p9 = (J == 0), p7 = (I == 0) evaluated on the old I, then I = I - 1.  */
1316	{ .mmi
1317	cmp.eq	p9, p0  =   r0, J
1318	cmp.eq	p7 ,p0  = 0, I
1319	adds	I = -1, I
1320	}
1321	;;
/* Store the peeled result (p10) through YYY, start the X prefetch       */
/* pointer PREFETCHSIZE * SIZE bytes ahead, load the loop count.         */
1322	{ .mmi
1323	(p10) STFD [YYY] = f32
1324	adds	PRE1 = PREFETCHSIZE * SIZE, X
1325	mov	ar.lc = I
1326	}
/* Y prefetch pointer; p12 = bit 3 of N (an 8-element remainder group);  */
/* skip the loop when the trip count was zero.                           */
1327	{ .mib
1328	adds	PRE2 = PREFETCHSIZE * SIZE, Y
1329	tbit.z	p0, p12 = N, 3
1330	(p7) br.cond.dpnt  .L115
1331	}
1332	;;
1333	.align 32
1334
/*
 * .L112: software-pipelined main loop, 16 elements per pass.
 *
 * Each pass issues 16 loads from X, 16 loads from Y, 16 FMAs with ALPHA
 * and 16 stores through Y1, and closes with br.ctop.
 *
 * Stage predicates:
 *   p16  loads (and the two lfetch prefetches through PRE1 / PRE2)
 *   p17  FMAs for the first 8 elements of the block loaded one pass ago
 *   p18  FMAs for the last 8 elements of the block loaded two passes
 *        ago, plus every store
 *
 * The FMA source registers are offset from the load destinations by the
 * stage distance: a value loaded into fN under p16 is read as fN+1 under
 * p17 and as fN+2 under p18 (e.g. load f32 -> use f33, load f56 -> use
 * f58).  This relies on the rotating FP registers being renamed by one
 * on every br.ctop.
 *
 * Results live in the non-rotating registers f6, f7, f10-f15.  In each
 * instruction group the STFD reads the result produced by an earlier FMA
 * while the FMA in the same group overwrites that register with the next
 * result (IA-64 allows write-after-read inside a group), so neither the
 * register assignment nor the group order may be changed.
 *
 * LDFD / STFD / FMA are macros defined outside this view; presumably
 * "FMA d = a, b, c" computes a * b + c -- TODO confirm.
 * INCX16 / INCY16 are presumably 16 * INCX / 16 * INCY -- TODO confirm.
 * Y1 is the store pointer; it is set up outside this view and presumably
 * starts at the same address as Y -- TODO confirm.
 */
1335.L112:
	/* First half: store results 0-7, compute elements 8-15 (stage p18). */
1336	{ .mmi
1337	(p18) STFD	[Y1] = f6
1338	(p16) lfetch.nt1	[PRE1], INCX16
1339	(p18) add Y1 = INCY, Y1
1340	}
1341	{.mmf
1342	(p16) LDFD	f32  = [X], INCX
1343	(p16) LDFD	f80  = [Y], INCY
1344	(p18) FMA	f6  = ALPHA, f58, f106
1345	}
1346	;;
1347	{ .mmi
1348 	(p18) STFD	[Y1] = f7
	/* .excl hint on the Y prefetch stream (the X stream above uses plain .nt1). */
1349	(p16) lfetch.excl.nt1	[PRE2], INCY16
1350	(p18) add Y1 = INCY, Y1
1351	}
1352	{ .mmf
1353	(p16) LDFD	f35  = [X], INCX
1354	(p16) LDFD	f83  = [Y], INCY
1355	(p18) FMA	f7  = ALPHA, f61, f109
1356	}
1357	;;
1358	{ .mmi
1359	(p18) STFD	[Y1] = f10
1360	(p18) add Y1 = INCY, Y1
1361	nop.i 0
1362	}
1363	{ .mmf
1364	(p16) LDFD	f38  = [X], INCX
1365	(p16) LDFD	f86  = [Y], INCY
1366	(p18) FMA	f10 = ALPHA, f64, f112
1367	}
1368	;;
1369	{ .mmi
1370	(p18) STFD	[Y1] = f11
1371	(p18) add Y1 = INCY, Y1
1372	nop.i 0
1373	}
1374	{ .mmf
1375	(p16) LDFD	f41  = [X], INCX
1376	(p16) LDFD	f89  = [Y], INCY
1377	(p18) FMA	f11 = ALPHA, f67, f115
1378	}
1379	;;
1380	{ .mmi
1381	(p18) STFD	[Y1] = f12
1382	(p18) add Y1 = INCY, Y1
1383	nop.i 0
1384	}
1385	{ .mmf
1386	(p16) LDFD	f44  = [X], INCX
1387	(p16) LDFD	f92  = [Y], INCY
1388	(p18) FMA	f12 = ALPHA, f70, f118
1389	}
1390	;;
1391	{ .mmi
1392	(p18) STFD	[Y1] = f13
1393	(p18) add Y1 = INCY, Y1
1394	nop.i 0
1395	}
1396	{ .mmf
1397	(p16) LDFD	f47  = [X], INCX
1398	(p16) LDFD	f95  = [Y], INCY
1399	(p18) FMA	f13 = ALPHA, f73, f121
1400	}
1401	;;
1402	{ .mmi
1403	(p18) STFD	[Y1] = f14
1404	(p18) add Y1 = INCY, Y1
1405	nop.i 0
1406	}
1407	{ .mmf
1408	(p16) LDFD	f50  = [X], INCX
1409	(p16) LDFD	f98  = [Y], INCY
1410	(p18) FMA	f14 = ALPHA, f76, f124
1411	}
1412	;;
1413	{ .mmi
1414	(p18) STFD	[Y1] = f15
1415	(p18) add Y1 = INCY, Y1
1416	nop.i 0
1417	}
1418	{ .mmf
1419	(p16) LDFD	f53  = [X], INCX
1420	(p16) LDFD	f101 = [Y], INCY
1421	(p18) FMA	f15 = ALPHA, f79, f127
1422	}
1423	;;
	/* Second half: store results 8-15 (p18), compute elements 0-7 of the */
	/* block loaded one pass ago (stage p17).                             */
1424	{ .mmi
1425	(p18) STFD	[Y1] = f6
1426	(p18) add Y1 = INCY, Y1
1427	nop.i 0
1428	}
1429	{ .mmf
1430	(p16) LDFD	f56  = [X], INCX
1431	(p16) LDFD	f104 = [Y], INCY
1432	(p17) FMA	f6  = ALPHA, f33, f81
1433	}
1434	;;
1435	{ .mmi
1436 	(p18) STFD	[Y1] = f7
1437	(p18) add Y1 = INCY, Y1
1438	nop.i 0
1439	}
1440	{ .mmf
1441	(p16) LDFD	f59  = [X], INCX
1442	(p16) LDFD	f107 = [Y], INCY
1443	(p17) FMA	f7  = ALPHA, f36, f84
1444	}
1445	;;
1446	{ .mmi
1447	(p18) STFD	[Y1] = f10
1448	(p18) add Y1 = INCY, Y1
1449	nop.i 0
1450	}
1451	{ .mmf
1452	(p16) LDFD	f62  = [X], INCX
1453	(p16) LDFD	f110 = [Y], INCY
1454	(p17) FMA	f10 = ALPHA, f39, f87
1455	}
1456	;;
1457	{ .mmi
1458	(p18) STFD	[Y1] = f11
1459	(p18) add Y1 = INCY, Y1
1460	nop.i 0
1461	}
1462	{ .mmf
1463	(p16) LDFD	f65  = [X], INCX
1464	(p16) LDFD	f113 = [Y], INCY
1465	(p17) FMA	f11 = ALPHA, f42, f90
1466	}
1467	;;
1468	{ .mmi
1469	(p18) STFD	[Y1] = f12
1470	(p18) add Y1 = INCY, Y1
1471	nop.i 0
1472	}
1473	{ .mmf
1474	(p16) LDFD	f68  = [X], INCX
1475	(p16) LDFD	f116 = [Y], INCY
1476	(p17) FMA	f12 = ALPHA, f45, f93
1477	}
1478	;;
1479	{ .mmi
1480	(p18) STFD	[Y1] = f13
1481	(p18) add Y1 = INCY, Y1
1482	nop.i 0
1483	}
1484	{ .mmf
1485	(p16) LDFD	f71  = [X], INCX
1486	(p16) LDFD	f119 = [Y], INCY
1487	(p17) FMA	f13 = ALPHA, f48, f96
1488	}
1489	;;
1490	{ .mmi
1491	(p18) STFD	[Y1] = f14
1492	(p18) add Y1 = INCY, Y1
1493	nop.i 0
1494	}
1495	{ .mmf
1496	(p16) LDFD	f74  = [X], INCX
1497	(p16) LDFD	f122 = [Y], INCY
1498	(p17) FMA	f14 = ALPHA, f51, f99
1499	}
1500	;;
	/* Last group: the FMA shares the bundle with the store so that the */
	/* second bundle has a free B slot for the loop branch.             */
1501	{ .mmf
1502	(p18) STFD	[Y1] = f15
1503	(p18) add Y1 = INCY, Y1
1504	(p17) FMA	f15 = ALPHA, f54, f102
1505	}
1506	{ .mmb
1507	(p16) LDFD	f77  = [X], INCX
1508	(p16) LDFD	f125 = [Y], INCY
	/* Counted, register-rotating loop branch; ar.lc is loaded before the loop. */
1509	br.ctop.sptk.few .L112
1510	}
1511	;;
1512	.align 32
1513
/*
 * .L115: remainder handling after (or instead of) the 16-element loop.
 *
 * The low four bits of N select how many trailing elements are processed:
 *   p12  bit 3 set -> 8 elements (p12 is set by the tbit.z before the loop)
 *   p13  bit 2 set -> 4 elements
 *   p14  bit 1 set -> 2 elements
 *   p15  bit 0 set -> 1 element (loaded without post-increment)
 *
 * All four cases are interleaved in one straight-line schedule: loads for
 * a later case are issued while FMAs and stores of an earlier case are in
 * flight.  The result registers f6, f7, f10-f14 are reused by the later
 * cases only after the earlier case has stored them, so the group order
 * must be preserved.
 *
 * The block also restores the predicate registers from PR and ar.lc from
 * ARLC (both saved outside this view) and returns through b0.
 *
 * LDFD / STFD / FMA are macros defined outside this view; presumably
 * "FMA d = a, b, c" computes a * b + c -- TODO confirm.
 */
1514.L115:
1515	(p12) LDFD f32 = [X], INCX
1516	(p12) LDFD f34 = [Y], INCY
	/* -65474 == ~0xFFC1: mask bits 1-5 and 16 upward are set, bits 6-15 are */
	/* clear, so p6-p15 (p9 and p12-p15 are used below) keep their values.   */
1517	mov	pr = PR, -65474
1518	;;
1519	(p12) LDFD f33 = [X], INCX
1520	(p12) LDFD f35 = [Y], INCY
1521	mov	ar.lc  = ARLC
1522	;;
1523	(p12) LDFD f36 = [X], INCX
1524	(p12) LDFD f38 = [Y], INCY
	/* Early exit; p9 is set outside this view (presumably "no remainder" -- TODO confirm). */
1525	(p9) br.ret.sptk.many b0
1526	;;
1527	(p12) LDFD f37 = [X], INCX
1528	(p12) LDFD f39 = [Y], INCY
	/* tbit.z writes the "bit is set" result to its second target predicate. */
1529	tbit.z	p0, p13 = N, 2
1530	;;
1531	(p12) LDFD f40 = [X], INCX
1532	(p12) LDFD f42 = [Y], INCY
1533	tbit.z	p0, p14 = N, 1
1534	;;
1535	(p12) LDFD f41 = [X], INCX
1536	(p12) LDFD f43 = [Y], INCY
1537	tbit.z	p0, p15 = N, 0
1538	;;
	/* 8-element case: FMAs start while its last loads are still being issued. */
1539	{ .mmf
1540	(p12) LDFD f44 = [X], INCX
1541	(p12) LDFD f46 = [Y], INCY
1542	(p12) FMA	f6  = ALPHA, f32, f34
1543	}
1544	;;
1545	{ .mmf
1546	(p12) LDFD f45 = [X], INCX
1547	(p12) LDFD f47 = [Y], INCY
1548	(p12) FMA	f7  = ALPHA, f33, f35
1549	}
1550	;;
	/* 4-element case: loads overlap the remaining FMAs of the 8-element case. */
1551	{ .mmf
1552	(p13) LDFD f48 = [X], INCX
1553	(p13) LDFD f50 = [Y], INCY
1554	(p12) FMA	f10 = ALPHA, f36, f38
1555	}
1556	;;
1557	{ .mmf
1558	(p13) LDFD f49 = [X], INCX
1559	(p13) LDFD f51 = [Y], INCY
1560	(p12) FMA	f11 = ALPHA, f37, f39
1561	}
1562	;;
1563	{ .mmi
1564	(p12) STFD	[Y1] = f6
1565	(p12) add Y1 = INCY, Y1
1566	nop.i 0
1567	}
1568	{ .mmf
1569	(p13) LDFD f52 = [X], INCX
1570	(p13) LDFD f54 = [Y], INCY
1571	(p12) FMA	f12 = ALPHA, f40, f42
1572	}
1573	;;
1574	{ .mmi
1575	(p12) STFD	[Y1] = f7
1576	(p12) add Y1 = INCY, Y1
1577	nop.i 0
1578	}
1579	{ .mmf
1580	(p13) LDFD f53 = [X], INCX
1581	(p13) LDFD f55 = [Y], INCY
1582	(p12) FMA	f13 = ALPHA, f41, f43
1583	}
1584	;;
	/* 2-element case: loads. */
1585	{ .mmi
1586	(p12) STFD	[Y1] = f10
1587	(p12) add Y1 = INCY, Y1
1588	nop.i 0
1589	}
1590	{ .mmf
1591	(p14) LDFD f56 = [X], INCX
1592	(p14) LDFD f58 = [Y], INCY
1593	(p12) FMA	f14 = ALPHA, f44, f46
1594	}
1595	;;
1596	{ .mmi
1597	(p12) STFD	[Y1] = f11
1598	(p12) add Y1 = INCY, Y1
1599	nop.i 0
1600	}
1601	{ .mmf
1602	(p14) LDFD f57 = [X], INCX
1603	(p14) LDFD f59 = [Y], INCY
1604	(p12) FMA	f15 = ALPHA, f45, f47
1605	}
1606	;;
	/* 1-element case: last loads, no pointer update.  From here on f6, f7, */
	/* f10 and f11 are reused by the 4-element case; the 8-element case has */
	/* already stored them.                                                 */
1607	{ .mmi
1608	(p12) STFD	[Y1] = f12
1609	(p12) add Y1 = INCY, Y1
1610	nop.i 0
1611	}
1612	{ .mmf
1613	(p15) LDFD f60 = [X]
1614	(p15) LDFD f61 = [Y]
1615	(p13) FMA	f6  = ALPHA, f48, f50
1616	}
1617	;;
1618	{ .mmf
1619	(p12) STFD	[Y1] = f13
1620	(p12) add Y1 = INCY, Y1
1621	(p13) FMA	f7  = ALPHA, f49, f51
1622	}
1623	;;
1624	{ .mmf
1625	(p12) STFD	[Y1] = f14
1626	(p12) add Y1 = INCY, Y1
1627	(p13) FMA	f10 = ALPHA, f52, f54
1628	}
1629	;;
1630	{ .mmf
1631	(p12) STFD	[Y1] = f15
1632	(p12) add Y1 = INCY, Y1
1633	(p13) FMA	f11 = ALPHA, f53, f55
1634	}
1635	;;
	/* Drain: store the 4-, 2- and 1-element results in element order. */
1636	{ .mmf
1637	(p13) STFD	[Y1] = f6
1638	(p13) add Y1 = INCY, Y1
1639	(p14) FMA	f12 = ALPHA, f56, f58
1640	}
1641	;;
1642	{ .mmf
1643	(p13) STFD	[Y1] = f7
1644	(p13) add Y1 = INCY, Y1
1645	(p14) FMA	f13 = ALPHA, f57, f59
1646	}
1647	;;
1648	{ .mmf
1649	(p13) STFD	[Y1] = f10
1650	(p13) add Y1 = INCY, Y1
1651	(p15) FMA	f14 = ALPHA, f60, f61
1652	}
1653	;;
1654	(p13) STFD	[Y1] = f11
1655	(p13) add Y1 = INCY, Y1
1656	;;
1657	(p14) STFD	[Y1] = f12
1658	(p14) add Y1 = INCY, Y1
1659	;;
1660	(p14) STFD	[Y1] = f13
1661	(p14) add Y1 = INCY, Y1
1662	;;
	/* Final single element (no Y1 update needed), then return to the caller. */
1663	(p15) STFD	[Y1] = f14
1664	br.ret.sptk.many b0
1665	;;
1666	EPILOGUE
1667
1668