1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
/* NOTE(review): every line in this view begins with a decimal number fused
 * to the text (e.g. the digits directly in front of "#define" below).  That
 * looks like a line-number artifact from a listing; the directives are not
 * valid preprocessor lines in this form -- confirm against the upstream
 * file and strip the prefixes before building. */
39#define ASSEMBLER
40#include "common.h"
41
/* Look-ahead of the A/B prefetch streams, in elements.  It is used as
 * (PREFETCHSIZE + 8) * SIZE and (PREFETCHSIZE - 8) * SIZE when PREA and
 * PREB are initialised. */
42#ifdef DOUBLE
43#define PREFETCHSIZE  (16 *  8)
44#else
45#define PREFETCHSIZE  (32 *  8)
46#endif
47
/* C prefetch: the first prefetch address is C1 + CPREFETCHSIZE * SIZE, and
 * CPREFETCH is the lfetch variant issued on it. */
48#define CPREFETCHSIZE 15
49#define CPREFETCH     lfetch.excl.nt1
50
/* Kernel arguments.  The alloc at the top of the routine declares 8 input
 * registers, i.e. r32..r39. */
51#define M	r32
52#define N	r33
53#define K	r34
54#define A	r37
55#define B	r38
56#define C	r39
/* LDC is not consumed as an incoming register value: the entry code loads
 * it from memory at (entry SP + 16) into r35 and then shifts it left by
 * ZBASE_SHIFT. */
57#define LDC	r35
58
/* Loop state.  I is set to M >> 3 and J to N >> 3; AOFFSET / BOFFSET are the
 * post-incremented read pointers into A and B; L holds the K-loop trip
 * count that is copied into ar.lc. */
59#define I	r15
60#define J	r16
61#define AOFFSET	r17
62#define BOFFSET	r18
63#define L	r20
64
/* C1..C8: pointers to columns c + 0 * ldc .. c + 7 * ldc of the current
 * panel. */
65#define C1	r21
66#define C2	r22
67#define C3	r23
68#define C4	r24
69#define C5	r25
70#define C6	r26
71#define C7	r27
72#define C8	r28
73
/* C9..C16: second pointer per column, set to C1..C8 + 4 * SIZE inside the
 * K loop.  They live in the local registers loc0..loc7 (the alloc in the
 * entry code declares 16 locals). */
74#define C9	loc0
75#define C10	loc1
76#define C11	loc2
77#define C12	loc3
78#define C13	loc4
79#define C14	loc5
80#define C15	loc6
81#define C16	loc7
82
/* Prefetch pointers for A, B and C.  Note that PREA / PREB name r8 / r9,
 * which the entry code also uses directly (as r8 / r9) to walk the
 * floating-point spill area before any prefetching starts.
 * SP is the stack pointer; ARLC, PR and ARPFS receive the saved copies of
 * ar.lc, the predicate registers and ar.pfs. */
83#define PREA	r8
84#define PREB	r9
85#define PREC	r10
86#define SP	r12
87#define ARLC	r29
88#define PR	r30
89#define ARPFS	r31
90
/* Scaling factors applied when the accumulators are folded into C: of each
 * pair of consecutive C values, the first is updated with ALPHA_R and the
 * second with ALPHA_I (presumably the real and imaginary parts of a complex
 * alpha -- confirm against the caller). */
91#define ALPHA_R	f8
92#define ALPHA_I	f9
93
/* Routine entry.
 *   - allocates the register frame (8 inputs, 16 locals) and saves ar.pfs,
 *     ar.lc and the predicate registers in ARPFS / ARLC / PR;
 *   - reserves 16 * 16 bytes of stack and spills f16..f31 into it
 *     (presumably because they are callee-saved under the Itanium software
 *     conventions -- confirm);
 *   - loads ldc from memory at (entry SP + 16) and shifts it left by
 *     ZBASE_SHIFT (presumably element count -> byte stride; ZBASE_SHIFT is
 *     defined outside this file's visible part);
 *   - sets J = N >> 3 and AOFFSET = A, and branches to .L050 when J <= 0.
 * PROLOGUE and PROFCODE are macros that are not defined in the visible
 * source (presumably from common.h). */
94	PROLOGUE
95	.prologue
96	PROFCODE
97
/* r14 is formed from SP before SP is lowered, so it addresses the caller's
 * frame at entry SP + 16. */
98	{ .mmi
99	.save	ar.pfs, ARPFS
100	alloc	ARPFS = ar.pfs, 8, 16, 0, 0
101	adds	r14 = 16, SP
102	mov	ARLC  = ar.lc
103	}
/* r8 / r9 point at the first two 16-byte slots of the new 256-byte save
 * area; each is stepped by 32 so the two store streams interleave. */
104	{ .mmi
105	adds	r8 = -16 * 16, SP
106	adds	r9 = -15 * 16, SP
107	adds	SP = -16 * 16, SP
108	}
109	;;
110	{ .mmi
111	stf.spill  [r8] = f16, 32
112	stf.spill  [r9] = f17, 32
113	mov	PR = pr
114	}
/* Fetch ldc from memory (see r14 above). */
115	{ .mmi
116	ld8	LDC   = [r14], 8
117	nop	__LINE__
118	nop	__LINE__
119	}
120	;;
121	stf.spill  [r8] = f18, 32
122	stf.spill  [r9] = f19, 32
123	shr	J = N, 3
124	;;
125	stf.spill  [r8] = f20, 32
126	stf.spill  [r9] = f21, 32
/* LDC = (LDC << ZBASE_SHIFT) + 0 */
127	shladd	LDC = LDC, ZBASE_SHIFT, r0
128	;;
129	stf.spill  [r8] = f22, 32
130	stf.spill  [r9] = f23, 32
131	mov	AOFFSET = A
132	;;
133	stf.spill  [r8] = f24, 32
134	stf.spill  [r9] = f25, 32
/* p6 = (0 >= J): there is no full 8-column panel to process. */
135	cmp.ge	p6, p0  = 0, J
136	;;
137	stf.spill  [r8] = f26, 32
138	stf.spill  [r9] = f27, 32
139	;;
140	stf.spill  [r8] = f28, 32
141	stf.spill  [r9] = f29, 32
142       ;;
143	stf.spill  [r8] = f30
144	stf.spill  [r9] = f31
145	(p6)	br.cond.dpnt .L050
146	.body
147	;;
148	.align 32
149
/* .L010: set-up for one panel of 8 columns of C.
 *   - J is decremented and I is set to M >> 3;
 *   - C1..C8 are pointed at columns c + 0 * ldc .. c + 7 * ldc and C itself
 *     is advanced by 8 * ldc;
 *   - accumulators f64, f72, f80, f88, f96, f104, f112 and f120 are cleared;
 *   - when I == 0 control transfers to .L020 (not visible in this part of
 *     the file). */
150.L010:
151	{ .mfi
152	adds	J = -1, J
153	mov	f64  = f0
154	shr	I  = M, 3
155	}
156	{ .mfi
157	mov	C1 = C			// coffset1 = c + 0 * ldc
158	mov	f72  = f0
159	}
160	;;
/* p6 = (I == 0), p7 = (I != 0) */
161	{ .mmf
162	cmp.eq	p6, p7 = 0, I
163	nop	__LINE__
164	mov	f80  = f0
165	}
166	{ .mmf
167	add	C2 = LDC, C		// coffset2 = c + 1 * ldc
168	shladd	C3 = LDC, 1, C		// coffset3 = c + 2 * ldc
169	mov	f88  = f0
170	}
171	;;
/* C5 reads C in the same instruction group in which C is advanced, so it
 * still sees the old value; C4 and C6 build on the C2 computed above. */
172	{ .mmf
173	shladd	C5 = LDC, 2, C		// coffset5 = c + 4 * ldc
174	shladd	C = LDC, 3, C		// coffset += 8 * ldc
175	mov	f96  = f0
176	}
177	{ .mmf
178	shladd	C4 = LDC, 1, C2		// coffset4 = c + 3 * ldc
179	shladd	C6 = LDC, 2, C2		// coffset6 = c + 5 * ldc
180	mov	f104 = f0
181	}
182	;;
183	{ .mfi
184	shladd	C7 = LDC, 2, C3		// coffset7 = c + 6 * ldc
185	mov	f112 = f0
186	nop	__LINE__
187	}
/* C has already been advanced by 8 * ldc, so C - LDC is column 7. */
188	{ .mfb
189	sub	C8 = C,  LDC		// coffset8 = c + 7 * ldc
190	mov	f120 = f0
191	(p6)	br.cond.dpnt .L020
192	}
193	;;
194	.align 16
195
/* .L011: set-up for one 8 x 8 tile, executed before the K loop at .L012.
 *   - preloads eight B values into f48..f55 (first pair straight from [B],
 *     the rest through BOFFSET = B + 2 * SIZE with post-increment) and eight
 *     A values into f32..f39 through AOFFSET;
 *   - clears every accumulator in f64..f127 except f64, f72, ..., f120,
 *     alternating F-unit "mov fX = f0" with M-unit "setf.d fX = r0";
 *   - issues eight CPREFETCHes, starting at C1 + CPREFETCHSIZE * SIZE and
 *     stepping by LDC (one per column; the last one has no increment);
 *   - initialises the A / B prefetch pointers PREA and PREB;
 *   - programs the loop: L = ((K + 1) >> 1) - 1 is copied to ar.lc, so
 *     .L012 executes (K + 1) >> 1 times; p12 is set when bit 0 of K + 1 is
 *     clear (K odd); p3 is set unconditionally.
 * NOTE(review): for K == 0 the arithmetic above gives ar.lc = -1;
 * presumably callers guarantee K >= 1 -- confirm.
 * LDFPD and SIZE are macros defined outside the visible source. */
196.L011:
197	{ .mfb
198	LDFPD	f48, f49 = [B]
199	mov	f65  = f0
200	nop	__LINE__
201	}
202	{ .mfb
203	adds	BOFFSET = 2 * SIZE, B
204	mov	f73  = f0
205	nop	__LINE__
206	}
207	;;
208	{ .mfb
209	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
210	mov	f81  = f0
211	nop	__LINE__
212	}
213	{ .mfb
214	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
215	mov	f89  = f0
216	nop	__LINE__
217	}
218	;;
219	{ .mmf
220	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
221	setf.d	f97  = r0
222	mov	f105 = f0
223	}
224	{ .mfb
225	setf.d	f113 = r0
226	mov	f121 = f0
227	nop	__LINE__
228	}
229	;;
230	{ .mmf
231	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
232	setf.d	f66  = r0
233	mov	f74  = f0
234	}
235	{ .mfb
236	setf.d	f82  = r0
237	mov	f90  = f0
238	nop	__LINE__
239	}
240	;;
241	{ .mmf
242	LDFPD	f34, f35  = [AOFFSET], 2 * SIZE
243	setf.d	f98  = r0
244	mov	f106 = f0
245	}
246	{ .mfb
247	setf.d	f114 = r0
248	mov	f122 = f0
249	nop	__LINE__
250	}
251	;;
252	{ .mmf
253	LDFPD	f36, f37  = [AOFFSET], 2 * SIZE
254	setf.d	f67  = r0
255	mov	f75  = f0
256	}
257	{ .mfi
258	setf.d	f83  = r0
259	mov	f91  = f0
260	nop	__LINE__
261	}
262	;;
263	{ .mmf
264	LDFPD	f38, f39  = [AOFFSET], 2 * SIZE
265	setf.d	f99  = r0
266	mov	f107 = f0
267	}
/* First C prefetch address: CPREFETCHSIZE elements into column 0. */
268	{ .mfi
269	setf.d	f115 = r0
270	mov	f123 = f0
271	adds	PREC = CPREFETCHSIZE * SIZE, C1
272	}
273	;;
274	{ .mmf
275	CPREFETCH [PREC], LDC
276	setf.d	f68  = r0
277	mov	f76  = f0
278	}
/* L = K + 1; it is halved and decremented below before going to ar.lc. */
279	{ .mfi
280	setf.d	f84  = r0
281	mov	f92  = f0
282	adds	L =  1, K
283	}
284	;;
285	{ .mmf
286	CPREFETCH [PREC], LDC
287	setf.d	f100 = r0
288	mov	f108 = f0
289	}
/* AOFFSET has already been advanced past the eight preloaded A values. */
290	{ .mfi
291	setf.d	f116 = r0
292	mov	f124 = f0
293	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
294	}
295	;;
296	{ .mmf
297	CPREFETCH [PREC], LDC
298	setf.d	f69  = r0
299	mov	f77  = f0
300	}
301	{ .mfi
302	setf.d	f85  = r0
303	mov	f93  = f0
304	adds	PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET
305	}
306	;;
307	{ .mmf
308	CPREFETCH [PREC], LDC
309	setf.d	f101 = r0
310	mov	f109 = f0
311	}
/* p12 = (bit 0 of K + 1 is zero), i.e. K is odd. */
312	{ .mfi
313	setf.d	f117 = r0
314	mov	f125 = f0
315	tbit.z	p12, p0 = L, 0
316	}
317	;;
318	{ .mmf
319	CPREFETCH [PREC], LDC
320	setf.d	f70  = r0
321	mov	f78  = f0
322	}
323	{ .mfi
324	setf.d	f86  = r0
325	mov	f94  = f0
326	shr	L = L, 1
327	}
328	;;
329	{ .mmf
330	CPREFETCH [PREC], LDC
331	setf.d	f102 = r0
332	mov	f110 = f0
333	}
334	{ .mfi
335	setf.d	f118 = r0
336	mov	f126 = f0
337	adds	L =  -1, L
338	}
339	;;
340	{ .mmf
341	CPREFETCH [PREC], LDC
342	setf.d	f71  = r0
343	mov	f79  = f0
344	}
345	{ .mfi
346	setf.d	f87  = r0
347	mov	f95  = f0
348	mov	ar.lc = L
349	}
350	;;
351	{ .mmf
352	CPREFETCH [PREC]
353	setf.d	f103 = r0
354	mov	f111 = f0
355	}
/* r0 == r0 always holds: p3 starts out set. */
356	{ .mfi
357	setf.d	f119 = r0
358	mov	f127 = f0
359	cmp.eq	p3, p0 = r0, r0
360	}
361	;;
362	.align 16
363
/* .L012: K loop of the 8 x 8 tile; two k-steps per trip, trip count taken
 * from ar.lc via br.cloop.  The numbered comments count instruction groups.
 *
 *   groups  1-32  accumulators f64..f127 are updated from A values
 *                 f32..f39 and B values f48..f55 (row i of A, column j of B
 *                 goes to f(64 + (i - 1) + 8 * (j - 1)));
 *   groups 33-64  the same update from f40..f47 / f56..f63, all predicated
 *                 on p3; those operands are loaded in groups 3-10.
 *
 * Predicates:
 *   p3   set on entry.  When p12 is set, group 1 recomputes it as
 *        (L != 0), so the second k-step is dropped on the final trip.
 *   p4   (L != 0): not the final trip -- groups 30-39 reload f32..f39 and
 *        f48..f55 for the next trip.
 *   p5   (L == 0): final trip -- groups 40-64 start loading the C tile
 *        through C1..C4 / C9..C12 into registers whose A / B contents have
 *        already been consumed (f6, f7, f10..f39, f48..f55, f40..f46,
 *        f56..f58); .L013 continues from there.
 *
 * Each column pointer pair reads offsets 0..3 and 8..11 (Cn) and 4..7 and
 * 12..15 (Cn + 4 * SIZE), then steps back by 11 * SIZE to its start.
 * C9..C16 are recomputed from C1..C8 on every trip.  L is decremented in
 * group 64 in step with ar.lc.
 * FMA, LDFPD, LDFD and SIZE are macros defined outside the visible source;
 * FMA d = a, b, c is presumably d = a * b + c. */
364.L012:
/* Groups 1-2: A / B prefetch and predicate set-up for this trip. */
365/*  1 */
366	{ .mfi
367	lfetch.nt1	[PREA],  16 * SIZE
368	FMA	f64   = f32, f48, f64	// A1 * B1
369	nop	__LINE__
370	}
371	{ .mfi
372	(p12) cmp.ne p3, p0 =  0, L
373	FMA	f72   = f32, f49, f72	// A1 * B2
374	nop	__LINE__
375	}
376	;;
377/*  2 */
378	{ .mfi
379	lfetch.nt1	[PREB],  16 * SIZE
380	FMA	f80   = f32, f50, f80	// A1 * B3
381	nop	__LINE__
382	}
383	{ .mfi
384	cmp.ne	p4, p5 =  0, L
385	FMA	f88   = f32, f51, f88	// A1 * B4
386	nop	__LINE__
387	}
388	;;
/* Groups 3-10: load the operands of the second k-step (under p3) and
 * derive C9..C16. */
389/*  3 */
390	{ .mfi
391	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
392	FMA	f96   = f32, f52, f96	// A1 * B5
393	nop	__LINE__
394	}
395	{ .mfi
396	adds	C9  = 4 * SIZE, C1
397	FMA	f104  = f32, f53, f104	// A1 * B6
398	nop	__LINE__
399	}
400	;;
401/*  4 */
402	{ .mfi
403	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
404	FMA	f112  = f32, f54, f112	// A1 * B7
405	nop	__LINE__
406	}
407	{ .mfi
408	adds	C10 = 4 * SIZE, C2
409	FMA	f120  = f32, f55, f120	// A1 * B8
410	nop	__LINE__
411	}
412	;;
413/*  5 */
414	{ .mfi
415	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
416	FMA	f65   = f33, f48, f65	// A2 * B1
417	nop	__LINE__
418	}
419	{ .mfi
420	adds	C11 = 4 * SIZE, C3
421	FMA	f73   = f33, f49, f73	// A2 * B2
422	nop	__LINE__
423	}
424	;;
425/*  6 */
426	{ .mfi
427	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE
428	FMA	f81   = f33, f50, f81	// A2 * B3
429	nop	__LINE__
430	}
431	{ .mfi
432	adds	C12 = 4 * SIZE, C4
433	FMA	f89   = f33, f51, f89	// A2 * B4
434	nop	__LINE__
435	}
436	;;
437/*  7 */
438	{ .mfi
439	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE
440	FMA	f97   = f33, f52, f97	// A2 * B5
441	nop	__LINE__
442	}
443	{ .mfi
444	adds	C13 = 4 * SIZE, C5
445	FMA	f105  = f33, f53, f105	// A2 * B6
446	nop	__LINE__
447	}
448	;;
449/*  8 */
450	{ .mfi
451	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
452	FMA	f113  = f33, f54, f113	// A2 * B7
453	nop	__LINE__
454	}
455	{ .mfi
456	adds	C14 = 4 * SIZE, C6
457	FMA	f121  = f33, f55, f121	// A2 * B8
458	nop	__LINE__
459	}
460	;;
461/*  9 */
462	{ .mfi
463	(p3) LDFPD	f44, f45 = [AOFFSET], 2 * SIZE
464	FMA	f66   = f34, f48, f66	// A3 * B1
465	nop	__LINE__
466	}
467	{ .mfi
468	adds	C15 = 4 * SIZE, C7
469	FMA	f74   = f34, f49, f74	// A3 * B2
470	nop	__LINE__
471	}
472	;;
473/* 10 */
474	{ .mfi
475	(p3) LDFPD	f46, f47 = [AOFFSET], 2 * SIZE
476	FMA	f82   = f34, f50, f82	// A3 * B3
477	nop	__LINE__
478	}
479	{ .mfi
480	adds	C16 = 4 * SIZE, C8
481	FMA	f90   = f34, f51, f90	// A3 * B4
482	nop	__LINE__
483	}
484	;;
485/* 11 */
486	{ .mfi
487	FMA	f98   = f34, f52, f98	// A3 * B5
488	nop	__LINE__
489	}
490	{ .mfi
491	nop	__LINE__
492	FMA	f106  = f34, f53, f106	// A3 * B6
493	nop	__LINE__
494	}
495	;;
496/* 12 */
497	{ .mfi
498	FMA	f114  = f34, f54, f114	// A3 * B7
499	nop	__LINE__
500	}
501	{ .mfi
502	nop	__LINE__
503	FMA	f122  = f34, f55, f122	// A3 * B8
504	nop	__LINE__
505	}
506	;;
507/* 13 */
508	{ .mfi
509	nop	__LINE__
510	FMA	f67   = f35, f48, f67	// A4 * B1
511	}
512	{ .mfi
513	nop	__LINE__
514	FMA	f75   = f35, f49, f75	// A4 * B2
515	nop	__LINE__
516	}
517	;;
518/* 14 */
519	{ .mfi
520	FMA	f83   = f35, f50, f83	// A4 * B3
521	nop	__LINE__
522	}
523	{ .mfi
524	nop	__LINE__
525	FMA	f91   = f35, f51, f91	// A4 * B4
526	nop	__LINE__
527	}
528	;;
529/* 15 */
530	{ .mfi
531	FMA	f99   = f35, f52, f99	// A4 * B5
532	nop	__LINE__
533	}
534	{ .mfi
535	nop	__LINE__
536	FMA	f107  = f35, f53, f107	// A4 * B6
537	nop	__LINE__
538	}
539	;;
540/* 16 */
541	{ .mfi
542	FMA	f115  = f35, f54, f115	// A4 * B7
543	nop	__LINE__
544	}
545	{ .mfi
546	nop	__LINE__
547	FMA	f123  = f35, f55, f123	// A4 * B8
548	nop	__LINE__
549	}
550	;;
551/* 17 */
552	{ .mfi
553	nop	__LINE__
554	FMA	f68   = f36, f48, f68	// A5 * B1
555	nop	__LINE__
556	}
557	{ .mfi
558	nop	__LINE__
559	FMA	f76   = f36, f49, f76	// A5 * B2
560	nop	__LINE__
561	}
562	;;
563/* 18 */
564	{ .mfi
565	nop	__LINE__
566	FMA	f84   = f36, f50, f84	// A5 * B3
567	nop	__LINE__
568	}
569	{ .mfi
570	nop	__LINE__
571	FMA	f92   = f36, f51, f92	// A5 * B4
572	nop	__LINE__
573	}
574	;;
575/* 19 */
576	{ .mfi
577	nop	__LINE__
578	FMA	f100  = f36, f52, f100	// A5 * B5
579	nop	__LINE__
580	}
581	{ .mfi
582	nop	__LINE__
583	FMA	f108  = f36, f53, f108	// A5 * B6
584	nop	__LINE__
585	}
586	;;
587/* 20 */
588	{ .mfi
589	nop	__LINE__
590	FMA	f116  = f36, f54, f116	// A5 * B7
591	nop	__LINE__
592	}
593	{ .mfi
594	nop	__LINE__
595	FMA	f124  = f36, f55, f124	// A5 * B8
596	nop	__LINE__
597	}
598	;;
599/* 21 */
600	{ .mfi
601	nop	__LINE__
602	FMA	f69   = f37, f48, f69	// A6 * B1
603	nop	__LINE__
604	}
605	{ .mfi
606	nop	__LINE__
607	FMA	f77   = f37, f49, f77	// A6 * B2
608	nop	__LINE__
609	}
610	;;
611/* 22 */
612	{ .mfi
613	nop	__LINE__
614	FMA	f85   = f37, f50, f85	// A6 * B3
615	nop	__LINE__
616	}
617	{ .mfi
618	nop	__LINE__
619	FMA	f93   = f37, f51, f93	// A6 * B4
620	nop	__LINE__
621	}
622	;;
623/* 23 */
624	{ .mfi
625	nop	__LINE__
626	FMA	f101  = f37, f52, f101	// A6 * B5
627	nop	__LINE__
628	}
629	{ .mfi
630	nop	__LINE__
631	FMA	f109  = f37, f53, f109	// A6 * B6
632	nop	__LINE__
633	}
634	;;
635/* 24 */
636	{ .mfi
637	nop	__LINE__
638	FMA	f117  = f37, f54, f117	// A6 * B7
639	nop	__LINE__
640	}
641	{ .mfi
642	nop	__LINE__
643	FMA	f125  = f37, f55, f125	// A6 * B8
644	nop	__LINE__
645	}
646	;;
647/* 25 */
648	{ .mfi
649	nop	__LINE__
650	FMA	f70   = f38, f48, f70	// A7 * B1
651	nop	__LINE__
652	}
653	{ .mfi
654	nop	__LINE__
655	FMA	f78   = f38, f49, f78	// A7 * B2
656	nop	__LINE__
657	}
658	;;
659/* 26 */
660	{ .mfi
661	nop	__LINE__
662	FMA	f86   = f38, f50, f86	// A7 * B3
663	nop	__LINE__
664	}
665	{ .mfi
666	nop	__LINE__
667	FMA	f94   = f38, f51, f94	// A7 * B4
668	nop	__LINE__
669	}
670	;;
671/* 27 */
672	{ .mfi
673	nop	__LINE__
674	FMA	f102  = f38, f52, f102	// A7 * B5
675	nop	__LINE__
676	}
677	{ .mfi
678	nop	__LINE__
679	FMA	f110  = f38, f53, f110	// A7 * B6
680	nop	__LINE__
681	}
682	;;
683/* 28 */
684	{ .mfi
685	nop	__LINE__
686	FMA	f118  = f38, f54, f118	// A7 * B7
687	nop	__LINE__
688	}
689	{ .mfi
690	nop	__LINE__
691	FMA	f126  = f38, f55, f126	// A7 * B8
692	nop	__LINE__
693	}
694	;;
695/* 29 */
696	{ .mfi
697	nop	__LINE__
698	FMA	f71   = f39, f48, f71	// A8 * B1
699	nop	__LINE__
700	}
701	{ .mfi
702	nop	__LINE__
703	FMA	f79   = f39, f49, f79	// A8 * B2
704	nop	__LINE__
705	}
706	;;
/* Groups 30-39: under p4 (another trip follows) reload the first-step
 * operands f32..f39 / f48..f55 as their current values are used up. */
707/* 30 */
708	{ .mfi
709	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
710	FMA	f87   = f39, f50, f87	// A8 * B3
711	nop	__LINE__
712	}
713	{ .mfi
714	nop	__LINE__
715	FMA	f95   = f39, f51, f95	// A8 * B4
716	nop	__LINE__
717	}
718	;;
719/* 31 */
720	{ .mfi
721	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
722	FMA	f103  = f39, f52, f103	// A8 * B5
723	nop	__LINE__
724	}
725	{ .mfi
726	nop	__LINE__
727	FMA	f111  = f39, f53, f111	// A8 * B6
728	nop	__LINE__
729	}
730	;;
731/* 32 */
732	{ .mfi
733	nop	__LINE__
734	FMA	f119  = f39, f54, f119	// A8 * B7
735	nop	__LINE__
736	}
737	{ .mfi
738	nop	__LINE__
739	FMA	f127  = f39, f55, f127	// A8 * B8
740	nop	__LINE__
741	}
742	;;
/* Groups 33-64: second k-step, every FMA predicated on p3. */
743/* 33 */
744	{ .mfi
745	nop	__LINE__
746	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
747	nop	__LINE__
748	}
749	{ .mfi
750	nop	__LINE__
751	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
752	nop	__LINE__
753	}
754	;;
755/* 34 */
756	{ .mfi
757	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
758	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
759	nop	__LINE__
760	}
761	{ .mfi
762	nop	__LINE__
763	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
764	nop	__LINE__
765	}
766	;;
767/* 35 */
768	{ .mfi
769	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
770	(p3) FMA	f96   = f40, f60, f96	// A1 * B5
771	nop	__LINE__
772	}
773	{ .mfi
774	nop	__LINE__
775	(p3) FMA	f104  = f40, f61, f104	// A1 * B6
776	nop	__LINE__
777	}
778	;;
779/* 36 */
780	{ .mfi
781	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
782	(p3) FMA	f112  = f40, f62, f112	// A1 * B7
783	nop	__LINE__
784	}
785	{ .mfi
786	nop	__LINE__
787	(p3) FMA	f120  = f40, f63, f120	// A1 * B8
788	nop	__LINE__
789	}
790	;;
791/* 37 */
792	{ .mfi
793	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
794	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
795	nop	__LINE__
796	}
797	{ .mfi
798	nop	__LINE__
799	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
800	nop	__LINE__
801	}
802	;;
803/* 38 */
804	{ .mfi
805	(p4) LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
806	(p3) FMA	f81   = f41, f58, f81	// A2 * B3
807	nop	__LINE__
808	}
809	{ .mfi
810	nop	__LINE__
811	(p3) FMA	f89   = f41, f59, f89	// A2 * B4
812	nop	__LINE__
813	}
814	;;
815/* 39 */
816	{ .mfi
817	(p4) LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
818	(p3) FMA	f97   = f41, f60, f97	// A2 * B5
819	nop	__LINE__
820	}
821	{ .mfi
822	nop	__LINE__
823	(p3) FMA	f105  = f41, f61, f105	// A2 * B6
824	nop	__LINE__
825	}
826	;;
/* Groups 40-64: under p5 (final trip) begin loading the C tile, two values
 * per group, one through Cn and one through Cn + 4 * SIZE. */
827/* 40 */
828	{ .mfi
829	(p5) LDFD	f6  = [C1 ], SIZE
830	(p3) FMA	f113  = f41, f62, f113	// A2 * B7
831	nop	__LINE__
832	}
833	{ .mfi
834	(p5) LDFD	f7  = [C9 ], SIZE
835	(p3) FMA	f121  = f41, f63, f121	// A2 * B8
836	nop	__LINE__
837	}
838	;;
839 /* 41 */
840	{ .mfi
841	(p5) LDFD	f10 = [C1 ], SIZE
842	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
843	nop	__LINE__
844	}
845	{ .mfi
846	(p5) LDFD	f11 = [C9 ], SIZE
847	(p3) FMA	f74   = f42, f57, f74	// A3 * B2
848	nop	__LINE__
849	}
850	;;
851/* 42 */
852	{ .mfi
853	(p5) LDFD	f12 = [C1 ], SIZE
854	(p3) FMA	f82   = f42, f58, f82	// A3 * B3
855	nop	__LINE__
856	}
857	{ .mfi
858	(p5) LDFD	f13 = [C9 ], SIZE
859	(p3) FMA	f90   = f42, f59, f90	// A3 * B4
860	nop	__LINE__
861	}
862	;;
863/* 43 */
864	{ .mfi
865	(p5) LDFD	f14 = [C1 ], 5 * SIZE
866	(p3) FMA	f98   = f42, f60, f98	// A3 * B5
867	nop	__LINE__
868	}
869	{ .mfi
870	(p5) LDFD	f15 = [C9 ], 5 * SIZE
871	(p3) FMA	f106  = f42, f61, f106	// A3 * B6
872	nop	__LINE__
873	}
874	;;
875/* 44 */
876	{ .mfi
877	(p5) LDFD	f16 = [C1 ], SIZE
878	(p3) FMA	f114  = f42, f62, f114	// A3 * B7
879	nop	__LINE__
880	}
881	{ .mfi
882	(p5) LDFD	f17 = [C9 ], SIZE
883	(p3) FMA	f122  = f42, f63, f122	// A3 * B8
884	nop	__LINE__
885	}
886	;;
887/* 45 */
888	{ .mfi
889	(p5) LDFD	f18 = [C1 ], SIZE
890	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
891	nop	__LINE__
892	}
893	{ .mfi
894	(p5) LDFD	f19 = [C9 ], SIZE
895	(p3) FMA	f75   = f43, f57, f75	// A4 * B2
896	nop	__LINE__
897	}
898	;;
899/* 46 */
900	{ .mfi
901	(p5) LDFD	f20 = [C1 ], SIZE
902	(p3) FMA	f83   = f43, f58, f83	// A4 * B3
903	nop	__LINE__
904	}
905	{ .mfi
906	(p5) LDFD	f21 = [C9 ], SIZE
907	(p3) FMA	f91   = f43, f59, f91	// A4 * B4
908	nop	__LINE__
909	}
910	;;
911/* 47 */
912	{ .mfi
913	(p5) LDFD	f22 = [C1 ], - 11 * SIZE
914	(p3) FMA	f99   = f43, f60, f99	// A4 * B5
915	nop	__LINE__
916	}
917	{ .mfi
918	(p5) LDFD	f23 = [C9 ], - 11 * SIZE
919	(p3) FMA	f107  = f43, f61, f107	// A4 * B6
920	nop	__LINE__
921	}
922	;;
923/* 48 */
924	{ .mfi
925	(p5) LDFD	f24 = [C2 ], SIZE
926	(p3) FMA	f115  = f43, f62, f115	// A4 * B7
927	nop	__LINE__
928	}
929	{ .mfi
930	(p5) LDFD	f25 = [C10], SIZE
931	(p3) FMA	f123  = f43, f63, f123	// A4 * B8
932	nop	__LINE__
933	}
934	;;
935/* 49 */
936	{ .mfi
937	(p5) LDFD	f26 = [C2 ], SIZE
938	(p3) FMA	f68   = f44, f56, f68	// A5 * B1
939	nop	__LINE__
940	}
941	{ .mfi
942	(p5) LDFD	f27 = [C10], SIZE
943	(p3) FMA	f76   = f44, f57, f76	// A5 * B2
944	nop	__LINE__
945	}
946	;;
947/* 50 */
948	{ .mfi
949	(p5) LDFD	f28 = [C2 ], SIZE
950	(p3) FMA	f84   = f44, f58, f84	// A5 * B3
951	nop	__LINE__
952	}
953	{ .mfi
954	(p5) LDFD	f29 = [C10], SIZE
955	(p3) FMA	f92   = f44, f59, f92	// A5 * B4
956	nop	__LINE__
957	}
958	;;
959/* 51 */
960	{ .mfi
961	(p5) LDFD	f30 = [C2 ], 5 * SIZE
962	(p3) FMA	f100  = f44, f60, f100	// A5 * B5
963	nop	__LINE__
964	}
965	{ .mfi
966	(p5) LDFD	f31 = [C10], 5 * SIZE
967	(p3) FMA	f108  = f44, f61, f108	// A5 * B6
968	nop	__LINE__
969	}
970	;;
971/* 52 */
972	{ .mfi
973	(p5) LDFD	f32 = [C2 ], SIZE
974	(p3) FMA	f116  = f44, f62, f116	// A5 * B7
975	nop	__LINE__
976	}
977	{ .mfi
978	(p5) LDFD	f33 = [C10], SIZE
979	(p3) FMA	f124  = f44, f63, f124	// A5 * B8
980	nop	__LINE__
981	}
982	;;
983/* 53 */
984	{ .mfi
985	(p5) LDFD	f34 = [C2 ], SIZE
986	(p3) FMA	f69   = f45, f56, f69	// A6 * B1
987	nop	__LINE__
988	}
989	{ .mfi
990	(p5) LDFD	f35 = [C10], SIZE
991	(p3) FMA	f77   = f45, f57, f77	// A6 * B2
992	nop	__LINE__
993	}
994	;;
995/* 54 */
996	{ .mfi
997	(p5) LDFD	f36 = [C2 ], SIZE
998	(p3) FMA	f85   = f45, f58, f85	// A6 * B3
999	nop	__LINE__
1000	}
1001	{ .mfi
1002	(p5) LDFD	f37 = [C10], SIZE
1003	(p3) FMA	f93   = f45, f59, f93	// A6 * B4
1004	nop	__LINE__
1005	}
1006	;;
1007/* 55 */
1008	{ .mfi
1009	(p5) LDFD	f38 = [C2 ], - 11 * SIZE
1010 	(p3) FMA	f101  = f45, f60, f101	// A6 * B5
1011	nop	__LINE__
1012	}
1013	{ .mfi
1014	(p5) LDFD	f39 = [C10], - 11 * SIZE
1015	(p3) FMA	f109  = f45, f61, f109	// A6 * B6
1016	nop	__LINE__
1017	}
1018	;;
1019/* 56 */
1020	{ .mfi
1021	(p5) LDFD	f48 = [C3 ], SIZE
1022	(p3) FMA	f117  = f45, f62, f117	// A6 * B7
1023	nop	__LINE__
1024	}
1025	{ .mfi
1026	(p5) LDFD	f49 = [C11], SIZE
1027	(p3) FMA	f125  = f45, f63, f125	// A6 * B8
1028	nop	__LINE__
1029	}
1030	;;
1031/* 57 */
1032	{ .mfi
1033	(p5) LDFD	f50 = [C3 ], SIZE
1034	(p3) FMA	f70   = f46, f56, f70	// A7 * B1
1035	nop	__LINE__
1036	}
1037	{ .mfi
1038	(p5) LDFD	f51 = [C11], SIZE
1039	(p3) FMA	f78   = f46, f57, f78	// A7 * B2
1040	nop	__LINE__
1041	}
1042	;;
1043/* 58 */
1044	{ .mfi
1045	(p5) LDFD	f52 = [C3 ], SIZE
1046	(p3) FMA	f86   = f46, f58, f86	// A7 * B3
1047	nop	__LINE__
1048	}
1049	{ .mfi
1050	(p5) LDFD	f53 = [C11], SIZE
1051	(p3) FMA	f94   = f46, f59, f94	// A7 * B4
1052	nop  __LINE__
1053	}
1054	;;
1055/* 59 */
1056	{ .mfi
1057	(p5) LDFD	f54 = [C3 ], 5 * SIZE
1058	(p3) FMA	f102  = f46, f60, f102	// A7 * B5
1059	nop	__LINE__
1060	}
1061	{ .mfi
1062	(p5) LDFD	f55 = [C11], 5 * SIZE
1063	(p3) FMA	f110  = f46, f61, f110	// A7 * B6
1064	nop	__LINE__
1065	}
1066	;;
1067/* 60 */
1068	{ .mfi
1069	(p5) LDFD	f40 = [C3 ], SIZE
1070	(p3) FMA	f118  = f46, f62, f118	// A7 * B7
1071	nop	__LINE__
1072	}
1073	{ .mfi
1074	(p5) LDFD	f41 = [C11], SIZE
1075	(p3) FMA	f126  = f46, f63, f126	// A7 * B8
1076	nop	__LINE__
1077	}
1078	;;
1079/* 61 */
1080	{ .mfi
1081	(p5) LDFD	f42 = [C3 ], SIZE
1082	(p3) FMA	f71   = f47, f56, f71	// A8 * B1
1083	nop	__LINE__
1084	}
1085	{ .mfi
1086	(p5) LDFD	f43 = [C11], SIZE
1087	(p3) FMA	f79   = f47, f57, f79	// A8 * B2
1088	nop	__LINE__
1089	}
1090	;;
1091/* 62 */
1092	{ .mfi
1093	(p5) LDFD	f44 = [C3 ], SIZE
1094	(p3) FMA	f87   = f47, f58, f87	// A8 * B3
1095	nop	__LINE__
1096	}
1097	{ .mfi
1098	(p5) LDFD	f45 = [C11], SIZE
1099	(p3) FMA	f95   = f47, f59, f95	// A8 * B4
1100	nop	__LINE__
1101	}
1102	;;
1103/* 63 */
1104	{ .mfi
1105	(p5) LDFD	f46 = [C3 ], - 11 * SIZE
1106	(p3) FMA	f103  = f47, f60, f103	// A8 * B5
1107	nop	__LINE__
1108	}
1109	{ .mfi
1110	(p5) LDFD	f56 = [C11], - 11 * SIZE
1111	(p3) FMA	f111  = f47, f61, f111	// A8 * B6
1112	nop	__LINE__
1113	}
1114	;;
/* Group 64: decrement the software copy of the trip counter and loop while
 * ar.lc is non-zero. */
1115/* 64 */
1116	{ .mfi
1117	(p5) LDFD	f57  = [C4 ], SIZE
1118	(p3) FMA	f119  = f47, f62, f119	// A8 * B7
1119	adds	L = -1, L
1120	}
1121	{ .mfb
1122	(p5) LDFD	f58  = [C12], SIZE
1123	(p3) FMA	f127  = f47, f63, f127	// A8 * B8
1124	br.cloop.sptk.few .L012
1125	}
1126	;;
1127.L013:
1128	{ .mmf
1129	(p5) LDFD	f59 = [C4 ], SIZE
1130	(p5) LDFD	f60 = [C12], SIZE
1131	FMA	f6   = ALPHA_R, f64, f6
1132	}
1133	{ .mmf
1134	cmp.ne	p6, p0 = 1, I
1135	nop	__LINE__
1136	FMA	f7   = ALPHA_R, f66, f7
1137	}
1138	;;
1139	{ .mmf
1140	(p5) LDFD	f61 = [C4 ], SIZE
1141	(p5) LDFD	f62 = [C12], SIZE
1142	FMA	f10  = ALPHA_I, f64, f10
1143	}
1144	{ .mmf
1145	nop	__LINE__
1146	nop	__LINE__
1147	FMA	f11  = ALPHA_I, f66, f11
1148	}
1149	;;
1150	{ .mmf
1151	(p5) LDFD	f63 = [C4 ], 5 * SIZE
1152	(p5) LDFD	f47 = [C12], 5 * SIZE
1153	FMA	f12  = ALPHA_R, f65, f12
1154	}
1155	{ .mmf
1156	nop	__LINE__
1157	nop	__LINE__
1158	FMA	f13  = ALPHA_R, f67, f13
1159	}
1160	;;
1161	{ .mfi
1162	(p5) LDFD	f64 = [C4 ], SIZE
1163	FMA	f14  = ALPHA_I, f65, f14
1164	nop	__LINE__
1165	}
1166	{ .mfi
1167	(p5) LDFD	f65 = [C12], SIZE
1168	FMA	f15  = ALPHA_I, f67, f15
1169	nop	__LINE__
1170	}
1171	;;
1172	{ .mmf
1173	STFD	[C1 ] = f6, SIZE
1174	STFD	[C9 ] = f7, SIZE
1175	FMA	f16  = ALPHA_R, f68, f16
1176	}
1177	{ .mmf
1178	(p5) LDFD	f6 = [C4 ], SIZE
1179	(p5) LDFD	f7 = [C12], SIZE
1180	FMA	f17  = ALPHA_R, f70, f17
1181	}
1182	;;
1183	{ .mmf
1184	STFD	[C1 ] = f10, SIZE
1185	STFD	[C9 ] = f11, SIZE
1186	FMA	f18  = ALPHA_I, f68, f18
1187	}
1188	{ .mmf
1189	(p5) LDFD	f10 = [C4 ], SIZE
1190	(p5) LDFD	f11 = [C12], SIZE
1191	FMA	f19  = ALPHA_I, f70, f19
1192	}
1193	;;
1194	{ .mmf
1195	STFD	[C1 ] = f12, SIZE
1196	STFD	[C9 ] = f13, SIZE
1197	FMA	f20  = ALPHA_R, f69, f20
1198	}
1199	{ .mmf
1200	(p5) LDFD	f12 = [C4 ], - 11 * SIZE
1201	(p5) LDFD	f13 = [C12], - 11 * SIZE
1202	FMA	f21  = ALPHA_R, f71, f21
1203	}
1204	;;
1205	{ .mmf
1206	STFD	[C1 ] = f14, 5 * SIZE
1207	STFD	[C9 ] = f15, 5 * SIZE
1208	FMA	f22  = ALPHA_I, f69, f22
1209	}
1210	{ .mmf
1211	(p5) LDFD	f14 = [C5 ], SIZE
1212	(p5) LDFD	f15 = [C13], SIZE
1213	FMA	f23  = ALPHA_I, f71, f23
1214	}
1215	;;
1216	{ .mmf
1217	STFD	[C1 ] = f16, SIZE
1218	STFD	[C9 ] = f17, SIZE
1219	FMA	f24  = ALPHA_R, f72, f24
1220	}
1221	{ .mmf
1222	(p5) LDFD	f16 = [C5 ], SIZE
1223	(p5) LDFD	f17 = [C13], SIZE
1224	FMA	f25  = ALPHA_R, f74, f25
1225	}
1226	;;
1227	{ .mmf
1228	STFD	[C1 ] = f18, SIZE
1229	STFD	[C9 ] = f19, SIZE
1230	FMA	f26  = ALPHA_I, f72, f26
1231	}
1232	{ .mmf
1233	(p5) LDFD	f18 = [C5 ], SIZE
1234	(p5) LDFD	f19 = [C13], SIZE
1235	FMA	f27  = ALPHA_I, f74, f27
1236	}
1237	;;
1238	{ .mmf
1239	STFD	[C1 ] = f20, SIZE
1240	STFD	[C9 ] = f21, SIZE
1241	FMA	f28  = ALPHA_R, f73, f28
1242	}
1243	{ .mmf
1244	(p5) LDFD	f20 = [C5 ], 5 * SIZE
1245	(p5) LDFD	f21 = [C13], 5 * SIZE
1246	FMA	f29  = ALPHA_R, f75, f29
1247	}
1248	;;
1249	{ .mmf
1250	STFD	[C1 ] = f22, 5 * SIZE
1251	STFD	[C9 ] = f23, 5 * SIZE
1252	FMA	f30  = ALPHA_I, f73, f30
1253	}
1254	{ .mmf
1255	(p5) LDFD	f22 = [C5 ], SIZE
1256	(p5) LDFD	f23 = [C13], SIZE
1257	FMA	f31  = ALPHA_I, f75, f31
1258	}
1259	;;
1260	{ .mmf
1261	STFD	[C2 ] = f24, SIZE
1262	STFD	[C10] = f25, SIZE
1263	FMA	f32  = ALPHA_R, f76, f32
1264	}
1265	{ .mmf
1266	(p5) LDFD	f24 = [C5 ], SIZE
1267	(p5) LDFD	f25 = [C13], SIZE
1268	FMA	f33  = ALPHA_R, f78, f33
1269	}
1270	;;
1271	{ .mmf
1272	STFD	[C2 ] = f26, SIZE
1273	STFD	[C10] = f27, SIZE
1274	FMA	f34  = ALPHA_I, f76, f34
1275	}
1276	{ .mmf
1277	(p5) LDFD	f26 = [C5 ], SIZE
1278	(p5) LDFD	f27 = [C13], SIZE
1279	FMA	f35  = ALPHA_I, f78, f35
1280	}
1281	;;
1282	{ .mmf
1283	STFD	[C2 ] = f28, SIZE
1284	STFD	[C10] = f29, SIZE
1285	FMA	f36  = ALPHA_R, f77, f36
1286	}
1287	{ .mmf
1288	(p5) LDFD	f28 = [C5 ], - 11 * SIZE
1289	(p5) LDFD	f29 = [C13], - 11 * SIZE
1290	FMA	f37  = ALPHA_R, f79, f37
1291	}
1292	;;
1293	{ .mmf
1294	STFD	[C2 ] = f30, 5 * SIZE
1295	STFD	[C10] = f31, 5 * SIZE
1296	FMA	f38  = ALPHA_I, f77, f38
1297	}
1298	{ .mmf
1299	(p5) LDFD	f30 = [C6 ], SIZE
1300	(p5) LDFD	f31 = [C14], SIZE
1301	FMA	f39  = ALPHA_I, f79, f39
1302	}
1303	;;
1304	{ .mmf
1305	STFD	[C2 ] = f32, SIZE
1306	STFD	[C10] = f33, SIZE
1307	FMA	f48  = ALPHA_R, f80, f48
1308	}
1309	{ .mmf
1310	(p5) LDFD	f32 = [C6 ], SIZE
1311	(p5) LDFD	f33 = [C14], SIZE
1312	FMA	f49  = ALPHA_R, f82, f49
1313	}
1314	;;
1315	{ .mmf
1316	STFD	[C2 ] = f34, SIZE
1317	STFD	[C10] = f35, SIZE
1318	FMA	f50  = ALPHA_I, f80, f50
1319	}
1320	{ .mmf
1321	(p5) LDFD	f34 = [C6 ], SIZE
1322	(p5) LDFD	f35 = [C14], SIZE
1323	FMA	f51  = ALPHA_I, f82, f51
1324	}
1325	;;
1326	{ .mmf
1327	STFD	[C2 ] = f36, SIZE
1328	STFD	[C10] = f37, SIZE
1329	FMA	f52  = ALPHA_R, f81, f52
1330	}
1331	{ .mmf
1332	(p5) LDFD	f36 = [C6 ], 5 * SIZE
1333	(p5) LDFD	f37 = [C14], 5 * SIZE
1334	FMA	f53  = ALPHA_R, f83, f53
1335	}
1336	;;
1337	{ .mmf
1338	STFD	[C2 ] = f38, 5 * SIZE
1339	STFD	[C10] = f39, 5 * SIZE
1340	FMA	f54  = ALPHA_I, f81, f54
1341	}
1342	{ .mmf
1343	(p5) LDFD	f38 = [C6 ], SIZE
1344	(p5) LDFD	f39 = [C14], SIZE
1345	FMA	f55  = ALPHA_I, f83, f55
1346	}
1347	;;
1348	{ .mmf
1349	STFD	[C3 ] = f48, SIZE
1350	STFD	[C11] = f49, SIZE
1351	FMA	f40  = ALPHA_R, f84, f40
1352	}
1353	{ .mmf
1354	(p5) LDFD	f48 = [C6 ], SIZE
1355	(p5) LDFD	f49 = [C14], SIZE
1356	FMA	f41  = ALPHA_R, f86, f41
1357	}
1358	;;
1359	{ .mmf
1360	STFD	[C3 ] = f50, SIZE
1361	STFD	[C11] = f51, SIZE
1362	FMA	f42  = ALPHA_I, f84, f42
1363	}
1364	{ .mmf
1365	(p5) LDFD	f50 = [C6 ], SIZE
1366	(p5) LDFD	f51 = [C14], SIZE
1367	FMA	f43  = ALPHA_I, f86, f43
1368	}
1369	;;
1370	{ .mmf
1371	STFD	[C3 ] = f52, SIZE
1372	STFD	[C11] = f53, SIZE
1373	FMA	f44  = ALPHA_R, f85, f44
1374	}
1375	{ .mmf
1376	(p5) LDFD	f52 = [C6 ], - 11 * SIZE
1377	(p5) LDFD	f53 = [C14], - 11 * SIZE
1378	FMA	f45  = ALPHA_R, f87, f45
1379	}
1380	;;
1381	{ .mmf
1382	STFD	[C3 ] = f54, 5 * SIZE
1383	STFD	[C11] = f55, 5 * SIZE
1384	FMA	f46  = ALPHA_I, f85, f46
1385	}
1386	{ .mmf
1387	(p5) LDFD	f54 = [C7 ], SIZE
1388	(p5) LDFD	f55 = [C15], SIZE
1389	FMA	f56  = ALPHA_I, f87, f56
1390	}
1391	;;
1392	{ .mmf
1393	STFD	[C3 ] = f40, SIZE
1394	STFD	[C11] = f41, SIZE
1395	FMA	f57  = ALPHA_R, f88, f57
1396	}
1397	{ .mmf
1398	(p5) LDFD	f40 = [C7 ], SIZE
1399	(p5) LDFD	f41 = [C15], SIZE
1400	FMA	f58  = ALPHA_R, f90, f58
1401	}
1402	;;
1403	{ .mmf
1404	STFD	[C3 ] = f42, SIZE
1405	STFD	[C11] = f43, SIZE
1406	FMA	f59  = ALPHA_I, f88, f59
1407	}
1408	{ .mmf
1409	(p5) LDFD	f42 = [C7 ], SIZE
1410	(p5) LDFD	f43 = [C15], SIZE
1411	FMA	f60  = ALPHA_I, f90, f60
1412	}
1413	;;
1414	{ .mmf
1415	STFD	[C3 ] = f44, SIZE
1416	STFD	[C11] = f45, SIZE
1417	FMA	f61  = ALPHA_R, f89, f61
1418	}
1419	{ .mmf
1420	(p5) LDFD	f44 = [C7 ], 5 * SIZE
1421	(p5) LDFD	f45 = [C15], 5 * SIZE
1422	FMA	f62  = ALPHA_R, f91, f62
1423	}
1424	;;
1425	{ .mmf
1426	STFD	[C3 ] = f46, 5 * SIZE
1427	STFD	[C11] = f56, 5 * SIZE
1428	FMA	f63  = ALPHA_I, f89, f63
1429	}
1430	{ .mmf
1431	(p5) LDFD	f46 = [C7 ], SIZE
1432	(p5) LDFD	f56 = [C15], SIZE
1433	FMA	f47  = ALPHA_I, f91, f47
1434	}
1435	;;
1436	{ .mmf
1437	STFD	[C4 ] = f57, SIZE
1438	STFD	[C12] = f58, SIZE
1439	FMA	f64  = ALPHA_R, f92, f64
1440	}
1441	{ .mmf
1442	(p5) LDFD	f57 = [C7 ], SIZE
1443	(p5) LDFD	f58 = [C15], SIZE
1444	FMA	f65  = ALPHA_R, f94, f65
1445	}
1446	;;
1447	{ .mmf
1448	STFD	[C4 ] = f59, SIZE
1449	STFD	[C12] = f60, SIZE
1450	FMA	f6   = ALPHA_I, f92, f6
1451	}
1452	{ .mmf
1453	(p5) LDFD	f59 = [C7 ], SIZE
1454	(p5) LDFD	f60 = [C15], SIZE
1455	FMA	f7   = ALPHA_I, f94, f7
1456	}
1457	;;
1458	{ .mmf
1459	STFD	[C4 ] = f61, SIZE
1460	STFD	[C12] = f62, SIZE
1461	FMA	f10  = ALPHA_R, f93, f10
1462	}
1463	{ .mmf
1464	(p5) LDFD	f61 = [C7 ], - 11 * SIZE
1465	(p5) LDFD	f62 = [C15], - 11 * SIZE
1466	FMA	f11  = ALPHA_R, f95, f11
1467	}
1468	;;
1469	{ .mmf
1470	STFD	[C4 ] = f63, 5 * SIZE
1471	STFD	[C12] = f47, 5 * SIZE
1472	FMA	f12  = ALPHA_I, f93, f12
1473	}
1474	{ .mmf
1475	(p5) LDFD	f63 = [C8 ], SIZE
1476	(p5) LDFD	f47 = [C16], SIZE
1477	FMA	f13  = ALPHA_I, f95, f13
1478	}
1479	;;
1480	{ .mmf
1481	STFD	[C4 ] = f64, SIZE
1482	STFD	[C12] = f65, SIZE
1483	FMA	f14  = ALPHA_R, f96, f14
1484	}
1485	{ .mmf
1486	(p5) LDFD	f64 = [C8 ], SIZE
1487	(p5) LDFD	f65 = [C16], SIZE
1488	FMA	f15  = ALPHA_R, f98, f15
1489	}
1490	;;
1491	{ .mmf
1492	STFD	[C4 ] = f6, SIZE
1493	STFD	[C12] = f7, SIZE
1494	FMA	f16  = ALPHA_I, f96, f16
1495	}
1496	{ .mmf
1497	(p5) LDFD	f6  = [C8 ], SIZE
1498	(p5) LDFD	f7  = [C16], SIZE
1499	FMA	f17  = ALPHA_I, f98, f17
1500	}
1501	;;
1502	{ .mmf
1503	STFD	[C4 ] = f10, SIZE
1504	STFD	[C12] = f11, SIZE
1505	FMA	f18  = ALPHA_R, f97, f18
1506	}
1507	{ .mmf
1508	(p5) LDFD	f10 = [C8 ], 5 * SIZE
1509	(p5) LDFD	f11 = [C16], 5 * SIZE
1510	FMA	f19  = ALPHA_R, f99, f19
1511	}
1512	;;
1513	{ .mmf
1514	STFD	[C4 ] = f12, 5 * SIZE
1515	STFD	[C12] = f13, 5 * SIZE
1516	FMA	f20  = ALPHA_I, f97, f20
1517	}
1518	{ .mmf
1519	(p5) LDFD	f12 = [C8 ], SIZE
1520	(p5) LDFD	f13 = [C16], SIZE
1521	FMA	f21  = ALPHA_I, f99, f21
1522	}
1523	;;
1524	{ .mmf
1525	STFD	[C5 ] = f14, SIZE
1526	STFD	[C13] = f15, SIZE
1527	FMA	f22  = ALPHA_R, f100, f22
1528	}
1529	{ .mmf
1530	(p5) LDFD	f14 = [C8 ], SIZE
1531	(p5) LDFD	f15 = [C16], SIZE
1532	FMA	f23  = ALPHA_R, f102, f23
1533	}
1534	;;
1535	{ .mmf
1536	STFD	[C5 ] = f16, SIZE
1537	STFD	[C13] = f17, SIZE
1538	FMA	f24  = ALPHA_I, f100, f24
1539	}
1540	{ .mmf
1541	(p5) LDFD	f16 = [C8 ], SIZE
1542	(p5) LDFD	f17 = [C16], SIZE
1543	FMA	f25  = ALPHA_I, f102, f25
1544	}
1545	;;
1546	{ .mmf
1547	STFD	[C5 ] = f18, SIZE
1548	STFD	[C13] = f19, SIZE
1549	FMA	f26  = ALPHA_R, f101, f26
1550	}
1551	{ .mmf
1552	(p5) LDFD	f18 = [C8 ], - 11 * SIZE
1553	(p5) LDFD	f19 = [C16], - 11 * SIZE
1554	FMA	f27  = ALPHA_R, f103, f27
1555	}
1556	;;
1557	{ .mmf
1558	STFD	[C5 ] = f20, 5 * SIZE
1559	STFD	[C13] = f21, 5 * SIZE
1560	FMA	f28  = ALPHA_I, f101, f28
1561	}
1562	{ .mmf
1563	nop	__LINE__
1564	nop	__LINE__
1565	FMA	f29  = ALPHA_I, f103, f29
1566	}
1567	;;
1568	{ .mmf
1569	STFD	[C5 ] = f22, SIZE
1570	STFD	[C13] = f23, SIZE
1571	FMA	f30  = ALPHA_R, f104, f30
1572	}
1573	{ .mmf
1574	nop	__LINE__
1575	nop	__LINE__
1576	FMA	f31  = ALPHA_R, f106, f31
1577	}
1578	;;
1579	{ .mmf
1580	STFD	[C5 ] = f24, SIZE
1581	STFD	[C13] = f25, SIZE
1582	FMA	f32  = ALPHA_I, f104, f32
1583	}
1584	{ .mmf
1585	nop	__LINE__
1586	nop	__LINE__
1587	FMA	f33  = ALPHA_I, f106, f33
1588	}
1589	;;
1590	{ .mmf
1591	STFD	[C5 ] = f26, SIZE
1592	STFD	[C13] = f27, SIZE
1593	FMA	f34  = ALPHA_R, f105, f34
1594	}
1595	{ .mmf
1596	nop	__LINE__
1597	nop	__LINE__
1598	FMA	f35  = ALPHA_R, f107, f35
1599	}
1600	;;
1601	{ .mmf
1602	STFD	[C5 ] = f28, 5 * SIZE
1603	STFD	[C13] = f29, 5 * SIZE
1604	FMA	f36  = ALPHA_I, f105, f36
1605	}
1606	{ .mmf
1607	nop	__LINE__
1608	nop	__LINE__
1609	FMA	f37  = ALPHA_I, f107, f37
1610	}
1611	;;
1612	{ .mmf
1613	STFD	[C6 ] = f30, SIZE
1614	STFD	[C14] = f31, SIZE
1615	FMA	f38  = ALPHA_R, f108, f38
1616	}
1617	{ .mmf
1618	nop	__LINE__
1619	nop	__LINE__
1620	FMA	f39  = ALPHA_R, f110, f39
1621	}
1622	;;
1623	{ .mmf
1624	STFD	[C6 ] = f32, SIZE
1625	STFD	[C14] = f33, SIZE
1626	FMA	f48  = ALPHA_I, f108, f48
1627	}
1628	{ .mmf
1629	nop	__LINE__
1630	nop	__LINE__
1631	FMA	f49  = ALPHA_I, f110, f49
1632	}
1633	;;
1634	{ .mmf
1635	STFD	[C6 ] = f34, SIZE
1636	STFD	[C14] = f35, SIZE
1637	FMA	f50  = ALPHA_R, f109, f50
1638	}
1639	{ .mmf
1640	nop	__LINE__
1641	nop	__LINE__
1642	FMA	f51  = ALPHA_R, f111, f51
1643	}
1644	;;
1645	{ .mmf
1646	STFD	[C6 ] = f36, 5 * SIZE
1647	STFD	[C14] = f37, 5 * SIZE
1648	FMA	f52  = ALPHA_I, f109, f52
1649	}
1650	{ .mmf
1651	nop	__LINE__
1652	nop	__LINE__
1653	FMA	f53  = ALPHA_I, f111, f53
1654	}
1655	;;
1656	{ .mmf
1657	STFD	[C6 ] = f38, SIZE
1658	STFD	[C14] = f39, SIZE
1659	FMA	f54  = ALPHA_R, f112, f54
1660	}
1661	{ .mmf
1662	nop	__LINE__
1663	nop	__LINE__
1664	FMA	f55  = ALPHA_R, f114, f55
1665	}
1666	;;
1667	{ .mmf
1668	STFD	[C6 ] = f48, SIZE
1669	STFD	[C14] = f49, SIZE
1670	FMA	f40  = ALPHA_I, f112, f40
1671	}
1672	{ .mmf
1673	nop	__LINE__
1674	nop	__LINE__
1675	FMA	f41  = ALPHA_I, f114, f41
1676	}
1677	;;
1678	{ .mmf
1679	STFD	[C6 ] = f50, SIZE
1680	STFD	[C14] = f51, SIZE
1681	FMA	f42  = ALPHA_R, f113, f42
1682	}
1683	{ .mmf
1684	nop	__LINE__
1685	nop	__LINE__
1686	FMA	f43  = ALPHA_R, f115, f43
1687	}
1688	;;
1689	{ .mmf
1690	STFD	[C6 ] = f52, 5 * SIZE
1691	STFD	[C14] = f53, 5 * SIZE
1692	FMA	f44  = ALPHA_I, f113, f44
1693	}
1694	{ .mmf
1695	nop	__LINE__
1696	nop	__LINE__
1697	FMA	f45  = ALPHA_I, f115, f45
1698	}
1699	;;
1700	{ .mmf
1701	STFD	[C7 ] = f54, SIZE
1702	STFD	[C15] = f55, SIZE
1703	FMA	f46  = ALPHA_R, f116, f46
1704	}
1705	{ .mmf
1706	nop	__LINE__
1707	nop	__LINE__
1708	FMA	f56  = ALPHA_R, f118, f56
1709	}
1710	;;
1711	{ .mmf
1712	STFD	[C7 ] = f40, SIZE
1713	STFD	[C15] = f41, SIZE
1714	FMA	f57  = ALPHA_I, f116, f57
1715	}
1716	{ .mmf
1717	nop	__LINE__
1718	nop	__LINE__
1719	FMA	f58  = ALPHA_I, f118, f58
1720	}
1721	;;
1722	{ .mmf
1723	STFD	[C7 ] = f42, SIZE
1724	STFD	[C15] = f43, SIZE
1725	FMA	f59  = ALPHA_R, f117, f59
1726	}
1727	{ .mmf
1728	nop	__LINE__
1729	nop	__LINE__
1730	FMA	f60  = ALPHA_R, f119, f60
1731	}
1732	;;
1733	{ .mmf
1734	STFD	[C7 ] = f44, 5 * SIZE
1735	STFD	[C15] = f45, 5 * SIZE
1736	FMA	f61  = ALPHA_I, f117, f61
1737	}
1738	{ .mmf
1739	nop	__LINE__
1740	nop	__LINE__
1741	FMA	f62  = ALPHA_I, f119, f62
1742	}
1743	;;
1744	{ .mmf
1745	STFD	[C7 ] = f46, SIZE
1746	STFD	[C15] = f56, SIZE
1747	FMA	f63  = ALPHA_R, f120, f63
1748	}
1749	{ .mmf
1750	nop	__LINE__
1751	nop	__LINE__
1752	FMA	f47  = ALPHA_R, f122, f47
1753	}
1754	;;
1755	{ .mmf
1756	STFD	[C7 ] = f57, SIZE
1757	STFD	[C15] = f58, SIZE
1758	FMA	f64  = ALPHA_I, f120, f64
1759	}
1760	{ .mmf
1761	nop	__LINE__
1762	nop	__LINE__
1763	FMA	f65  = ALPHA_I, f122, f65
1764	}
1765	;;
1766	{ .mmf
1767	STFD	[C7 ] = f59, SIZE
1768	STFD	[C15] = f60, SIZE
1769	FMA	f6   = ALPHA_R, f121, f6
1770	}
1771	{ .mmf
1772	nop	__LINE__
1773	nop	__LINE__
1774	FMA	f7   = ALPHA_R, f123, f7
1775	}
1776	;;
1777	{ .mmf
1778	STFD	[C7 ] = f61, 5 * SIZE
1779	STFD	[C15] = f62, 5 * SIZE
1780	FMA	f10  = ALPHA_I, f121, f10
1781	}
1782	{ .mmf
1783	nop	__LINE__
1784	nop	__LINE__
1785	FMA	f11  = ALPHA_I, f123, f11
1786	}
1787	;;
1788	{ .mmf
1789	STFD	[C8 ] = f63, SIZE
1790	STFD	[C16] = f47, SIZE
1791	FMA	f12  = ALPHA_R, f124, f12
1792	}
1793	{ .mmf
1794	nop	__LINE__
1795	nop	__LINE__
1796	FMA	f13  = ALPHA_R, f126, f13
1797	}
1798	;;
1799	{ .mmf
1800	STFD	[C8 ] = f64, SIZE
1801	STFD	[C16] = f65, SIZE
1802	FMA	f14  = ALPHA_I, f124, f14
1803	}
1804	{ .mmf
1805	nop	__LINE__
1806	nop	__LINE__
1807	FMA	f15  = ALPHA_I, f126, f15
1808	}
1809	;;
1810	{ .mmf
1811	STFD	[C8 ] = f6,  SIZE
1812	STFD	[C16] = f7,  SIZE
1813	FMA	f16  = ALPHA_R, f125, f16
1814	}
1815	{ .mmf
1816	nop	__LINE__
1817	nop	__LINE__
1818	FMA	f17  = ALPHA_R, f127, f17
1819	}
1820	;;
1821	{ .mmf
1822	STFD	[C8 ] = f10, 5 * SIZE
1823	STFD	[C16] = f11, 5 * SIZE
1824	FMA	f18  = ALPHA_I, f125, f18
1825	}
1826	{ .mmf
1827	nop	__LINE__
1828	nop	__LINE__
1829	FMA	f19  = ALPHA_I, f127, f19
1830	}
1831	;;
1832	{ .mmf
1833	STFD	[C8 ] = f12, SIZE
1834	STFD	[C16] = f13, SIZE
1835	mov	f64  = f0
1836	}
1837	{ .mmf
1838	nop	__LINE__
1839	nop	__LINE__
1840	mov	f72  = f0
1841	}
1842	;;
1843	{ .mmf
1844	STFD	[C8 ] = f14, SIZE
1845	STFD	[C16] = f15, SIZE
1846 	mov	f80  = f0
1847	}
1848	{ .mmf
1849	nop	__LINE__
1850	nop	__LINE__
1851	mov	f88  = f0
1852	}
1853	;;
1854	{ .mmf
1855	STFD	[C8 ] = f16, SIZE
1856	STFD	[C16] = f17, SIZE
1857	mov	f96  = f0
1858	}
1859	{ .mmf
1860	nop	__LINE__
1861	nop	__LINE__
1862	mov	f104 = f0
1863	}
1864	;;
1865	{ .mmf
1866	STFD	[C8 ] = f18, 5 * SIZE
1867	STFD	[C16] = f19, 5 * SIZE
1868	mov	f112 = f0
1869	}
1870	{ .mfb
1871	adds	I = -1, I
1872	mov	f120 = f0
1873	(p6)	br.cond.dptk .L011
1874	}
1875	;;
1876
// .L020 -- set-up for the section that runs when bit 2 of M is set.
//
// When bit 2 of M is clear (p6), control goes straight to .L030.
// Otherwise this code
//   * preloads f48-f55 from B / BOFFSET and f32-f35 from AOFFSET,
//   * clears the accumulators f65-f67, f73-f75, ..., f121-f123
//     (f64, f72, ..., f120 are not written here),
//   * sets PREA to AOFFSET + PREFETCHSIZE * SIZE, taken before AOFFSET
//     is advanced by the loads,
//   * loads ar.lc with ((K + 1) >> 1) - 1 for the loop at .L022.
// p3 is forced true and p12 is set when bit 0 of K + 1 is zero; both are
// read inside .L022.
// NOTE(review): LDFPD, SIZE and PREFETCHSIZE are defined outside this view;
// LDFPD presumably loads two adjacent floating-point values -- confirm.
1877.L020:
1878	{ .mfi
1879	cmp.eq	p3, p0 = r0, r0
1880	mov	f89  = f0
1881	tbit.z	p6, p7 = M, 2
1882	}
1883	{ .mfb
1884	nop	__LINE__
1885	mov	f81  = f0
1886	(p6)	br.cond.dptk .L030
1887	}
1888	;;
// From here on the accumulator clearing is spread over spare slots:
// mov from f0 in F slots and setf.d from r0 in M slots.
1889	{ .mfi
1890	LDFPD	f48, f49 = [B]
1891	mov	f65  = f0
1892	nop	__LINE__
1893	}
1894	{ .mfi
1895	adds	BOFFSET = 2 * SIZE, B
1896	mov	f73  = f0
1897	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
1898	}
1899	;;
// L = K + 1
1900	{ .mmf
1901	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
1902	setf.d	f97  = r0
1903	mov	f105 = f0
1904	}
1905	{ .mfi
1906	setf.d	f113 = r0
1907	mov	f121 = f0
1908	adds	L =  1, K
1909	}
1910	;;
// p12 = (bit 0 of K + 1 is zero)
1911	{ .mmf
1912	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
1913	setf.d	f66  = r0
1914	mov	f74  = f0
1915	}
1916	{ .mfi
1917	setf.d	f82  = r0
1918	mov	f90  = f0
1919	tbit.z	p12, p0 = L, 0
1920	}
1921	;;
// L = (K + 1) >> 1
1922	{ .mmf
1923	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
1924	setf.d	f98   = r0
1925	mov	f106  = f0
1926	}
1927	{ .mfi
1928	setf.d	f114 = r0
1929	mov	f122 = f0
1930	shr	L = L, 1
1931	}
1932	;;
// L = ((K + 1) >> 1) - 1
1933	{ .mfi
1934	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
1935	mov	f75  = f0
1936	adds	L =  -1, L
1937	}
1938	{ .mmf
1939	setf.d	f67  = r0
1940	setf.d	f83  = r0
1941	mov	f91  = f0
1942	}
1943	;;
// The loop counter register receives the same value that L holds.
1944	{ .mfi
1945	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
1946	mov	f107 = f0
1947	mov	ar.lc = L
1948	}
1949	{ .mmf
1950	setf.d	f99  = r0
1951	setf.d	f115 = r0
1952	mov	f123 = f0
1953	}
1954	;;
1955	.align 32
1956
// .L022 -- inner loop over K for the section with four A values per step.
//
// Each pass performs up to two multiply-accumulate steps:
//   step 1 (always):   f32-f35 times f48-f55
//   step 2 (under p3): f40-f43 times f56-f63
// Both steps add into the 32 accumulators f64-f67, f72-f75, ..., f120-f123.
//
// Predicates used inside the loop:
//   p3  - true on entry; when p12 is set it is recomputed as (L != 0), so
//         step 2 and its operand loads are skipped on the pass with L == 0.
//   p4  - (L != 0): reload the step 1 operands for the next pass.
//   p5  - (L == 0): one-time work on that pass -- derive C9-C16 as
//         C1-C8 + 4 * SIZE and read the current values at C1/C9, C2/C10
//         and C3/C11 into f6, f7 and f10-f31 for the write-back at .L028.
// L is decremented by hand at the bottom of the loop while br.cloop counts
// ar.lc; the code before this label loads ar.lc from L, so the pass with
// L == 0 is the final one.
// NOTE(review): FMA, LDFD and LDFPD are macros defined outside this view;
// presumably fused multiply-add, single load and pair load -- confirm.
1957.L022:
// Step 1, f32 against f48-f55. PREB is re-derived from BOFFSET on every pass.
1958	{ .mfi
1959	lfetch.nt1	[PREA],  16 * SIZE
1960	FMA	f64   = f32, f48, f64	// A1 * B1
1961	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
1962	}
1963	{ .mfi
1964	nop	__LINE__
1965	FMA	f72   = f32, f49, f72	// A1 * B2
1966	(p12) cmp.ne p3, p0 =  0, L
1967	}
1968	;;
1969	{ .mfi
1970	lfetch.nt1	[PREB],  16 * SIZE
1971	FMA	f80   = f32, f50, f80	// A1 * B3
1972	cmp.ne	p4, p5 =  0, L
1973	}
1974	{ .mfb
1975	nop	__LINE__
1976	FMA	f88   = f32, f51, f88	// A1 * B4
1977	nop	__LINE__
1978	}
1979	;;
// Under p3: fetch the step 2 operands (f40-f43, f56-f63).
// Under p5: C9-C16 = C1-C8 + 4 * SIZE.
1980	{ .mfi
1981	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
1982	FMA	f96   = f32, f52, f96	// A1 * B5
1983	(p5) adds	C9  = 4 * SIZE, C1
1984	}
1985	{ .mfi
1986	nop	__LINE__
1987	FMA	f104  = f32, f53, f104	// A1 * B6
1988	(p5) adds	C10 = 4 * SIZE, C2
1989	}
1990	;;
1991	{ .mfi
1992	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
1993	FMA	f112  = f32, f54, f112	// A1 * B7
1994	(p5) adds	C11 = 4 * SIZE, C3
1995	}
1996	{ .mfi
1997	nop	__LINE__
1998	FMA	f120  = f32, f55, f120	// A1 * B8
1999	(p5) adds	C12 = 4 * SIZE, C4
2000	}
2001	;;
2002	{ .mfi
2003	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
2004	FMA	f65   = f33, f48, f65	// A2 * B1
2005	(p5) adds	C13 = 4 * SIZE, C5
2006	}
2007	{ .mfi
2008	nop	__LINE__
2009	FMA	f73   = f33, f49, f73	// A2 * B2
2010	(p5) adds	C14 = 4 * SIZE, C6
2011	}
2012	;;
2013	{ .mfi
2014	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE
2015	FMA	f81   = f33, f50, f81	// A2 * B3
2016	(p5) adds	C15 = 4 * SIZE, C7
2017	}
2018	{ .mfi
2019	nop	__LINE__
2020	FMA	f89   = f33, f51, f89	// A2 * B4
2021	(p5) adds	C16 = 4 * SIZE, C8
2022	}
2023	;;
2024	{ .mfb
2025	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE
2026	FMA	f97   = f33, f52, f97	// A2 * B5
2027	nop	__LINE__
2028	}
2029	{ .mfb
2030	nop	__LINE__
2031	FMA	f105  = f33, f53, f105	// A2 * B6
2032	nop	__LINE__
2033	}
2034	;;
2035	{ .mfb
2036	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
2037	FMA	f113  = f33, f54, f113	// A2 * B7
2038	nop	__LINE__
2039	}
2040	{ .mfb
2041	nop	__LINE__
2042	FMA	f121  = f33, f55, f121	// A2 * B8
2043	nop	__LINE__
2044	}
2045	;;
2046	{ .mfb
2047	nop	__LINE__
2048	FMA	f66   = f34, f48, f66	// A3 * B1
2049	nop	__LINE__
2050	}
2051	{ .mfb
2052	nop	__LINE__
2053	FMA	f74   = f34, f49, f74	// A3 * B2
2054	nop	__LINE__
2055	}
2056	;;
2057	{ .mfb
2058	nop	__LINE__
2059	FMA	f82   = f34, f50, f82	// A3 * B3
2060	nop	__LINE__
2061	}
2062	{ .mfb
2063	nop	__LINE__
2064	FMA	f90   = f34, f51, f90	// A3 * B4
2065	nop	__LINE__
2066	}
2067	;;
2068	{ .mfb
2069	nop	__LINE__
2070	FMA	f98   = f34, f52, f98	// A3 * B5
2071	nop	__LINE__
2072	}
2073	{ .mfb
2074	nop	__LINE__
2075	FMA	f106  = f34, f53, f106	// A3 * B6
2076	nop	__LINE__
2077	}
2078	;;
2079	{ .mfb
2080	nop	__LINE__
2081	FMA	f114  = f34, f54, f114	// A3 * B7
2082	nop	__LINE__
2083	}
2084	{ .mfb
2085	nop	__LINE__
2086	FMA	f122  = f34, f55, f122	// A3 * B8
2087	nop	__LINE__
2088	}
2089	;;
2090	{ .mfb
2091	nop	__LINE__
2092	FMA	f67   = f35, f48, f67	// A4 * B1
2093	nop	__LINE__
2094	}
2095	{ .mfb
2096	nop	__LINE__
2097	FMA	f75   = f35, f49, f75	// A4 * B2
2098	nop	__LINE__
2099	}
2100	;;
2101	{ .mfb
2102	nop	__LINE__
2103	FMA	f83   = f35, f50, f83	// A4 * B3
2104	nop	__LINE__
2105	}
2106	{ .mfb
2107	nop	__LINE__
2108	FMA	f91   = f35, f51, f91	// A4 * B4
2109	nop	__LINE__
2110	}
2111	;;
// Under p4: refill the step 1 operands (f32-f35, f48-f55) for the next pass
// while the last step 1 products are still being issued.
2112	{ .mfb
2113	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
2114	FMA	f99   = f35, f52, f99	// A4 * B5
2115	nop	__LINE__
2116	}
2117	{ .mfb
2118	nop	__LINE__
2119	FMA	f107  = f35, f53, f107	// A4 * B6
2120	nop	__LINE__
2121	}
2122	;;
2123	{ .mfb
2124	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
2125	FMA	f115  = f35, f54, f115	// A4 * B7
2126	nop	__LINE__
2127	}
2128	{ .mfb
2129	nop	__LINE__
2130	FMA	f123  = f35, f55, f123	// A4 * B8
2131	nop	__LINE__
2132	}
2133	;;
// Step 2: every FMA below is predicated on p3 and uses f40-f43 / f56-f63.
2134	{ .mfb
2135	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
2136	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
2137	nop	__LINE__
2138	}
2139	{ .mfb
2140	nop	__LINE__
2141	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
2142	nop	__LINE__
2143	}
2144	;;
2145	{ .mfb
2146	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
2147	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
2148	nop	__LINE__
2149	}
2150	{ .mfb
2151	nop	__LINE__
2152	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
2153	nop	__LINE__
2154	}
2155	;;
// Under p5: read four values each through C1 and C9. The last load of each
// group steps the pointer back by 3 * SIZE, restoring its starting value.
2156	{ .mfb
2157	(p5) LDFD	f6  = [C1 ], SIZE
2158	(p3) FMA	f96   = f40, f60, f96	// A1 * B5
2159	nop	__LINE__
2160	}
2161	{ .mfb
2162	(p5) LDFD	f7  = [C9 ], SIZE
2163	(p3) FMA	f104  = f40, f61, f104	// A1 * B6
2164	nop	__LINE__
2165	}
2166	;;
2167	{ .mfb
2168	(p5) LDFD	f10 = [C1 ], SIZE
2169	(p3) FMA	f112  = f40, f62, f112	// A1 * B7
2170	nop	__LINE__
2171	}
2172	{ .mfb
2173	(p5) LDFD	f11 = [C9 ], SIZE
2174	(p3) FMA	f120  = f40, f63, f120	// A1 * B8
2175	nop	__LINE__
2176	}
2177	;;
// The next four bundles form one instruction group (no stop between them).
// NOTE(review): the second and fourth bundle list only two instructions for
// the three-slot .mfb template; presumably the assembler pads the M slot
// with a nop -- confirm.
2178	{ .mfb
2179	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
2180	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
2181	nop	__LINE__
2182	}
2183	{ .mfb
2184	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
2185	nop	__LINE__
2186	}
2187	{ .mfb
2188	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
2189	(p3) FMA	f81   = f41, f58, f81	// A2 * B3
2190	nop	__LINE__
2191	}
2192	{ .mfb
2193	(p3) FMA	f89   = f41, f59, f89	// A2 * B4
2194	nop	__LINE__
2195	}
2196	;;
2197	{ .mfb
2198	(p5) LDFD	f12 = [C1 ], SIZE
2199	(p3) FMA	f97   = f41, f60, f97	// A2 * B5
2200	nop	__LINE__
2201	}
2202	{ .mfb
2203	(p5) LDFD	f13 = [C9 ], SIZE
2204	(p3) FMA	f105  = f41, f61, f105	// A2 * B6
2205	nop	__LINE__
2206	}
2207	;;
2208	{ .mfb
2209	(p5) LDFD	f14 = [C1 ], - 3 * SIZE
2210	(p3) FMA	f113  = f41, f62, f113	// A2 * B7
2211	nop	__LINE__
2212	}
2213	{ .mfb
2214	(p5) LDFD	f15 = [C9 ], - 3 * SIZE
2215	(p3) FMA	f121  = f41, f63, f121	// A2 * B8
2216	nop	__LINE__
2217	}
2218	;;
// Under p5: same read pattern through C2 and C10 (f16-f23).
2219	{ .mfb
2220	(p5) LDFD	f16 = [C2 ], SIZE
2221	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
2222	nop	__LINE__
2223	}
2224	{ .mfb
2225	(p5) LDFD	f17 = [C10], SIZE
2226	(p3) FMA	f74   = f42, f57, f74	// A3 * B2
2227	nop	__LINE__
2228	}
2229	;;
2230	{ .mfb
2231	(p5) LDFD	f18 = [C2 ], SIZE
2232	(p3) FMA	f82   = f42, f58, f82	// A3 * B3
2233	nop	__LINE__
2234	}
2235	{ .mfb
2236	(p5) LDFD	f19 = [C10], SIZE
2237	(p3) FMA	f90   = f42, f59, f90	// A3 * B4
2238	nop	__LINE__
2239	}
2240	;;
2241	{ .mfb
2242	(p5) LDFD	f20 = [C2 ], SIZE
2243	(p3) FMA	f98   = f42, f60, f98	// A3 * B5
2244	nop	__LINE__
2245	}
2246	{ .mfb
2247	(p5) LDFD	f21 = [C10], SIZE
2248	(p3) FMA	f106  = f42, f61, f106	// A3 * B6
2249	nop	__LINE__
2250	}
2251	;;
2252	{ .mfb
2253	(p5) LDFD	f22 = [C2 ], - 3 * SIZE
2254	(p3) FMA	f114  = f42, f62, f114	// A3 * B7
2255	nop	__LINE__
2256	}
2257	{ .mfb
2258	(p5) LDFD	f23 = [C10], - 3 * SIZE
2259	(p3) FMA	f122  = f42, f63, f122	// A3 * B8
2260	nop	__LINE__
2261	}
2262	;;
// Under p5: same read pattern through C3 and C11 (f24-f31).
2263	{ .mfb
2264	(p5) LDFD	f24 = [C3 ], SIZE
2265	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
2266	nop	__LINE__
2267	}
2268	{ .mfb
2269	(p5) LDFD	f25 = [C11], SIZE
2270	(p3) FMA	f75   = f43, f57, f75	// A4 * B2
2271	nop	__LINE__
2272	}
2273	;;
2274	{ .mfb
2275	(p5) LDFD	f26 = [C3 ], SIZE
2276	(p3) FMA	f83   = f43, f58, f83	// A4 * B3
2277	nop	__LINE__
2278	}
2279	{ .mfb
2280	(p5) LDFD	f27 = [C11], SIZE
2281	(p3) FMA	f91   = f43, f59, f91	// A4 * B4
2282	nop	__LINE__
2283	}
2284	;;
2285	{ .mfb
2286	(p5) LDFD	f28 = [C3 ], SIZE
2287	(p3) FMA	f99   = f43, f60, f99	// A4 * B5
2288	nop	__LINE__
2289	}
2290	{ .mfb
2291	(p5) LDFD	f29 = [C11], SIZE
2292	(p3) FMA	f107  = f43, f61, f107	// A4 * B6
2293	nop	__LINE__
2294	}
2295	;;
// L is decremented by hand so the predicates above can test it; the branch
// itself is driven by ar.lc.
2296	{ .mfi
2297	(p5) LDFD	f30 = [C3 ], - 3 * SIZE
2298	(p3) FMA	f115  = f43, f62, f115	// A4 * B7
2299	adds	L = -1, L
2300	}
2301	{ .mfb
2302	(p5) LDFD	f31 = [C11], - 3 * SIZE
2303	(p3) FMA	f123  = f43, f63, f123	// A4 * B8
2304	br.cloop.sptk.few .L022
2305	}
2306	;;
2307
// .L028 -- write-back for the section accumulated by .L022.
//
// Every accumulator updates two consecutive elements of C:
//     first  element += ALPHA_R * acc
//     second element += ALPHA_I * acc
// NOTE(review): presumably these are the real and imaginary parts of one
// complex entry of C -- confirm against the routine header.
//
// Accumulator to pointer mapping (Cn takes the first two accumulators of a
// group, Cn+8 = Cn + 4 * SIZE takes the last two):
//     f64-f67   -> C1 / C9      f96-f99   -> C5 / C13
//     f72-f75   -> C2 / C10     f104-f107 -> C6 / C14
//     f80-f83   -> C3 / C11     f112-f115 -> C7 / C15
//     f88-f91   -> C4 / C12     f120-f123 -> C8 / C16
// The old C values for C1-C3 / C9-C11 are already in f6, f7 and f10-f31
// (loaded in the last pass of .L022); those for the remaining pointers are
// loaded here, interleaved with the stores. Each load group ends with a
// -3 * SIZE step and each store group ends with a 5 * SIZE step, so every
// pointer finishes 8 * SIZE past where it started.
// On the way out f64, f72, ..., f120 are cleared.
2308.L028:
// Begin reading old values through C4 / C12 while the C1 / C9 results
// are computed.
2309	{ .mmf
2310	LDFD	f68 = [C4 ], SIZE
2311	LDFD	f69 = [C12], SIZE
2312	FMA	f6  = ALPHA_R, f64, f6
2313	}
2314	{ .mmf
2315	nop	__LINE__
2316	nop	__LINE__
2317	FMA	f7  = ALPHA_R, f66, f7
2318	}
2319	;;
2320	{ .mmf
2321	LDFD	f70 = [C4 ], SIZE
2322	LDFD	f71 = [C12], SIZE
2323	FMA	f10 = ALPHA_I, f64, f10
2324	}
2325	{ .mmf
2326	nop	__LINE__
2327	nop	__LINE__
2328	FMA	f11 = ALPHA_I, f66, f11
2329	}
2330	;;
2331	{ .mmf
2332	LDFD	f76 = [C4 ], SIZE
2333	LDFD	f77 = [C12], SIZE
2334	FMA	f12 = ALPHA_R, f65, f12
2335	}
2336	{ .mmf
2337	nop	__LINE__
2338	nop	__LINE__
2339	FMA	f13 = ALPHA_R, f67, f13
2340	}
2341	;;
2342	{ .mmf
2343	LDFD	f78 = [C4 ], -3 * SIZE
2344	LDFD	f79 = [C12], -3 * SIZE
2345	FMA	f14 = ALPHA_I, f65, f14
2346	}
2347	{ .mmf
2348	nop	__LINE__
2349	nop	__LINE__
2350	FMA	f15 = ALPHA_I, f67, f15
2351	}
2352	;;
// Store through C1 / C9; read old values through C5 / C13; compute C2 / C10.
2353	{ .mmf
2354	STFD	[C1 ] = f6, SIZE
2355	STFD	[C9 ] = f7, SIZE
2356	FMA	f16 = ALPHA_R, f72, f16
2357	}
2358	{ .mmf
2359	LDFD	f84 = [C5 ], SIZE
2360	LDFD	f85 = [C13], SIZE
2361	FMA	f17 = ALPHA_R, f74, f17
2362	}
2363	;;
2364	{ .mmf
2365	STFD	[C1 ] = f10, SIZE
2366	STFD	[C9 ] = f11, SIZE
2367	FMA	f18 = ALPHA_I, f72, f18
2368	}
2369	{ .mmf
2370	LDFD	f86 = [C5 ], SIZE
2371	LDFD	f87 = [C13], SIZE
2372	FMA	f19 = ALPHA_I, f74, f19
2373	}
2374	;;
2375	{ .mmf
2376	STFD	[C1 ] = f12, SIZE
2377	STFD	[C9 ] = f13, SIZE
2378	FMA	f20 = ALPHA_R, f73, f20
2379	}
2380	{ .mmf
2381	LDFD	f92 = [C5 ], SIZE
2382	LDFD	f93 = [C13], SIZE
2383	FMA	f21 = ALPHA_R, f75, f21
2384	}
2385	;;
2386	{ .mmf
2387	STFD	[C1 ] = f14, 5 * SIZE
2388	STFD	[C9 ] = f15, 5 * SIZE
2389	FMA	f22 = ALPHA_I, f73, f22
2390	}
2391	{ .mmf
2392	LDFD	f94 = [C5 ], -3 * SIZE
2393	LDFD	f95 = [C13], -3 * SIZE
2394	FMA	f23 = ALPHA_I, f75, f23
2395	}
2396	;;
// Store through C2 / C10; read old values through C6 / C14; compute C3 / C11.
2397	{ .mmf
2398	STFD	[C2 ] = f16, SIZE
2399	STFD	[C10] = f17, SIZE
2400	FMA	f24 = ALPHA_R, f80, f24
2401	}
2402	{ .mmf
2403	LDFD	f100 = [C6 ], SIZE
2404	LDFD	f101 = [C14], SIZE
2405	FMA	f25 = ALPHA_R, f82, f25
2406	}
2407	;;
2408	{ .mmf
2409	STFD	[C2 ] = f18, SIZE
2410	STFD	[C10] = f19, SIZE
2411	FMA	f26 = ALPHA_I, f80, f26
2412	}
2413	{ .mmf
2414	LDFD	f102 = [C6 ], SIZE
2415	LDFD	f103 = [C14], SIZE
2416	FMA	f27 = ALPHA_I, f82, f27
2417	}
2418	;;
2419	{ .mmf
2420	STFD	[C2 ] = f20, SIZE
2421	STFD	[C10] = f21, SIZE
2422	FMA	f28 = ALPHA_R, f81, f28
2423	}
2424	{ .mmf
2425	LDFD	f108 = [C6 ], SIZE
2426	LDFD	f109 = [C14], SIZE
2427	FMA	f29 = ALPHA_R, f83, f29
2428	}
2429	;;
2430	{ .mmf
2431	STFD	[C2 ] = f22, 5 * SIZE
2432	STFD	[C10] = f23, 5 * SIZE
2433	FMA	f30 = ALPHA_I, f81, f30
2434	}
2435	{ .mmf
2436	LDFD	f110 = [C6 ], -3 * SIZE
2437	LDFD	f111 = [C14], -3 * SIZE
2438	FMA	f31 = ALPHA_I, f83, f31
2439	}
2440	;;
// Store through C3 / C11; read old values through C7 / C15; compute C4 / C12.
2441	{ .mmf
2442	STFD	[C3 ] = f24, SIZE
2443	STFD	[C11] = f25, SIZE
2444	FMA	f68 = ALPHA_R, f88, f68
2445	}
2446	{ .mmf
2447	LDFD	f116 = [C7 ], SIZE
2448	LDFD	f117 = [C15], SIZE
2449	FMA	f69 = ALPHA_R, f90, f69
2450	}
2451	;;
2452	{ .mmf
2453	STFD	[C3 ] = f26, SIZE
2454	STFD	[C11] = f27, SIZE
2455	FMA	f70 = ALPHA_I, f88, f70
2456	}
2457	{ .mmf
2458	LDFD	f118 = [C7 ], SIZE
2459	LDFD	f119 = [C15], SIZE
2460	FMA	f71 = ALPHA_I, f90, f71
2461	}
2462	;;
2463	{ .mmf
2464	STFD	[C3 ] = f28, SIZE
2465	STFD	[C11] = f29, SIZE
2466	FMA	f76 = ALPHA_R, f89, f76
2467	}
2468	{ .mmf
2469	LDFD	f124 = [C7 ], SIZE
2470	LDFD	f125 = [C15], SIZE
2471	FMA	f77 = ALPHA_R, f91, f77
2472	}
2473	;;
2474	{ .mmf
2475	STFD	[C3 ] = f30, 5 * SIZE
2476	STFD	[C11] = f31, 5 * SIZE
2477	FMA	f78 = ALPHA_I, f89, f78
2478	}
2479	{ .mmf
2480	LDFD	f126 = [C7 ], -3 * SIZE
2481	LDFD	f127 = [C15], -3 * SIZE
2482	FMA	f79 = ALPHA_I, f91, f79
2483	}
2484	;;
// Store through C4 / C12; read old values through C8 / C16 into f32-f39
// (the A operand registers are free again here); compute C5 / C13.
2485	{ .mmf
2486	STFD	[C4 ] = f68, SIZE
2487	STFD	[C12] = f69, SIZE
2488	FMA	f84 = ALPHA_R, f96, f84
2489	}
2490	{ .mmf
2491	LDFD	f32 = [C8 ], SIZE
2492	LDFD	f33 = [C16], SIZE
2493	FMA	f85 = ALPHA_R, f98, f85
2494	}
2495	;;
2496	{ .mmf
2497	STFD	[C4 ] = f70, SIZE
2498	STFD	[C12] = f71, SIZE
2499	FMA	f86 = ALPHA_I, f96, f86
2500	}
2501	{ .mmf
2502	LDFD	f34 = [C8 ], SIZE
2503	LDFD	f35 = [C16], SIZE
2504	FMA	f87 = ALPHA_I, f98, f87
2505	}
2506	;;
2507	{ .mmf
2508	STFD	[C4 ] = f76, SIZE
2509	STFD	[C12] = f77, SIZE
2510	FMA	f92 = ALPHA_R, f97, f92
2511	}
2512	{ .mmf
2513	LDFD	f36 = [C8 ], SIZE
2514	LDFD	f37 = [C16], SIZE
2515	FMA	f93 = ALPHA_R, f99, f93
2516	}
2517	;;
2518	{ .mmf
2519	STFD	[C4 ] = f78, 5 * SIZE
2520	STFD	[C12] = f79, 5 * SIZE
2521	FMA	f94 = ALPHA_I, f97, f94
2522	}
2523	{ .mmf
2524	LDFD	f38 = [C8 ], -3 * SIZE
2525	LDFD	f39 = [C16], -3 * SIZE
2526	FMA	f95 = ALPHA_I, f99, f95
2527	}
2528	;;
// Store through C5 / C13; compute C6 / C14. No further loads are needed.
2529	{ .mmf
2530	STFD	[C5 ] = f84, SIZE
2531	STFD	[C13] = f85, SIZE
2532	FMA	f100 = ALPHA_R, f104, f100
2533	}
2534	{ .mmf
2535	nop	__LINE__
2536	nop	__LINE__
2537	FMA	f101 = ALPHA_R, f106, f101
2538	}
2539	;;
2540	{ .mmf
2541	STFD	[C5 ] = f86, SIZE
2542	STFD	[C13] = f87, SIZE
2543	FMA	f102 = ALPHA_I, f104, f102
2544	}
2545	{ .mmf
2546	nop	__LINE__
2547	nop	__LINE__
2548	FMA	f103 = ALPHA_I, f106, f103
2549	}
2550	;;
2551	{ .mmf
2552	STFD	[C5 ] = f92, SIZE
2553	STFD	[C13] = f93, SIZE
2554	FMA	f108 = ALPHA_R, f105, f108
2555	}
2556	{ .mmf
2557	nop	__LINE__
2558	nop	__LINE__
2559	FMA	f109 = ALPHA_R, f107, f109
2560	}
2561	;;
2562	{ .mmf
2563	STFD	[C5 ] = f94, 5 * SIZE
2564	STFD	[C13] = f95, 5 * SIZE
2565	FMA	f110 = ALPHA_I, f105, f110
2566	}
2567	{ .mmf
2568	nop	__LINE__
2569	nop	__LINE__
2570	FMA	f111 = ALPHA_I, f107, f111
2571	}
2572	;;
// Store through C6 / C14; compute C7 / C15.
2573	{ .mmf
2574	STFD	[C6 ] = f100, SIZE
2575	STFD	[C14] = f101, SIZE
2576	FMA	f116 = ALPHA_R, f112, f116
2577	}
2578	{ .mmf
2579	nop	__LINE__
2580	nop	__LINE__
2581	FMA	f117 = ALPHA_R, f114, f117
2582	}
2583	;;
2584	{ .mmf
2585	STFD	[C6 ] = f102, SIZE
2586	STFD	[C14] = f103, SIZE
2587	FMA	f118 = ALPHA_I, f112, f118
2588	}
2589	{ .mmf
2590	nop	__LINE__
2591	nop	__LINE__
2592	FMA	f119 = ALPHA_I, f114, f119
2593	}
2594	;;
2595	{ .mmf
2596	STFD	[C6 ] = f108, SIZE
2597	STFD	[C14] = f109, SIZE
2598	FMA	f124 = ALPHA_R, f113, f124
2599	}
2600	{ .mmf
2601	nop	__LINE__
2602	nop	__LINE__
2603	FMA	f125 = ALPHA_R, f115, f125
2604	}
2605	;;
2606	{ .mmf
2607	STFD	[C6 ] = f110, 5 * SIZE
2608	STFD	[C14] = f111, 5 * SIZE
2609	FMA	f126 = ALPHA_I, f113, f126
2610	}
2611	{ .mmf
2612	nop	__LINE__
2613	nop	__LINE__
2614	FMA	f127 = ALPHA_I, f115, f127
2615	}
2616	;;
// Store through C7 / C15; compute C8 / C16.
2617	{ .mmf
2618	STFD	[C7 ] = f116, SIZE
2619	STFD	[C15] = f117, SIZE
2620	FMA	f32 = ALPHA_R, f120, f32
2621	}
2622	{ .mmf
2623	nop	__LINE__
2624	nop	__LINE__
2625	FMA	f33 = ALPHA_R, f122, f33
2626	}
2627	;;
2628	{ .mmf
2629	STFD	[C7 ] = f118, SIZE
2630	STFD	[C15] = f119, SIZE
2631	FMA	f34 = ALPHA_I, f120, f34
2632	}
2633	{ .mmf
2634	nop	__LINE__
2635	nop	__LINE__
2636	FMA	f35 = ALPHA_I, f122, f35
2637	}
2638	;;
2639	{ .mmf
2640	STFD	[C7 ] = f124, SIZE
2641	STFD	[C15] = f125, SIZE
2642	FMA	f36 = ALPHA_R, f121, f36
2643	}
2644	{ .mmf
2645	nop	__LINE__
2646	nop	__LINE__
2647	FMA	f37 = ALPHA_R, f123, f37
2648	}
2649	;;
2650	{ .mmf
2651	STFD	[C7 ] = f126, 5 * SIZE
2652	STFD	[C15] = f127, 5 * SIZE
2653	FMA	f38 = ALPHA_I, f121, f38
2654	}
2655	{ .mmf
2656	nop	__LINE__
2657	nop	__LINE__
2658	FMA	f39 = ALPHA_I, f123, f39
2659	}
2660	;;
// Store through C8 / C16 and use the idle F slots to clear
// f64, f72, ..., f120 for the code that follows.
2661	{ .mmf
2662	STFD	[C8 ] = f32, SIZE
2663	STFD	[C16] = f33, SIZE
2664	mov	f64  = f0
2665	}
2666	{ .mmf
2667	nop	__LINE__
2668	nop	__LINE__
2669	mov	f72  = f0
2670	}
2671	;;
2672	{ .mmf
2673	STFD	[C8 ] = f34, SIZE
2674	STFD	[C16] = f35, SIZE
2675	mov	f80  = f0
2676	}
2677	{ .mmf
2678	nop	__LINE__
2679	nop	__LINE__
2680	mov	f88  = f0
2681	}
2682	;;
2683	{ .mmf
2684	STFD	[C8 ] = f36, SIZE
2685	STFD	[C16] = f37, SIZE
2686	mov	f96  = f0
2687	}
2688	{ .mmf
2689	nop	__LINE__
2690	nop	__LINE__
2691	mov	f104 = f0
2692	}
2693	;;
2694	{ .mmf
2695	STFD	[C8 ] = f38, 5 * SIZE
2696	STFD	[C16] = f39, 5 * SIZE
2697	mov	f112 = f0
2698	}
2699	{ .mmf
2700	nop	__LINE__
2701	nop	__LINE__
2702	mov	f120 = f0
2703	}
2704	;;
2705	.align 32
2706
// .L030 -- set-up for the section that runs when bit 1 of M is set.
//
// When bit 1 of M is clear (p6), control goes straight to .L040.
// Otherwise this code
//   * preloads f48-f55 from B / BOFFSET and f32-f33 from AOFFSET,
//   * clears the accumulators f65, f73, ..., f121
//     (f64, f72, ..., f120 are not written here),
//   * sets PREA and PREB to PREFETCHSIZE * SIZE past AOFFSET / BOFFSET,
//   * loads ar.lc with ((K + 1) >> 1) - 1 for the loop at .L032.
// p12 is set when bit 0 of K + 1 is zero and p3 is forced true; both are
// read inside .L032.
2707.L030:
2708	{ .mib
2709	nop	__LINE__
2710	tbit.z	p6, p7 = M, 1
2711	(p6)	br.cond.dptk .L040
2712	}
2713	;;
// L = K + 1
2714	{ .mfi
2715	LDFPD	f48, f49 = [B]
2716	mov	f65  = f0
2717	nop	__LINE__
2718	}
2719	{ .mfi
2720	adds	BOFFSET = 2 * SIZE, B
2721	mov	f73  = f0
2722	adds	L =  1, K
2723	}
2724	;;
// p12 = (bit 0 of K + 1 is zero), then L = (K + 1) >> 1.
// p7 is the complement of p6 from the tbit.z above, so it is true on
// this fall-through path.
2725	{ .mfi
2726	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
2727	mov	f81  = f0
2728	tbit.z	p12, p0 = L, 0
2729	}
2730	{ .mfi
2731	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
2732	mov	f89  = f0
2733	shr	L = L, 1
2734	}
2735	;;
// L = ((K + 1) >> 1) - 1
2736	{ .mfi
2737	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
2738	mov	f97  = f0
2739	adds	L =  -1, L
2740	}
2741	{ .mfi
2742	nop	__LINE__
2743	mov	f105 = f0
2744	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
2745	}
2746	;;
// The loop counter register receives the same value that L holds.
2747	{ .mfi
2748	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
2749	mov	f113 = f0
2750	mov	ar.lc = L
2751	}
2752	{ .mfi
2753	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
2754	mov	f121 = f0
2755	cmp.eq	p3, p0 = r0, r0
2756	}
2757	;;
2758	.align 32
2759
// .L032 -- inner loop over K for the section with two A values per step.
//
// Each pass performs up to two multiply-accumulate steps:
//   step 1 (always):   f32-f33 times f48-f55
//   step 2 (under p3): f40-f41 times f56-f63
// Both steps add into the 16 accumulators f64-f65, f72-f73, ..., f120-f121.
//
// Predicates used inside the loop:
//   p3  - true on entry; when p12 is set it is recomputed as (L != 0), so
//         step 2 and its operand loads are skipped on the pass with L == 0.
//   p4  - (L != 0): reload the step 1 operands for the next pass.
//   p5  - (L == 0): read the current values at C1 (f6, f7, f10, f11) and
//         C2 (f12-f15) for the write-back at .L038.
// L is decremented by hand at the bottom of the loop while br.cloop counts
// ar.lc; the code before this label loads ar.lc from L, so the pass with
// L == 0 is the final one.
// NOTE(review): FMA, LDFD and LDFPD are macros defined outside this view;
// presumably fused multiply-add, single load and pair load -- confirm.
2760.L032:
// Step 1, f32 against f48-f55.
2761	{ .mfb
2762	lfetch.nt1	[PREA],  4 * SIZE
2763	FMA	f64   = f32, f48, f64	// A1 * B1
2764	nop	__LINE__
2765	}
2766	{ .mfi
2767	nop	__LINE__
2768	FMA	f72   = f32, f49, f72	// A1 * B2
2769	(p12) cmp.ne p3, p0 =  0, L
2770	}
2771	;;
2772	{ .mfi
2773	lfetch.nt1	[PREB],  16 * SIZE
2774	FMA	f80   = f32, f50, f80	// A1 * B3
2775	cmp.ne	p4, p5 =  0, L
2776	}
2777	{ .mfb
2778	nop	__LINE__
2779	FMA	f88   = f32, f51, f88	// A1 * B4
2780	nop	__LINE__
2781	}
2782	;;
// Under p3: fetch the step 2 operands (f40-f41, f56-f63).
2783	{ .mfb
2784	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
2785	FMA	f96   = f32, f52, f96	// A1 * B5
2786	nop	__LINE__
2787	}
2788	{ .mfb
2789	nop	__LINE__
2790	FMA	f104  = f32, f53, f104	// A1 * B6
2791	nop	__LINE__
2792	}
2793	;;
2794	{ .mfb
2795	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
2796	FMA	f112  = f32, f54, f112	// A1 * B7
2797	nop	__LINE__
2798	}
2799	{ .mfb
2800	nop	__LINE__
2801	FMA	f120  = f32, f55, f120	// A1 * B8
2802	nop	__LINE__
2803	}
2804	;;
2805	{ .mfb
2806	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
2807	FMA	f65   = f33, f48, f65	// A2 * B1
2808	nop	__LINE__
2809	}
2810	{ .mfb
2811	nop	__LINE__
2812	FMA	f73   = f33, f49, f73	// A2 * B2
2813	nop	__LINE__
2814	}
2815	;;
2816	{ .mfb
2817	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE
2818	FMA	f81   = f33, f50, f81	// A2 * B3
2819	nop	__LINE__
2820	}
2821	{ .mfb
2822	nop	__LINE__
2823	FMA	f89   = f33, f51, f89	// A2 * B4
2824	nop	__LINE__
2825	}
2826	;;
2827	{ .mfb
2828	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE
2829	FMA	f97   = f33, f52, f97	// A2 * B5
2830	nop	__LINE__
2831	}
2832	{ .mfb
2833	nop	__LINE__
2834	FMA	f105  = f33, f53, f105	// A2 * B6
2835	nop	__LINE__
2836	}
2837	;;
2838	{ .mfb
2839	nop	__LINE__
2840	FMA	f113  = f33, f54, f113	// A2 * B7
2841	nop	__LINE__
2842	}
2843	{ .mfb
2844	nop	__LINE__
2845	FMA	f121  = f33, f55, f121	// A2 * B8
2846	nop	__LINE__
2847	}
2848	;;
// Step 2: every FMA below is predicated on p3 and uses f40-f41 / f56-f63.
// Under p4 the step 1 operands are refilled for the next pass.
2849	{ .mfb
2850	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
2851	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
2852	nop	__LINE__
2853	}
2854	{ .mfb
2855	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
2856	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
2857	nop	__LINE__
2858	}
2859	;;
2860	{ .mfb
2861	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
2862	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
2863	nop	__LINE__
2864	}
2865	{ .mfb
2866	nop	__LINE__
2867	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
2868	nop	__LINE__
2869	}
2870	;;
2871	{ .mfb
2872	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
2873	(p3) FMA	f96   = f40, f60, f96	// A1 * B5
2874	nop	__LINE__
2875	}
2876	{ .mfb
2877	nop	__LINE__
2878	(p3) FMA	f104  = f40, f61, f104	// A1 * B6
2879	nop	__LINE__
2880	}
2881	;;
// Under p5: read four values each through C1 and C2. The last load of each
// group steps the pointer back by 3 * SIZE, restoring its starting value.
2882	{ .mfb
2883	(p5) LDFD	f6   = [C1], SIZE
2884	(p3) FMA	f112  = f40, f62, f112	// A1 * B7
2885	nop	__LINE__
2886	}
2887	{ .mfb
2888	(p5) LDFD	f12  = [C2], SIZE
2889	(p3) FMA	f120  = f40, f63, f120	// A1 * B8
2890	nop	__LINE__
2891	}
2892	;;
// The next four bundles form one instruction group (no stop between them).
// NOTE(review): the second bundle lists only two instructions for the
// three-slot .mfb template; presumably the assembler pads the M slot with
// a nop -- confirm.
2893	{ .mfb
2894	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
2895	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
2896	nop	__LINE__
2897	}
2898	{ .mfb
2899	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
2900	nop	__LINE__
2901	}
2902	{ .mfb
2903	(p5) LDFD	f7   = [C1], SIZE
2904	(p3) FMA	f81   = f41, f58, f81	// A2 * B3
2905	nop	__LINE__
2906	}
2907	{ .mfb
2908	(p5) LDFD	f13  = [C2], SIZE
2909	(p3) FMA	f89   = f41, f59, f89	// A2 * B4
2910	nop	__LINE__
2911	}
2912	;;
2913	{ .mfb
2914	(p5) LDFD	f10  = [C1], SIZE
2915	(p3) FMA	f97   = f41, f60, f97	// A2 * B5
2916	nop	__LINE__
2917	}
2918	{ .mfb
2919	(p5) LDFD	f14  = [C2], SIZE
2920	(p3) FMA	f105  = f41, f61, f105	// A2 * B6
2921	nop	__LINE__
2922	}
2923	;;
// L is decremented by hand so the predicates above can test it; the branch
// itself is driven by ar.lc.
2924	{ .mfi
2925	(p5) LDFD	f11  = [C1], -3 * SIZE
2926	(p3) FMA	f113  = f41, f62, f113	// A2 * B7
2927	adds	L = -1, L
2928	}
2929	{ .mfb
2930	(p5) LDFD	f15  = [C2], -3 * SIZE
2931	(p3) FMA	f121  = f41, f63, f121	// A2 * B8
2932	br.cloop.sptk.few .L032
2933	}
2934	;;
2935
// .L038 -- write-back for the section accumulated by .L032.
//
// Every accumulator updates two consecutive elements of C:
//     first  element += ALPHA_R * acc
//     second element += ALPHA_I * acc
// NOTE(review): presumably these are the real and imaginary parts of one
// complex entry of C -- confirm against the routine header.
//
// Accumulator to pointer mapping:
//     f64, f65   -> C1      f96,  f97  -> C5
//     f72, f73   -> C2      f104, f105 -> C6
//     f80, f81   -> C3      f112, f113 -> C7
//     f88, f89   -> C4      f120, f121 -> C8
// The old C values for C1 and C2 are already in f6, f7 and f10-f15 (loaded
// in the last pass of .L032); those for C3-C8 are loaded here, interleaved
// with the stores. Each load group ends with a -3 * SIZE step and each
// store group uses four SIZE steps, so every pointer finishes 4 * SIZE past
// where it started.
// On the way out f64, f72, ..., f120 are cleared.
2936.L038:
// Begin reading old values through C3 / C4 while the C1 / C2 results
// are computed.
2937	{ .mmf
2938	LDFD	f16  = [C3], SIZE
2939	LDFD	f20  = [C4], SIZE
2940	FMA	f6  = ALPHA_R, f64, f6
2941	}
2942	{ .mmf
2943	nop	__LINE__
2944	nop	__LINE__
2945	FMA	f12 = ALPHA_R, f72, f12
2946	}
2947	;;
2948	{ .mmf
2949	LDFD	f17  = [C3], SIZE
2950	LDFD	f21  = [C4], SIZE
2951	FMA	f7  = ALPHA_I, f64, f7
2952	}
2953	{ .mmf
2954	nop	__LINE__
2955	nop	__LINE__
2956	FMA	f13 = ALPHA_I, f72, f13
2957	}
2958	;;
2959	{ .mmf
2960	LDFD	f18  = [C3], SIZE
2961	LDFD	f22  = [C4], SIZE
2962	FMA	f10 = ALPHA_R, f65, f10
2963	}
2964	{ .mmf
2965	nop	__LINE__
2966	nop	__LINE__
2967	FMA	f14 = ALPHA_R, f73, f14
2968	}
2969	;;
2970	{ .mmf
2971	LDFD	f19  = [C3], - 3 * SIZE
2972	LDFD	f23  = [C4], - 3 * SIZE
2973	FMA	f11 = ALPHA_I, f65, f11
2974	}
2975	{ .mmf
2976	nop	__LINE__
2977	nop	__LINE__
2978	FMA	f15 = ALPHA_I, f73, f15
2979	}
2980	;;
// Store through C1 / C2; read old values through C5 / C6; compute C3 / C4.
2981	{ .mmf
2982	STFD	[C1] = f6,  SIZE
2983	STFD	[C2] = f12, SIZE
2984	FMA	f16 = ALPHA_R, f80, f16
2985	}
2986	{ .mmf
2987	LDFD	f24  = [C5], SIZE
2988	LDFD	f28  = [C6], SIZE
2989	FMA	f20 = ALPHA_R, f88, f20
2990	}
2991	;;
2992	{ .mmf
2993	STFD	[C1] = f7,  SIZE
2994	STFD	[C2] = f13, SIZE
2995	FMA	f17 = ALPHA_I, f80, f17
2996	}
2997	{ .mmf
2998	LDFD	f25  = [C5], SIZE
2999	LDFD	f29  = [C6], SIZE
3000	FMA	f21 = ALPHA_I, f88, f21
3001	}
3002	;;
3003	{ .mmf
3004	STFD	[C1] = f10, SIZE
3005	STFD	[C2] = f14, SIZE
3006	FMA	f18 = ALPHA_R, f81, f18
3007	}
3008	{ .mmf
3009	LDFD	f26  = [C5], SIZE
3010	LDFD	f30  = [C6], SIZE
3011	FMA	f22 = ALPHA_R, f89, f22
3012	}
3013	;;
3014	{ .mmf
3015	STFD	[C1] = f11, SIZE
3016	STFD	[C2] = f15, SIZE
3017	FMA	f19 = ALPHA_I, f81, f19
3018	}
3019	{ .mmf
3020	LDFD	f27  = [C5], - 3 * SIZE
3021	LDFD	f31  = [C6], - 3 * SIZE
3022	FMA	f23 = ALPHA_I, f89, f23
3023	}
3024	;;
// Store through C3 / C4; read old values through C7 / C8 into f32-f39
// (the A operand registers are free again here); compute C5 / C6.
3025	{ .mmf
3026	STFD	[C3] = f16, SIZE
3027	STFD	[C4] = f20, SIZE
3028	FMA	f24 = ALPHA_R, f96,  f24
3029	}
3030	{ .mmf
3031	LDFD	f32  = [C7], SIZE
3032	LDFD	f36  = [C8], SIZE
3033	FMA	f28 = ALPHA_R, f104, f28
3034	}
3035	;;
3036	{ .mmf
3037	STFD	[C3] = f17, SIZE
3038	STFD	[C4] = f21, SIZE
3039	FMA	f25 = ALPHA_I, f96,  f25
3040	}
3041	{ .mmf
3042	LDFD	f33  = [C7], SIZE
3043	LDFD	f37  = [C8], SIZE
3044	FMA	f29 = ALPHA_I, f104, f29
3045	}
3046	;;
3047	{ .mmf
3048	STFD	[C3] = f18, SIZE
3049	STFD	[C4] = f22, SIZE
3050	FMA	f26 = ALPHA_R, f97,  f26
3051	}
3052	{ .mmf
3053	LDFD	f34  = [C7], SIZE
3054	LDFD	f38  = [C8], SIZE
3055	FMA	f30 = ALPHA_R, f105, f30
3056	}
3057	;;
3058	{ .mmf
3059	STFD	[C3] = f19, SIZE
3060	STFD	[C4] = f23, SIZE
3061	FMA	f27 = ALPHA_I, f97,  f27
3062	}
3063	{ .mmf
3064	LDFD	f35  = [C7], - 3 * SIZE
3065	LDFD	f39  = [C8], - 3 * SIZE
3066	FMA	f31 = ALPHA_I, f105, f31
3067	}
3068	;;
// Store through C5 / C6; compute C7 / C8. No further loads are needed.
3069	{ .mmf
3070	STFD	[C5] = f24, SIZE
3071	STFD	[C6] = f28, SIZE
3072	FMA	f32 = ALPHA_R, f112, f32
3073	}
3074	{ .mmf
3075	nop	__LINE__
3076	nop	__LINE__
3077	FMA	f36 = ALPHA_R, f120, f36
3078	}
3079	;;
3080	{ .mmf
3081	STFD	[C5] = f25, SIZE
3082	STFD	[C6] = f29, SIZE
3083	FMA	f33 = ALPHA_I, f112, f33
3084	}
3085	{ .mmf
3086	nop	__LINE__
3087	nop	__LINE__
3088	FMA	f37 = ALPHA_I, f120, f37
3089	}
3090	;;
3091	{ .mmf
3092	STFD	[C5] = f26, SIZE
3093	STFD	[C6] = f30, SIZE
3094	FMA	f34 = ALPHA_R, f113, f34
3095	}
3096	{ .mmf
3097	nop	__LINE__
3098	nop	__LINE__
3099	FMA	f38 = ALPHA_R, f121, f38
3100	}
3101	;;
3102	{ .mmf
3103	STFD	[C5] = f27, SIZE
3104	STFD	[C6] = f31, SIZE
3105	FMA	f35 = ALPHA_I, f113,  f35
3106	}
3107	{ .mmf
3108	nop	__LINE__
3109	nop	__LINE__
3110	FMA	f39 = ALPHA_I, f121, f39
3111	}
3112	;;
// Store through C7 / C8 and use the idle F slots to clear
// f64, f72, ..., f120 for the code that follows.
3113	{ .mmf
3114	STFD	[C7] = f32, SIZE
3115	STFD	[C8] = f36, SIZE
3116	mov	f64  = f0
3117	}
3118	{ .mmf
3119	nop	__LINE__
3120	nop	__LINE__
3121	mov	f72  = f0
3122	}
3123	;;
3124	{ .mmf
3125	STFD	[C7] = f33, SIZE
3126	STFD	[C8] = f37, SIZE
3127	mov	f80  = f0
3128	}
3129	{ .mmf
3130	nop	__LINE__
3131	nop	__LINE__
3132	mov	f88  = f0
3133	}
3134	;;
3135	{ .mmf
3136	STFD	[C7] = f34, SIZE
3137	STFD	[C8] = f38, SIZE
3138	mov	f96  = f0
3139	}
3140	{ .mmf
3141	nop	__LINE__
3142	nop	__LINE__
3143	mov	f104 = f0
3144	}
3145	;;
3146	{ .mmf
3147	STFD	[C7] = f35, SIZE
3148	STFD	[C8] = f39, SIZE
3149	mov	f112 = f0
3150	}
3151	{ .mmf
3152	nop	__LINE__
3153	nop	__LINE__
3154	mov	f120 = f0
3155	}
3156	;;
3157	.align 32
3158
// .L040: set-up for the case that handles a single A value per k-step against
// eight B values.  Branches straight to .L049 when bit 0 of M is clear.
// Otherwise it preloads B values into f48-f55 and one A value into f32, sets
// the PREA/PREB prefetch pointers, and programs ar.lc with ((K + 1) >> 1) - 1,
// so the counted loop at .L042 runs (K + 1) >> 1 times.
// p12 is set when bit 0 of K + 1 is zero; p3 starts out true.
3159.L040:
3160	{ .mib
3161	nop	__LINE__
3162	tbit.z	p6, p7 = M, 0
3163	(p6)	br.cond.dptk .L049
3164	}
3165	;;
3166	{ .mmi
3167	LDFPD	f48, f49 = [B]
3168	adds	BOFFSET = 2 * SIZE, B
3169	adds	L =  1, K
3170	}
3171	;;
3172	{ .mii
3173	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
3174	tbit.z	p12, p0 = L, 0
3175	shr	L = L, 1
3176	}
3177	;;
3178	{ .mmi
3179	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
3180	LDFD	f32 = [AOFFSET], 1 * SIZE
3181	adds	L =  -1, L
3182	}
3183	;;
3184	{ .mmi
3185	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
3186	cmp.eq	p3, p0 = r0, r0
3187	mov	ar.lc = L
3188	}
3189	{ .mmi
3190	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
3191	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
3192	nop	__LINE__
3193	}
3194	;;
3195	.align 32
3196
// .L042: counted loop (br.cloop) over K for the one-A-value, eight-accumulator
// case, followed by the write-back through C1..C8.
// Each pass multiplies f32 by f48-f55 into f64/f72/f80/f88/f96/f104/f112/f120.
// While p3 is set it then multiplies f40 by f56-f63 into the same accumulators.
// When p12 is set, p3 is recomputed as (L != 0), which drops that second half
// on the final pass.
// p4 (L != 0) reloads f32 and f48-f55 for the next pass; p5 (L == 0) loads the
// current C values on the final pass.
3197.L042:
3198	{ .mfb
3199	lfetch.nt1	[PREB],  16 * SIZE
3200	FMA	f64   = f32, f48, f64	// A1 * B1
3201	nop	__LINE__
3202	}
3203	{ .mfb
3204	(p12) cmp.ne p3, p0 =  0, L
3205	FMA	f72   = f32, f49, f72	// A1 * B2
3206	nop	__LINE__
3207	}
3208	;;
3209	{ .mfi
3210	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE
3211	FMA	f80   = f32, f50, f80	// A1 * B3
3212	cmp.ne	p4, p5 =  0, L
3213	}
3214	{ .mfb
3215	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
3216	FMA	f88   = f32, f51, f88	// A1 * B4
3217	nop	__LINE__
3218	}
3219	;;
3220	{ .mfi
3221	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
3222	FMA	f96   = f32, f52, f96	// A1 * B5
3223	nop	__LINE__
3224	}
3225	{ .mmf
3226	(p5) LDFD	f6   = [C1], SIZE
3227	(p5) LDFD	f10  = [C2], SIZE
3228	FMA	f104  = f32, f53, f104	// A1 * B6
3229	}
3230	;;
3231	{ .mfi
3232	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE
3233	FMA	f112  = f32, f54, f112	// A1 * B7
3234	nop	__LINE__
3235	}
3236	{ .mmf
3237	(p5) LDFD	f7   = [C1], -SIZE
3238	(p5) LDFD	f11  = [C2], -SIZE
3239	FMA	f120  = f32, f55, f120	// A1 * B8
3240	}
3241	;;
3242	{ .mmf
3243	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE
3244	(p4) LDFD	f32 = [AOFFSET],   1 * SIZE
3245	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
3246	}
3247	{ .mmf
3248	(p5) LDFD	f12  = [C3], SIZE
3249	(p5) LDFD	f14  = [C4], SIZE
3250	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
3251	}
3252	;;
3253	{ .mfi
3254	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
3255	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
3256	nop	__LINE__
3257	}
3258	{ .mmf
3259	(p5) LDFD	f13  = [C3], -SIZE
3260	(p5) LDFD	f15  = [C4], -SIZE
3261	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
3262	}
3263	;;
3264	{ .mfi
3265	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
3266	(p3) FMA	f96   = f40, f60, f96	// A1 * B5
3267	nop	__LINE__
3268	}
3269	{ .mmf
3270	(p5) LDFD	f16  = [C5], SIZE
3271	(p5) LDFD	f18  = [C6], SIZE
3272	(p3) FMA	f104  = f40, f61, f104	// A1 * B6
3273	}
3274	;;
3275	{ .mfi
3276	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE
3277	(p3) FMA	f112  = f40, f62, f112	// A1 * B7
3278	adds	L = -1, L
3279	}
3280	{ .mmb
3281	(p5) LDFD	f17 = [C5], -SIZE
3282	(p5) LDFD	f19 = [C6], -SIZE
3283	nop	__LINE__
3284	}
3285	;;
3286	{ .mfb
3287	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE
3288	(p3) FMA	f120  = f40, f63, f120	// A1 * B8
3289	nop	__LINE__
3290	}
3291	{ .mmb
3292	(p5) LDFD	f20 = [C7], SIZE
3293	(p5) LDFD	f22 = [C8], SIZE
3294	br.cloop.sptk.few .L042
3295	}
3296	;;
// Loop finished.  For each of C1..C8 the accumulator is applied to two
// consecutive elements: ALPHA_R * acc is added to the first and ALPHA_I * acc
// to the second, then both are stored back.  Each pointer ends 2 * SIZE past
// where it started.
// NOTE(review): the two elements look like the real/imaginary parts of one
// complex C entry -- confirm against the kernel's calling convention.
3297	{ .mmf
3298	LDFD	f21 = [C7], -SIZE
3299	LDFD	f23 = [C8], -SIZE
3300	FMA	f6  = ALPHA_R, f64, f6
3301	}
3302	{ .mmf
3303	nop	__LINE__
3304	nop	__LINE__
3305	FMA	f10 = ALPHA_R, f72, f10
3306	}
3307	;;
3308	{ .mmf
3309	nop	__LINE__
3310	nop	__LINE__
3311	FMA	f7  = ALPHA_I, f64, f7
3312	}
3313	{ .mmf
3314	nop	__LINE__
3315	nop	__LINE__
3316	FMA	f11 = ALPHA_I, f72, f11
3317	}
3318	;;
3319	{ .mmf
3320	nop	__LINE__
3321	nop	__LINE__
3322	FMA	f12 = ALPHA_R, f80, f12
3323	}
3324	{ .mmf
3325	nop	__LINE__
3326	nop	__LINE__
3327	FMA	f14 = ALPHA_R, f88, f14
3328	}
3329	;;
3330	{ .mmf
3331	nop	__LINE__
3332	nop	__LINE__
3333	FMA	f13 = ALPHA_I, f80, f13
3334	}
3335	{ .mmf
3336	nop	__LINE__
3337	nop	__LINE__
3338	FMA	f15 = ALPHA_I, f88, f15
3339	}
3340	;;
3341	{ .mmf
3342	STFD	[C1 ] = f6,  SIZE
3343	STFD	[C2 ] = f10, SIZE
3344	FMA	f16 = ALPHA_R, f96,  f16
3345	}
3346	{ .mmf
3347	nop	__LINE__
3348	nop	__LINE__
3349	FMA	f18 = ALPHA_R, f104, f18
3350	}
3351	;;
3352	{ .mmf
3353	STFD	[C1 ] = f7,  SIZE
3354	STFD	[C2 ] = f11, SIZE
3355	FMA	f17 = ALPHA_I, f96,  f17
3356	}
3357	{ .mmf
3358	nop	__LINE__
3359	nop	__LINE__
3360	FMA	f19 = ALPHA_I, f104, f19
3361	}
3362	;;
3363	{ .mmf
3364	STFD	[C3 ] = f12, SIZE
3365	STFD	[C4 ] = f14, SIZE
3366	FMA	f20 = ALPHA_R, f112, f20
3367	}
3368	{ .mmf
3369	nop	__LINE__
3370	nop	__LINE__
3371	FMA	f22 = ALPHA_R, f120, f22
3372	}
3373	;;
3374	{ .mmf
3375	STFD	[C3 ] = f13, SIZE
3376	STFD	[C4 ] = f15, SIZE
3377	FMA	f21 = ALPHA_I, f112, f21
3378	}
3379	{ .mmf
3380	nop	__LINE__
3381	nop	__LINE__
3382	FMA	f23 = ALPHA_I, f120, f23
3383	}
3384	;;
3385	{ .mmi
3386	STFD	[C5 ] = f16, SIZE
3387	STFD	[C6 ] = f18, SIZE
3388	nop	__LINE__
3389	}
3390	;;
3391	{ .mmi
3392	STFD	[C5 ] = f17, SIZE
3393	STFD	[C6 ] = f19, SIZE
3394	nop	__LINE__
3395	}
3396	;;
3397	{ .mmi
3398	STFD	[C7 ] = f20, SIZE
3399	STFD	[C8 ] = f22, SIZE
3400	nop	__LINE__
3401	}
3402	;;
3403	{ .mmi
3404	STFD	[C7 ] = f21, SIZE
3405	STFD	[C8 ] = f23, SIZE
3406	nop	__LINE__
3407	}
3408	;;
3409	.align 32
3410
// .L049: bottom of the loop controlled by J.  Copies BOFFSET into B, resets
// AOFFSET to A, and branches to .L010 while 0 < J; otherwise falls through
// to .L050.
3411.L049:
3412	{ .mmi
3413	mov	B = BOFFSET
3414	mov	AOFFSET = A
3415	nop	__LINE__
3416	}
3417	;;
3418	{ .mmb
3419	nop	__LINE__
3420	cmp.lt	p6, p0 = 0, J
3421	(p6)	br.cond.dptk .L010
3422	}
3423	;;
3424	.align 32
3425
// .L050: entry for the four-column case.
// Sets C1 = C, C2 = C + LDC, C3 = C + 2 * LDC, resets AOFFSET to A, clears
// f64/f72/f80/f88 and computes I = M >> 3.  If bit 2 of N is clear it then
// branches to .L090.
// Otherwise it sets C4 = C2 + 2 * LDC, advances C by 4 * LDC, clears
// f65/f73/f81/f89, and branches to .L060 when I is zero (the block at .L052
// is skipped).
3426.L050:
3427	{ .mfi
3428	mov	C1 = C
3429	mov	f64  = f0
3430	tbit.z	p6, p0 = N, 2
3431	}
3432	{ .mfi
3433	add	C2 = LDC, C
3434	mov	f72  = f0
3435	shr	I  = M, 3
3436	}
3437	;;
3438	{ .mfi
3439	shladd	C3 = LDC, 1, C
3440	mov	f80  = f0
3441	nop	__LINE__
3442	}
3443	{ .mfb
3444	mov	AOFFSET = A
3445	mov	f88  = f0
3446	(p6)	br.cond.dpnt .L090
3447	}
3448	;;
3449	{ .mfi
3450	cmp.eq	p6, p7 = 0, I
3451	mov	f65  = f0
3452	nop	__LINE__
3453	}
3454	{ .mfi
3455	shladd	C4 = LDC, 1, C2
3456	mov	f73  = f0
3457	nop	__LINE__
3458	}
3459	;;
3460	{ .mfi
3461	nop	__LINE__
3462	mov	f81  = f0
3463	nop	__LINE__
3464	}
3465	{ .mfb
3466	shladd	C = LDC, 2, C
3467	mov	f89  = f0
3468	(p6)	br.cond.dpnt .L060
3469	}
3470	;;
3471	.align 32
3472
// .L052: prologue for one tile of eight A values by four B values per k-step.
// Preloads B values f48-f51 and A values f32-f39, and clears the accumulators
// in f66-f95 that are not already zero on entry (f64/f65, f72/f73, f80/f81 and
// f88/f89 are cleared at .L050 and again at the end of .L058).
// Some registers are cleared with "mov fX = f0" and others with
// "setf.d fX = r0" -- presumably to spread the work over F and M slots.
// CPREFETCH is issued four times on PREC, which starts at
// C1 + CPREFETCHSIZE * SIZE and is stepped by LDC between issues.
// ar.lc is programmed with ((K + 1) >> 1) - 1; p12 is set when bit 0 of K + 1
// is zero; p3 starts out true.
3473.L052:
3474	{ .mfb
3475	LDFPD	f48, f49 = [B]
3476	mov	f66  = f0
3477	nop	__LINE__
3478	}
3479	{ .mfb
3480	adds	BOFFSET = 2 * SIZE, B
3481	mov	f74  = f0
3482	nop	__LINE__
3483	}
3484	;;
3485	{ .mfi
3486	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
3487	mov	f82  = f0
3488	nop	__LINE__
3489	}
3490	{ .mfi
3491	setf.d	f84  = r0
3492	mov	f90  = f0
3493	nop	__LINE__
3494	}
3495	;;
3496	{ .mfi
3497	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
3498	mov	f67  = f0
3499	adds	PREC = CPREFETCHSIZE * SIZE, C1
3500	}
3501	{ .mfi
3502	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
3503	mov	f75  = f0
3504	adds	L =  1, K
3505	}
3506	;;
3507	{ .mfi
3508	LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
3509	mov	f83  = f0
3510	tbit.z	p12, p0 = L, 0
3511	}
3512	{ .mfi
3513	setf.d	f91  = r0
3514	mov	f68  = f0
3515	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
3516	}
3517	;;
3518	{ .mfi
3519	CPREFETCH [PREC], LDC
3520	mov	f76  = f0
3521	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
3522	}
3523	{ .mfi
3524	LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
3525	mov	f92  = f0
3526	cmp.eq	p3, p0 = r0, r0
3527	}
3528	;;
3529	{ .mfi
3530	CPREFETCH [PREC], LDC
3531	mov	f69  = f0
3532	shr	L = L, 1
3533	}
3534	{ .mmf
3535	setf.d	f77  = r0
3536	setf.d	f85  = r0
3537	mov	f93  = f0
3538	}
3539	;;
3540	{ .mfi
3541	CPREFETCH [PREC], LDC
3542	mov	f70  = f0
3543	adds	L =  -1, L
3544	}
3545	{ .mmf
3546	setf.d	f78  = r0
3547	setf.d	f86  = r0
3548	mov	f94  = f0
3549	}
3550	;;
3551	{ .mfi
3552	CPREFETCH [PREC]
3553	mov	f71  = f0
3554	mov	ar.lc = L
3555	}
3556	{ .mmf
3557	setf.d	f79  = r0
3558	setf.d	f87  = r0
3559	mov	f95  = f0
3560	}
3561	;;
3562	.align 32
3563
// .L053: counted loop (br.cloop) over K for the 8x4 tile.
// Accumulator layout: A value f32+i (i = 0..7) times B values f48..f51 is
// accumulated into f64+i, f72+i, f80+i and f88+i respectively.
// While p3 is set, the second half of each pass repeats this with A values
// f40-f47 and B values f56-f59.  When p12 is set, p3 is recomputed as
// (L != 0), which drops that second half on the final pass.
// p4 (L != 0) reloads f32-f39 and f48-f51 for the next pass.
// p5 (L == 0) starts loading the current C values through C1/C9 and C2/C10
// on the final pass; the remaining C loads happen in .L058.
// C9..C12 are recomputed as C1..C4 + 4 * SIZE on every pass.
3564.L053:
3565	{ .mfb
3566	lfetch.nt1	[PREA],  16 * SIZE
3567	FMA	f64   = f32, f48, f64	// A1 * B1
3568	nop	__LINE__
3569	}
3570	{ .mfi
3571	nop	__LINE__
3572	FMA	f72   = f32, f49, f72	// A1 * B2
3573	(p12) cmp.ne p3, p0 =  0, L
3574	}
3575	;;
3576	{ .mfi
3577	lfetch.nt1	[PREB],   8 * SIZE
3578	FMA	f80   = f32, f50, f80	// A1 * B3
3579	cmp.ne	p4, p5 =  0, L
3580	}
3581	{ .mfi
3582	nop	__LINE__
3583	FMA	f88   = f32, f51, f88	// A1 * B4
3584	adds	C9  = 4 * SIZE, C1
3585	}
3586	;;
3587	{ .mfi
3588	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
3589	FMA	f65   = f33, f48, f65	// A2 * B1
3590	adds	C10 = 4 * SIZE, C2
3591	}
3592	{ .mfi
3593	nop	__LINE__
3594	FMA	f73   = f33, f49, f73	// A2 * B2
3595	adds	C11 = 4 * SIZE, C3
3596	}
3597	;;
3598	{ .mfi
3599	(p3) LDFPD	f56, f57 = [BOFFSET],  2 * SIZE
3600	FMA	f81   = f33, f50, f81	// A2 * B3
3601	adds	C12 = 4 * SIZE, C4
3602	}
3603	{ .mfb
3604	nop	__LINE__
3605	FMA	f89   = f33, f51, f89	// A2 * B4
3606	nop	__LINE__
3607	}
3608	;;
3609	{ .mfb
3610	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
3611	FMA	f66   = f34, f48, f66	// A3 * B1
3612	nop	__LINE__
3613	}
3614	{ .mfb
3615	nop	__LINE__
3616	FMA	f74   = f34, f49, f74	// A3 * B2
3617	nop	__LINE__
3618	}
3619	;;
3620	{ .mfb
3621	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
3622	FMA	f82   = f34, f50, f82	// A3 * B3
3623	nop	__LINE__
3624	}
3625	{ .mfb
3626	nop	__LINE__
3627	FMA	f90   = f34, f51, f90	// A3 * B4
3628	nop	__LINE__
3629	}
3630	;;
3631	{ .mfb
3632	(p3) LDFPD	f44, f45 = [AOFFSET], 2 * SIZE
3633	FMA	f67   = f35, f48, f67	// A4 * B1
3634	nop	__LINE__
3635	}
3636	{ .mfb
3637	nop	__LINE__
3638	FMA	f75   = f35, f49, f75	// A4 * B2
3639	nop	__LINE__
3640	}
3641	;;
3642	{ .mfb
3643	(p3) LDFPD	f46, f47 = [AOFFSET], 2 * SIZE
3644	FMA	f83   = f35, f50, f83	// A4 * B3
3645	nop	__LINE__
3646	}
3647	{ .mfb
3648	nop	__LINE__
3649	FMA	f91   = f35, f51, f91	// A4 * B4
3650	nop	__LINE__
3651	}
3652	;;
3653	{ .mfb
3654	nop	__LINE__
3655	FMA	f68   = f36, f48, f68	// A5 * B1
3656	nop	__LINE__
3657	}
3658	{ .mfb
3659	nop	__LINE__
3660	FMA	f76   = f36, f49, f76	// A5 * B2
3661	nop	__LINE__
3662	}
3663	;;
3664	{ .mfb
3665	nop	__LINE__
3666	FMA	f84   = f36, f50, f84	// A5 * B3
3667	nop	__LINE__
3668	}
3669	{ .mfb
3670	nop	__LINE__
3671	FMA	f92   = f36, f51, f92	// A5 * B4
3672	nop	__LINE__
3673	}
3674	;;
3675	{ .mfb
3676	nop	__LINE__
3677	FMA	f69   = f37, f48, f69	// A6 * B1
3678	nop	__LINE__
3679	}
3680	{ .mfb
3681	nop	__LINE__
3682	FMA	f77   = f37, f49, f77	// A6 * B2
3683	nop	__LINE__
3684	}
3685	;;
3686	{ .mfb
3687	nop	__LINE__
3688	FMA	f85   = f37, f50, f85	// A6 * B3
3689	nop	__LINE__
3690	}
3691	{ .mfb
3692	nop	__LINE__
3693	FMA	f93   = f37, f51, f93	// A6 * B4
3694	nop	__LINE__
3695	}
3696	;;
3697	{ .mfb
3698	nop	__LINE__
3699	FMA	f70   = f38, f48, f70	// A7 * B1
3700	nop	__LINE__
3701	}
3702	{ .mfb
3703	nop	__LINE__
3704	FMA	f78   = f38, f49, f78	// A7 * B2
3705	nop	__LINE__
3706	}
3707	;;
3708	{ .mfb
3709	nop	__LINE__
3710	FMA	f86   = f38, f50, f86	// A7 * B3
3711	nop	__LINE__
3712	}
3713	{ .mfb
3714	nop	__LINE__
3715	FMA	f94   = f38, f51, f94	// A7 * B4
3716	nop	__LINE__
3717	}
3718	;;
3719	{ .mfb
3720	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
3721	FMA	f71   = f39, f48, f71	// A8 * B1
3722	nop	__LINE__
3723	}
3724	{ .mfb
3725	nop	__LINE__
3726	FMA	f79   = f39, f49, f79	// A8 * B2
3727	nop	__LINE__
3728	}
3729	;;
3730	{ .mfb
3731	(p4) LDFPD	f48, f49 = [BOFFSET],  2 * SIZE
3732	FMA	f87   = f39, f50, f87	// A8 * B3
3733	nop	__LINE__
3734	}
3735	{ .mfb
3736	nop	__LINE__
3737	FMA	f95   = f39, f51, f95	// A8 * B4
3738	nop	__LINE__
3739	}
3740	;;
3741	{ .mfb
3742	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
3743	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
3744	nop	__LINE__
3745	}
3746	{ .mfb
3747	nop	__LINE__
3748	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
3749	nop	__LINE__
3750	}
3751	;;
3752	{ .mfb
3753	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
3754	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
3755	nop	__LINE__
3756	}
3757	{ .mfb
3758	nop	__LINE__
3759	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
3760	nop	__LINE__
3761	}
3762	;;
3763	{ .mfb
3764	(p4) LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
3765	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
3766	nop	__LINE__
3767	}
3768	{ .mfb
3769	nop	__LINE__
3770	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
3771	nop	__LINE__
3772	}
3773	;;
3774	{ .mfb
3775	(p4) LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
3776	(p3) FMA	f81   = f41, f58, f81	// A2 * B3
3777	nop	__LINE__
3778	}
3779	{ .mfb
3780	nop	__LINE__
3781	(p3) FMA	f89   = f41, f59, f89	// A2 * B4
3782	nop	__LINE__
3783	}
3784	;;
3785	{ .mfb
3786	(p5) LDFD	f6  = [C1 ], SIZE
3787	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
3788	nop	__LINE__
3789	}
3790	{ .mfb
3791	(p5) LDFD	f7  = [C9 ], SIZE
3792	(p3) FMA	f74   = f42, f57, f74	// A3 * B2
3793	nop	__LINE__
3794	}
3795	;;
3796	{ .mfb
3797	(p5) LDFD	f10 = [C1 ], SIZE
3798	(p3) FMA	f82   = f42, f58, f82	// A3 * B3
3799	nop	__LINE__
3800	}
3801	{ .mfb
3802	(p5) LDFD	f11 = [C9 ], SIZE
3803	(p3) FMA	f90   = f42, f59, f90	// A3 * B4
3804	nop	__LINE__
3805	}
3806	;;
3807	{ .mfb
3808	(p5) LDFD	f12 = [C1 ], SIZE
3809	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
3810	nop	__LINE__
3811	}
3812	{ .mfb
3813	(p5) LDFD	f13 = [C9 ], SIZE
3814	(p3) FMA	f75   = f43, f57, f75	// A4 * B2
3815	nop	__LINE__
3816	}
3817	;;
3818	{ .mfb
3819	(p5) LDFD	f14 = [C1 ], 5 * SIZE
3820	(p3) FMA	f83   = f43, f58, f83	// A4 * B3
3821	nop	__LINE__
3822	}
3823	{ .mfb
3824	(p5) LDFD	f15 = [C9 ], 5 * SIZE
3825	(p3) FMA	f91   = f43, f59, f91	// A4 * B4
3826	nop	__LINE__
3827	}
3828	;;
3829	{ .mfb
3830	(p5) LDFD	f16 = [C1 ], SIZE
3831	(p3) FMA	f68   = f44, f56, f68	// A5 * B1
3832	nop	__LINE__
3833	}
3834	{ .mfb
3835	(p5) LDFD	f17 = [C9], SIZE
3836	(p3) FMA	f76   = f44, f57, f76	// A5 * B2
3837	nop	__LINE__
3838	}
3839	;;
3840	{ .mfb
3841	(p5) LDFD	f18 = [C1 ], SIZE
3842	(p3) FMA	f84   = f44, f58, f84	// A5 * B3
3843	nop	__LINE__
3844	}
3845	{ .mfb
3846	(p5) LDFD	f19 = [C9], SIZE
3847	(p3) FMA	f92   = f44, f59, f92	// A5 * B4
3848	nop	__LINE__
3849	}
3850	;;
3851	{ .mfb
3852	(p5) LDFD	f20 = [C1 ], SIZE
3853	(p3) FMA	f69   = f45, f56, f69	// A6 * B1
3854	nop	__LINE__
3855	}
3856	{ .mfb
3857	(p5) LDFD	f21 = [C9], SIZE
3858	(p3) FMA	f77   = f45, f57, f77	// A6 * B2
3859	nop	__LINE__
3860	}
3861	;;
3862	{ .mfb
3863	(p5) LDFD	f22 = [C1 ], -11 * SIZE
3864	(p3) FMA	f85   = f45, f58, f85	// A6 * B3
3865	nop	__LINE__
3866	}
3867	{ .mfb
3868	(p5) LDFD	f23 = [C9 ], -11 * SIZE
3869	(p3) FMA	f93   = f45, f59, f93	// A6 * B4
3870	nop	__LINE__
3871	}
3872	;;
3873	{ .mfb
3874	(p5) LDFD	f24 = [C2 ], SIZE
3875	(p3) FMA	f70   = f46, f56, f70	// A7 * B1
3876	nop	__LINE__
3877	}
3878	{ .mfb
3879	(p5) LDFD	f25 = [C10], SIZE
3880	(p3) FMA	f78   = f46, f57, f78	// A7 * B2
3881	nop	__LINE__
3882	}
3883	;;
3884	{ .mfb
3885	(p5) LDFD	f26 = [C2 ], SIZE
3886	(p3) FMA	f86   = f46, f58, f86	// A7 * B3
3887	nop	__LINE__
3888	}
3889	{ .mfb
3890	(p5) LDFD	f27 = [C10], SIZE
3891	(p3) FMA	f94   = f46, f59, f94	// A7 * B4
3892	nop	__LINE__
3893	}
3894	;;
3895	{ .mfb
3896	(p5) LDFD	f28 = [C2 ], SIZE
3897	(p3) FMA	f71   = f47, f56, f71	// A8 * B1
3898	nop	__LINE__
3899	}
3900	{ .mfb
3901	(p5) LDFD	f29 = [C10], SIZE
3902	(p3) FMA	f79   = f47, f57, f79	// A8 * B2
3903	nop	__LINE__
3904	}
3905	;;
3906	{ .mfi
3907	(p5) LDFD	f30 = [C2 ], 5 * SIZE
3908	(p3) FMA	f87   = f47, f58, f87	// A8 * B3
3909	adds	L = -1, L
3910	}
3911	{ .mfb
3912	(p5) LDFD	f31 = [C10], 5 * SIZE
3913	(p3) FMA	f95   = f47, f59, f95	// A8 * B4
3914	br.cloop.sptk.few .L053
3915	}
3916	;;
3917	.align 32
3918
// .L058: write-back for the 8x4 tile accumulated in .L053.
// Every accumulator is applied to two consecutive C elements: ALPHA_R * acc is
// added to the first and ALPHA_I * acc to the second, and both are stored.
// C1..C4 cover the first four A values of each column and C9..C12 (the same
// pointers + 4 * SIZE) the next pair of rows; the "5 * SIZE" / "- 11 * SIZE"
// post-increments move each pointer between the two 4-element groups it owns.
// Loads of the C values not fetched inside the loop (rest of C2/C10, then
// C3/C11 and C4/C12) are interleaved with the FMAs and stores.
// Registers such as f48-f65 and f6-f13 are reused as C staging registers once
// their earlier contents have been consumed.
// At the end f64/f72/f80/f88 and f65/f73/f81/f89 are cleared, I is
// decremented, and control returns to .L052 unless I was 1 before the
// decrement.
3919.L058:
3920	{ .mmf
3921	LDFD	f32 = [C2 ], SIZE
3922	LDFD	f33 = [C10], SIZE
3923	FMA	f6   = ALPHA_R, f64, f6
3924	}
3925	{ .mmf
3926	nop	__LINE__
3927	nop	__LINE__
3928	FMA	f7   = ALPHA_R, f66, f7
3929	}
3930	;;
3931	{ .mmf
3932	LDFD	f34 = [C2 ], SIZE
3933	LDFD	f35 = [C10], SIZE
3934	FMA	f10  = ALPHA_I, f64, f10
3935	}
3936	{ .mmf
3937	nop	__LINE__
3938	nop	__LINE__
3939	FMA	f11  = ALPHA_I, f66, f11
3940	}
3941	;;
3942	{ .mmf
3943	LDFD	f36 = [C2 ], SIZE
3944	LDFD	f37 = [C10], SIZE
3945	FMA	f12  = ALPHA_R, f65, f12
3946	}
3947	{ .mmf
3948	nop	__LINE__
3949	nop	__LINE__
3950	FMA	f13  = ALPHA_R, f67, f13
3951	}
3952	;;
3953	{ .mmf
3954	LDFD	f38 = [C2 ], - 11 * SIZE
3955	LDFD	f39 = [C10], - 11 * SIZE
3956	FMA	f14  = ALPHA_I, f65, f14
3957	}
3958	{ .mmf
3959	nop	__LINE__
3960	nop	__LINE__
3961	FMA	f15  = ALPHA_I, f67, f15
3962	}
3963	;;
3964	{ .mmf
3965	STFD	[C1 ] = f6, SIZE
3966	STFD	[C9 ] = f7, SIZE
3967	FMA	f16  = ALPHA_R, f68, f16
3968	}
3969	{ .mmf
3970	LDFD	f48 = [C3 ], SIZE
3971	LDFD	f49 = [C11], SIZE
3972	FMA	f17  = ALPHA_R, f70, f17
3973	}
3974	;;
3975	{ .mmf
3976	STFD	[C1 ] = f10, SIZE
3977	STFD	[C9 ] = f11, SIZE
3978	FMA	f18  = ALPHA_I, f68, f18
3979	}
3980	{ .mmf
3981	LDFD	f50 = [C3 ], SIZE
3982	LDFD	f51 = [C11], SIZE
3983	FMA	f19  = ALPHA_I, f70, f19
3984	}
3985	;;
3986	{ .mmf
3987	STFD	[C1 ] = f12, SIZE
3988	STFD	[C9 ] = f13, SIZE
3989	FMA	f20  = ALPHA_R, f69, f20
3990	}
3991	{ .mmf
3992	LDFD	f52 = [C3 ], SIZE
3993	LDFD	f53 = [C11], SIZE
3994	FMA	f21  = ALPHA_R, f71, f21
3995	}
3996	;;
3997	{ .mmf
3998	STFD	[C1 ] = f14, 5 * SIZE
3999	STFD	[C9 ] = f15, 5 * SIZE
4000	FMA	f22  = ALPHA_I, f69, f22
4001	}
4002	{ .mmf
4003	LDFD	f54 = [C3 ], 5 * SIZE
4004	LDFD	f55 = [C11], 5 * SIZE
4005	FMA	f23  = ALPHA_I, f71, f23
4006	}
4007	;;
4008	{ .mmf
4009	STFD	[C1 ] = f16, SIZE
4010	STFD	[C9 ] = f17, SIZE
4011	FMA	f24  = ALPHA_R, f72, f24
4012	}
4013	{ .mmf
4014	LDFD	f40 = [C3 ], SIZE
4015	LDFD	f41 = [C11], SIZE
4016	FMA	f25  = ALPHA_R, f74, f25
4017	}
4018	;;
4019	{ .mmf
4020	STFD	[C1 ] = f18, SIZE
4021	STFD	[C9 ] = f19, SIZE
4022	FMA	f26  = ALPHA_I, f72, f26
4023	}
4024	{ .mmf
4025	LDFD	f42 = [C3 ], SIZE
4026	LDFD	f43 = [C11], SIZE
4027	FMA	f27  = ALPHA_I, f74, f27
4028	}
4029	;;
4030	{ .mmf
4031	STFD	[C1 ] = f20, SIZE
4032	STFD	[C9 ] = f21, SIZE
4033	FMA	f28  = ALPHA_R, f73, f28
4034	}
4035	{ .mmf
4036	LDFD	f44 = [C3 ], SIZE
4037	LDFD	f45 = [C11], SIZE
4038	FMA	f29  = ALPHA_R, f75, f29
4039	}
4040	;;
4041	{ .mmf
4042	STFD	[C1 ] = f22, 5 * SIZE
4043	STFD	[C9 ] = f23, 5 * SIZE
4044	FMA	f30  = ALPHA_I, f73, f30
4045	}
4046	{ .mmf
4047	LDFD	f46 = [C3 ], - 11 * SIZE
4048	LDFD	f56 = [C11], - 11 * SIZE
4049	FMA	f31  = ALPHA_I, f75, f31
4050	}
4051	;;
4052	{ .mmf
4053	STFD	[C2 ] = f24, SIZE
4054	STFD	[C10] = f25, SIZE
4055	FMA	f32  = ALPHA_R, f76, f32
4056	}
4057	{ .mmf
4058	LDFD	f57 = [C4 ], SIZE
4059	LDFD	f58 = [C12], SIZE
4060	FMA	f33  = ALPHA_R, f78, f33
4061	}
4062	;;
4063	{ .mmf
4064	STFD	[C2 ] = f26, SIZE
4065	STFD	[C10] = f27, SIZE
4066	FMA	f34  = ALPHA_I, f76, f34
4067	}
4068	{ .mmf
4069	LDFD	f59 = [C4 ], SIZE
4070	LDFD	f60 = [C12], SIZE
4071	FMA	f35  = ALPHA_I, f78, f35
4072	}
4073	;;
4074	{ .mmf
4075	STFD	[C2 ] = f28, SIZE
4076	STFD	[C10] = f29, SIZE
4077	FMA	f36  = ALPHA_R, f77, f36
4078	}
4079	{ .mmf
4080	LDFD	f61 = [C4 ], SIZE
4081	LDFD	f62 = [C12], SIZE
4082	FMA	f37  = ALPHA_R, f79, f37
4083	}
4084	;;
4085	{ .mmf
4086	STFD	[C2 ] = f30, 5 * SIZE
4087	STFD	[C10] = f31, 5 * SIZE
4088	FMA	f38  = ALPHA_I, f77, f38
4089	}
4090	{ .mmf
4091	LDFD	f63 = [C4 ], 5 * SIZE
4092	LDFD	f47 = [C12], 5 * SIZE
4093	FMA	f39  = ALPHA_I, f79, f39
4094	}
4095	;;
4096	{ .mmf
4097	STFD	[C2 ] = f32, SIZE
4098	STFD	[C10] = f33, SIZE
4099	FMA	f48  = ALPHA_R, f80, f48
4100	}
4101	{ .mmf
4102	LDFD	f64 = [C4 ], SIZE
4103	LDFD	f65 = [C12], SIZE
4104	FMA	f49  = ALPHA_R, f82, f49
4105	}
4106	;;
4107	{ .mmf
4108	STFD	[C2 ] = f34, SIZE
4109	STFD	[C10] = f35, SIZE
4110	FMA	f50  = ALPHA_I, f80, f50
4111	}
4112	{ .mmf
4113	LDFD	f6 = [C4 ], SIZE
4114	LDFD	f7 = [C12], SIZE
4115	FMA	f51  = ALPHA_I, f82, f51
4116	}
4117	;;
4118	{ .mmf
4119	STFD	[C2 ] = f36, SIZE
4120	STFD	[C10] = f37, SIZE
4121	FMA	f52  = ALPHA_R, f81, f52
4122	}
4123	{ .mmf
4124	LDFD	f10 = [C4 ], SIZE
4125	LDFD	f11 = [C12], SIZE
4126	FMA	f53  = ALPHA_R, f83, f53
4127	}
4128	;;
4129	{ .mmf
4130	STFD	[C2 ] = f38, 5 * SIZE
4131	STFD	[C10] = f39, 5 * SIZE
4132	FMA	f54  = ALPHA_I, f81, f54
4133	}
4134	{ .mmf
4135	LDFD	f12 = [C4 ], - 11 * SIZE
4136	LDFD	f13 = [C12], - 11 * SIZE
4137	FMA	f55  = ALPHA_I, f83, f55
4138	}
4139	;;
4140	{ .mmf
4141	STFD	[C3 ] = f48, SIZE
4142	STFD	[C11] = f49, SIZE
4143	FMA	f40  = ALPHA_R, f84, f40
4144	}
4145	{ .mmf
4146	nop	__LINE__
4147	nop	__LINE__
4148	FMA	f41  = ALPHA_R, f86, f41
4149	}
4150	;;
4151	{ .mmf
4152	STFD	[C3 ] = f50, SIZE
4153	STFD	[C11] = f51, SIZE
4154	FMA	f42  = ALPHA_I, f84, f42
4155	}
4156	{ .mmf
4157	nop	__LINE__
4158	nop	__LINE__
4159	FMA	f43  = ALPHA_I, f86, f43
4160	}
4161	;;
4162	{ .mmf
4163	STFD	[C3 ] = f52, SIZE
4164	STFD	[C11] = f53, SIZE
4165	FMA	f44  = ALPHA_R, f85, f44
4166	}
4167	{ .mmf
4168	nop	__LINE__
4169	nop	__LINE__
4170	FMA	f45  = ALPHA_R, f87, f45
4171	}
4172	;;
4173	{ .mmf
4174	STFD	[C3 ] = f54, 5 * SIZE
4175	STFD	[C11] = f55, 5 * SIZE
4176	FMA	f46  = ALPHA_I, f85, f46
4177	}
4178	{ .mmf
4179	nop	__LINE__
4180	nop	__LINE__
4181	FMA	f56  = ALPHA_I, f87, f56
4182	}
4183	;;
4184	{ .mmf
4185	STFD	[C3 ] = f40, SIZE
4186	STFD	[C11] = f41, SIZE
4187	FMA	f57  = ALPHA_R, f88, f57
4188	}
4189	{ .mmf
4190	nop	__LINE__
4191	nop	__LINE__
4192	FMA	f58  = ALPHA_R, f90, f58
4193	}
4194	;;
4195	{ .mmf
4196	STFD	[C3 ] = f42, SIZE
4197	STFD	[C11] = f43, SIZE
4198	FMA	f59  = ALPHA_I, f88, f59
4199	}
4200	{ .mmf
4201	nop	__LINE__
4202	nop	__LINE__
4203	FMA	f60  = ALPHA_I, f90, f60
4204	}
4205	;;
4206	{ .mmf
4207	STFD	[C3 ] = f44, SIZE
4208	STFD	[C11] = f45, SIZE
4209	FMA	f61  = ALPHA_R, f89, f61
4210	}
4211	{ .mmf
4212	nop	__LINE__
4213	nop	__LINE__
4214	FMA	f62  = ALPHA_R, f91, f62
4215	}
4216	;;
4217	{ .mmf
4218	STFD	[C3 ] = f46, 5 * SIZE
4219	STFD	[C11] = f56, 5 * SIZE
4220	FMA	f63  = ALPHA_I, f89, f63
4221	}
4222	{ .mmf
4223	nop	__LINE__
4224	nop	__LINE__
4225	FMA	f47  = ALPHA_I, f91, f47
4226	}
4227	;;
4228	{ .mmf
4229	STFD	[C4 ] = f57, SIZE
4230	STFD	[C12] = f58, SIZE
4231	FMA	f64  = ALPHA_R, f92, f64
4232	}
4233	{ .mmf
4234	nop	__LINE__
4235	nop	__LINE__
4236	FMA	f65  = ALPHA_R, f94, f65
4237	}
4238	;;
4239	{ .mmf
4240	STFD	[C4 ] = f59, SIZE
4241	STFD	[C12] = f60, SIZE
4242	FMA	f6   = ALPHA_I, f92, f6
4243	}
4244	{ .mmf
4245	nop	__LINE__
4246	nop	__LINE__
4247	FMA	f7   = ALPHA_I, f94, f7
4248	}
4249	;;
4250	{ .mmf
4251	STFD	[C4 ] = f61, SIZE
4252	STFD	[C12] = f62, SIZE
4253	FMA	f10  = ALPHA_R, f93, f10
4254	}
4255	{ .mmf
4256	nop	__LINE__
4257	nop	__LINE__
4258	FMA	f11  = ALPHA_R, f95, f11
4259	}
4260	;;
4261	{ .mmf
4262	STFD	[C4 ] = f63, 5 * SIZE
4263	STFD	[C12] = f47, 5 * SIZE
4264	FMA	f12  = ALPHA_I, f93, f12
4265	}
4266	{ .mmf
4267	nop	__LINE__
4268	nop	__LINE__
4269	FMA	f13  = ALPHA_I, f95, f13
4270	}
4271	;;
// Final stores through C4/C12, overlapped with clearing the accumulators that
// the next tile expects to be zero on entry.  p6 is computed from I before the
// decrement below.
4272	{ .mmf
4273	STFD	[C4 ] = f64, SIZE
4274	STFD	[C12] = f65, SIZE
4275	mov	f64  = f0
4276	}
4277	{ .mmf
4278	cmp.ne	p6, p0 = 1, I
4279	nop	__LINE__
4280	mov	f72  = f0
4281	}
4282	;;
4283	{ .mmf
4284	STFD	[C4 ] = f6, SIZE
4285	STFD	[C12] = f7, SIZE
4286 	mov	f80  = f0
4287	}
4288	{ .mmf
4289	nop	__LINE__
4290	nop	__LINE__
4291	mov	f88  = f0
4292	}
4293	;;
4294	{ .mmf
4295	STFD	[C4 ] = f10, SIZE
4296	STFD	[C12] = f11, SIZE
4297	mov	f65 = f0
4298	}
4299	{ .mmf
4300	nop	__LINE__
4301	nop	__LINE__
4302	mov	f73 = f0
4303	}
4304	;;
4305	{ .mmf
4306	STFD	[C4 ] = f12, 5 * SIZE
4307	STFD	[C12] = f13, 5 * SIZE
4308	mov	f81 = f0
4309	}
4310	{ .mfb
4311	adds	I = -1, I
4312	mov	f89 = f0
4313	(p6)	br.cond.dptk .L052
4314	}
4315	;;
4316	.align 32
4317
// .L060: set-up for a tile of four A values by four B values per k-step.
// Clears f66 and f74, then branches to .L070 when bit 2 of M is clear.
// Otherwise it preloads B values f48-f51 and A values f32-f35, clears
// f82/f90/f67/f75/f83/f91 (f64/f65, f72/f73, f80/f81, f88/f89 are already zero
// on the paths visible here), sets the PREA/PREB prefetch pointers, and
// programs ar.lc with ((K + 1) >> 1) - 1.
// p12 is set when bit 0 of K + 1 is zero; p3 starts out true.
4318.L060:
4319	{ .mfi
4320	nop	__LINE__
4321	mov	f66  = f0
4322	tbit.z	p6, p7  = M, 2
4323	}
4324	{ .mfb
4325	nop	__LINE__
4326	mov	f74  = f0
4327	(p6)	br.cond.dptk .L070
4328	}
4329	;;
4330	{ .mfb
4331	LDFPD	f48, f49 = [B]
4332	mov	f82  = f0
4333	nop	__LINE__
4334	}
4335	{ .mfi
4336	adds	BOFFSET = 2 * SIZE, B
4337	mov	f90  = f0
4338	adds	L =  1, K
4339	}
4340	;;
4341	{ .mii
4342	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
4343	tbit.z	p12, p0 = L, 0
4344	shr	L = L, 1
4345	}
4346	;;
4347	{ .mfi
4348	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
4349	mov	f67  = f0
4350	adds	L =  -1, L
4351	}
4352	{ .mfi
4353	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
4354	mov	f75  = f0
4355	nop	__LINE__
4356	}
4357	;;
4358	{ .mfi
4359	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
4360	mov	f83  = f0
4361	mov	ar.lc = L
4362	}
4363	{ .mfi
4364	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
4365	mov	f91  = f0
4366	cmp.eq	p3, p0 = r0, r0
4367	}
4368	;;
4369	.align 32
4370
// .L062: counted loop (br.cloop) over K for the 4x4 tile, followed by its
// write-back.
// Accumulator layout: A value f32+i (i = 0..3) times B values f48..f51 is
// accumulated into f64+i, f72+i, f80+i and f88+i respectively.
// While p3 is set, the second half of each pass repeats this with A values
// f40-f43 and B values f56-f59.  When p12 is set, p3 is recomputed as
// (L != 0), which drops that second half on the final pass.
// p4 (L != 0) reloads f32-f35 and f48-f51 for the next pass.
// p5 (L == 0) sets C9..C12 = C1..C4 + 4 * SIZE and starts loading the current
// C values through C1/C9 and C2/C10 on the final pass.
4371.L062:
4372	{ .mfi
4373	lfetch.nt1	[PREA],  8 * SIZE
4374	FMA	f64   = f32, f48, f64	// A1 * B1
4375	cmp.ne	p4, p5 =  0, L
4376	}
4377	{ .mfi
4378	nop	__LINE__
4379	FMA	f72   = f32, f49, f72	// A1 * B2
4380	(p12) cmp.ne p3, p0 =  0, L
4381	}
4382	;;
4383	{ .mfi
4384	lfetch.nt1	[PREB],   8 * SIZE
4385	FMA	f80   = f32, f50, f80	// A1 * B3
4386	(p5) adds	C9  = 4 * SIZE, C1
4387	}
4388	{ .mfi
4389	nop	__LINE__
4390	FMA	f88   = f32, f51, f88	// A1 * B4
4391	(p5) adds	C10 = 4 * SIZE, C2
4392	}
4393	;;
4394	{ .mfi
4395	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
4396	FMA	f65   = f33, f48, f65	// A2 * B1
4397	(p5) adds	C11 = 4 * SIZE, C3
4398	}
4399	{ .mfi
4400	nop	__LINE__
4401	FMA	f73   = f33, f49, f73	// A2 * B2
4402	(p5) adds	C12 = 4 * SIZE, C4
4403	}
4404	;;
4405	{ .mfb
4406	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
4407	FMA	f81   = f33, f50, f81	// A2 * B3
4408	nop	__LINE__
4409	}
4410	{ .mfb
4411	nop	__LINE__
4412	FMA	f89   = f33, f51, f89	// A2 * B4
4413	nop	__LINE__
4414	}
4415	;;
4416	{ .mfb
4417	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
4418	FMA	f66   = f34, f48, f66	// A3 * B1
4419	nop	__LINE__
4420	}
4421	{ .mfb
4422	nop	__LINE__
4423	FMA	f74   = f34, f49, f74	// A3 * B2
4424	nop	__LINE__
4425	}
4426	;;
4427	{ .mfb
4428	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
4429	FMA	f82   = f34, f50, f82	// A3 * B3
4430	nop	__LINE__
4431	}
4432	{ .mfb
4433	nop	__LINE__
4434	FMA	f90   = f34, f51, f90	// A3 * B4
4435	nop	__LINE__
4436	}
4437	;;
// NOTE(review): the .mfb bundle below lists only two instructions (no third
// slot), and no ";;" stop separates this bundle pair from the next one,
// unlike the rest of the loop.  The f48/f49 reload in the following pair
// therefore shares an instruction group with the FMAs here that read
// f48/f49 -- presumably an intentional write-after-read; confirm.
4438	{ .mfb
4439	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
4440	FMA	f67   = f35, f48, f67	// A4 * B1
4441	}
4442	{ .mfb
4443	(p5) LDFD	f6  = [C1 ], SIZE
4444	FMA	f75   = f35, f49, f75	// A4 * B2
4445	nop	__LINE__
4446	}
4447
4448	{ .mfb
4449	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
4450	FMA	f83   = f35, f50, f83	// A4 * B3
4451	nop	__LINE__
4452	}
4453	{ .mfb
4454	(p5) LDFD	f7  = [C9 ], SIZE
4455	FMA	f91   = f35, f51, f91	// A4 * B4
4456	nop	__LINE__
4457	}
4458	;;
4459	{ .mfb
4460	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
4461	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
4462	nop	__LINE__
4463	}
4464	{ .mfb
4465	(p5) LDFD	f10 = [C1 ], SIZE
4466	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
4467	nop	__LINE__
4468	}
4469	;;
4470	{ .mfb
4471	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
4472	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
4473	nop	__LINE__
4474	}
4475	{ .mfb
4476	(p5) LDFD	f11 = [C9 ], SIZE
4477	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
4478	nop	__LINE__
4479	}
4480	;;
4481	{ .mfb
4482	(p5) LDFD	f12 = [C1 ], SIZE
4483	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
4484	nop	__LINE__
4485	}
4486	{ .mfb
4487	(p5) LDFD	f13 = [C9], SIZE
4488	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
4489	nop	__LINE__
4490	}
4491	;;
4492	{ .mfb
4493	(p5) LDFD	f14 = [C1 ], - 3 * SIZE
4494	(p3) FMA	f81   = f41, f58, f81	// A2 * B3
4495	nop	__LINE__
4496	}
4497	{ .mfb
4498	(p5) LDFD	f15 = [C9], - 3 * SIZE
4499	(p3) FMA	f89   = f41, f59, f89	// A2 * B4
4500	nop	__LINE__
4501	}
4502	;;
4503	{ .mfb
4504	(p5) LDFD	f16  = [C2 ], SIZE
4505	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
4506	nop	__LINE__
4507	}
4508	{ .mfb
4509	(p5) LDFD	f17  = [C10], SIZE
4510	(p3) FMA	f74   = f42, f57, f74	// A3 * B2
4511	nop	__LINE__
4512	}
4513	;;
4514	{ .mfb
4515	(p5) LDFD	f18 = [C2 ], SIZE
4516	(p3) FMA	f82   = f42, f58, f82	// A3 * B3
4517	nop	__LINE__
4518	}
4519	{ .mfb
4520	(p5) LDFD	f19 = [C10], SIZE
4521	(p3) FMA	f90   = f42, f59, f90	// A3 * B4
4522	nop	__LINE__
4523	}
4524	;;
4525	{ .mfb
4526	(p5) LDFD	f20 = [C2 ], SIZE
4527	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
4528	nop	__LINE__
4529	}
4530	{ .mfb
4531	(p5) LDFD	f21 = [C10], SIZE
4532	(p3) FMA	f75   = f43, f57, f75	// A4 * B2
4533	nop	__LINE__
4534	}
4535	;;
4536	{ .mfi
4537	(p5) LDFD	f22 = [C2 ], -3 * SIZE
4538	(p3) FMA	f83   = f43, f58, f83	// A4 * B3
4539	adds	L = -1, L
4540	}
4541	{ .mfb
4542	(p5) LDFD	f23 = [C10], -3 * SIZE
4543	(p3) FMA	f91   = f43, f59, f91	// A4 * B4
4544	br.cloop.sptk.few .L062
4545	}
4546	;;
// Loop finished: write-back for the 4x4 tile.  Each accumulator is applied to
// two consecutive C elements (ALPHA_R * acc added to the first, ALPHA_I * acc
// to the second).  The C values for C3/C11 and C4/C12 are loaded here,
// interleaved with the FMAs and stores.  Each of C1..C4 ends 8 * SIZE past its
// starting value.
4547	{ .mmf
4548	LDFD	f24 = [C3 ], SIZE
4549	LDFD	f25 = [C11], SIZE
4550	FMA	f6   = ALPHA_R, f64, f6
4551	}
4552	{ .mmf
4553	nop	__LINE__
4554	nop	__LINE__
4555	FMA	f7   = ALPHA_R, f66, f7
4556	}
4557	;;
4558	{ .mmf
4559	LDFD	f26 = [C3 ], SIZE
4560	LDFD	f27 = [C11], SIZE
4561	FMA	f10  = ALPHA_I, f64, f10
4562	}
4563	{ .mmf
4564	nop	__LINE__
4565	nop	__LINE__
4566	FMA	f11  = ALPHA_I, f66, f11
4567	}
4568	;;
4569	{ .mmf
4570	LDFD	f28 = [C3 ], SIZE
4571	LDFD	f29 = [C11], SIZE
4572	FMA	f12  = ALPHA_R, f65, f12
4573	}
4574	{ .mmf
4575	nop	__LINE__
4576	nop	__LINE__
4577	FMA	f13  = ALPHA_R, f67, f13
4578	}
4579	;;
4580	{ .mmf
4581	LDFD	f30 = [C3 ], - 3 * SIZE
4582	LDFD	f31 = [C11], - 3 * SIZE
4583	FMA	f14  = ALPHA_I, f65, f14
4584	}
4585	{ .mmf
4586	nop	__LINE__
4587	nop	__LINE__
4588	FMA	f15  = ALPHA_I, f67, f15
4589	}
4590	;;
4591	{ .mmf
4592	STFD	[C1 ] = f6, SIZE
4593	STFD	[C9 ] = f7, SIZE
4594	FMA	f16  = ALPHA_R, f72, f16
4595	}
4596	{ .mmf
4597	LDFD	f32 = [C4 ], SIZE
4598	LDFD	f33 = [C12], SIZE
4599	FMA	f17  = ALPHA_R, f74, f17
4600	}
4601	;;
4602	{ .mmf
4603	STFD	[C1 ] = f10, SIZE
4604	STFD	[C9 ] = f11, SIZE
4605	FMA	f18  = ALPHA_I, f72, f18
4606	}
4607	{ .mmf
4608	LDFD	f34 = [C4 ], SIZE
4609	LDFD	f35 = [C12], SIZE
4610	FMA	f19  = ALPHA_I, f74, f19
4611	}
4612	;;
4613	{ .mmf
4614	STFD	[C1 ] = f12, SIZE
4615	STFD	[C9 ] = f13, SIZE
4616	FMA	f20  = ALPHA_R, f73, f20
4617	}
4618	{ .mmf
4619	LDFD	f36 = [C4 ], SIZE
4620	LDFD	f37 = [C12], SIZE
4621	FMA	f21  = ALPHA_R, f75, f21
4622	}
4623	;;
4624	{ .mmf
4625	STFD	[C1 ] = f14, 5 * SIZE
4626	STFD	[C9 ] = f15, 5 * SIZE
4627	FMA	f22  = ALPHA_I, f73, f22
4628	}
4629	{ .mmf
4630	LDFD	f38 = [C4 ], - 3 * SIZE
4631	LDFD	f39 = [C12], - 3 * SIZE
4632	FMA	f23  = ALPHA_I, f75, f23
4633	}
4634	;;
4635	{ .mmf
4636	STFD	[C2 ] = f16, SIZE
4637	STFD	[C10] = f17, SIZE
4638	FMA	f24  = ALPHA_R, f80, f24
4639	}
4640	{ .mmf
4641	nop	__LINE__
4642	nop	__LINE__
4643	FMA	f25  = ALPHA_R, f82, f25
4644	}
4645	;;
4646	{ .mmf
4647	STFD	[C2 ] = f18, SIZE
4648	STFD	[C10] = f19, SIZE
4649	FMA	f26  = ALPHA_I, f80, f26
4650	}
4651	{ .mmf
4652	nop	__LINE__
4653	nop	__LINE__
4654	FMA	f27  = ALPHA_I, f82, f27
4655	}
4656	;;
4657	{ .mmf
4658	STFD	[C2 ] = f20, SIZE
4659	STFD	[C10] = f21, SIZE
4660	FMA	f28  = ALPHA_R, f81, f28
4661	}
4662	{ .mmf
4663	nop	__LINE__
4664	nop	__LINE__
4665	FMA	f29  = ALPHA_R, f83, f29
4666	}
4667	;;
4668	{ .mmf
4669	STFD	[C2 ] = f22, 5 * SIZE
4670	STFD	[C10] = f23, 5 * SIZE
4671	FMA	f30  = ALPHA_I, f81, f30
4672	}
4673	{ .mmf
4674	nop	__LINE__
4675	nop	__LINE__
4676	FMA	f31  = ALPHA_I, f83, f31
4677	}
4678	;;
4679	{ .mmf
4680	STFD	[C3 ] = f24, SIZE
4681	STFD	[C11] = f25, SIZE
4682	FMA	f32  = ALPHA_R, f88, f32
4683	}
4684	{ .mmf
4685	nop	__LINE__
4686	nop	__LINE__
4687	FMA	f33  = ALPHA_R, f90, f33
4688	}
4689	;;
4690	{ .mmf
4691	STFD	[C3 ] = f26, SIZE
4692	STFD	[C11] = f27, SIZE
4693	FMA	f34  = ALPHA_I, f88, f34
4694	}
4695	{ .mmf
4696	nop	__LINE__
4697	nop	__LINE__
4698	FMA	f35  = ALPHA_I, f90, f35
4699	}
4700	;;
4701	{ .mmf
4702	STFD	[C3 ] = f28, SIZE
4703	STFD	[C11] = f29, SIZE
4704	FMA	f36  = ALPHA_R, f89, f36
4705	}
4706	{ .mmf
4707	nop	__LINE__
4708	nop	__LINE__
4709	FMA	f37  = ALPHA_R, f91, f37
4710	}
4711	;;
4712	{ .mmf
4713	STFD	[C3 ] = f30, 5 * SIZE
4714	STFD	[C11] = f31, 5 * SIZE
4715	FMA	f38  = ALPHA_I, f89, f38
4716	}
4717	{ .mmf
4718	nop	__LINE__
4719	nop	__LINE__
4720	FMA	f39  = ALPHA_I, f91, f39
4721	}
4722	;;
// Final stores through C4/C12, overlapped with clearing
// f64/f72/f80/f88/f81/f65/f89/f73 for the code that follows.
4723	{ .mmf
4724	STFD	[C4 ] = f32, SIZE
4725	STFD	[C12] = f33, SIZE
4726	mov	f64  = f0
4727	}
4728	{ .mmf
4729	nop	__LINE__
4730	nop	__LINE__
4731	mov	f72  = f0
4732	}
4733	;;
4734	{ .mmf
4735	STFD	[C4 ] = f34, SIZE
4736	STFD	[C12] = f35, SIZE
4737	mov	f80  = f0
4738	}
4739	{ .mmf
4740	nop	__LINE__
4741	nop	__LINE__
4742	mov	f88  = f0
4743	}
4744	;;
4745	{ .mmf
4746	STFD	[C4 ] = f36, SIZE
4747	STFD	[C12] = f37, SIZE
4748	mov	f81  = f0
4749	}
4750	{ .mmf
4751	nop	__LINE__
4752	nop	__LINE__
4753	mov	f65  = f0
4754	}
4755	;;
4756	{ .mmf
4757	STFD	[C4 ] = f38, 5 * SIZE
4758	STFD	[C12] = f39, 5 * SIZE
4759	mov	f89  = f0
4760	}
4761	{ .mmf
4762	nop	__LINE__
4763	nop	__LINE__
4764	mov	f73  = f0
4765	}
4766	;;
4767	.align 32
4768
/* .L070: two-row remainder of M (bit 1 of M).  When that bit is clear the
   section is skipped by branching to .L080. */
4769.L070:
4770	{ .mib
4771	nop	__LINE__
4772	tbit.z	p6,p7  = M, 1
4773	(p6)	br.cond.dptk .L080
4774	}
4775	;;
/* Preload the first B pair into f48/f49, start BOFFSET two elements past B,
   and seed the trip count with L = K + 1. */
4776	{ .mmi
4777	LDFPD	f48, f49 = [B]
4778	adds	BOFFSET = 2 * SIZE, B
4779	adds	L =  1, K
4780	}
4781	;;
/* p3 starts out true; p12 is set when bit 0 of K + 1 is zero (K odd);
   L becomes (K + 1) >> 1. */
4782	{ .mii
4783	cmp.eq	p3, p0 = r0, r0
4784	tbit.z	p12, p0 = L, 0
4785	shr	L = L, 1
4786	}
4787	;;
/* p7 is the complement of p6 from the tbit.z above, so it is set whenever
   this fall-through path is reached.  L is reduced by one before it is
   copied into ar.lc below. */
4788	{ .mmi
4789	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
4790	adds	L =  -1, L
4791	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
4792	}
4793	;;
/* Second B pair into f50/f51, prefetch pointer for A, loop counter. */
4794	{ .mmi
4795	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
4796	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
4797	mov	ar.lc = L
4798	}
4799	;;
4800	.align 32
4801
/* .L072: K loop of the two-row remainder.  Each pass has two halves:
     - unconditional half: A pair f32/f33 times B values f48..f51
     - (p3) half:          A pair f40/f41 times B values f56..f59
   Accumulators: f64/f72/f80/f88 for A1 and f65/f73/f81/f89 for A2.
   p4 (L != 0) guards the operand loads for the next pass; p5 (L == 0, i.e.
   the final pass) guards the loads of the current C values.  When p12 is
   set, p3 is recomputed as L != 0, which disables the second half on the
   final pass. */
4802.L072:
4803	{ .mfb
4804	lfetch.nt1	[PREA],  4 * SIZE
4805	FMA	f64   = f32, f48, f64	// A1 * B1
4806	nop	__LINE__
4807	}
4808	{ .mfi
4809	nop	__LINE__
4810	FMA	f72   = f32, f49, f72	// A1 * B2
4811	(p12) cmp.ne p3, p0 =  0, L
4812	}
4813	;;
4814	{ .mfi
4815	lfetch.nt1	[PREB],   8 * SIZE
4816	FMA	f80   = f32, f50, f80	// A1 * B3
4817	cmp.ne	p4, p5 =  0, L
4818	}
4819	{ .mfb
4820	nop	__LINE__
4821	FMA	f88   = f32, f51, f88	// A1 * B4
4822	nop	__LINE__
4823	}
4824	;;
4825	{ .mfi
4826	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
4827	FMA	f65   = f33, f48, f65	// A2 * B1
4828	}
4829	{ .mfi
4830	nop	__LINE__
4831	FMA	f73   = f33, f49, f73	// A2 * B2
4832	}
4833	;;
/* Final pass only (p5): read four consecutive values from C1 into
   f6, f7, f10, f11 and from C2 into f12..f15; the last load of each
   column rewinds the pointer by 3 * SIZE. */
4834	{ .mfi
4835	(p3) LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
4836	FMA	f81   = f33, f50, f81	// A2 * B3
4837	}
4838	{ .mmf
4839	(p5) LDFD	f6  = [C1 ], SIZE
4840	(p5) LDFD	f12 = [C2 ], SIZE
4841	FMA	f89   = f33, f51, f89	// A2 * B4
4842	}
4843	;;
4844	{ .mfb
4845	(p3) LDFPD	f58, f59 = [BOFFSET], 2 * SIZE
4846	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
4847	nop	__LINE__
4848	}
4849	{ .mmf
4850	(p5) LDFD	f7  = [C1 ], SIZE
4851	(p5) LDFD	f13 = [C2 ], SIZE
4852	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
4853	}
4854	;;
4855	{ .mfb
4856	(p4) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
4857	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
4858	nop	__LINE__
4859	}
4860	{ .mmf
4861	(p5) LDFD	f10 = [C1 ], SIZE
4862	(p5) LDFD	f14 = [C2 ], SIZE
4863	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
4864	}
4865	;;
4866	{ .mfb
4867	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
4868	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
4869	nop	__LINE__
4870	}
4871	{ .mfb
4872	(p5) LDFD	f11 = [C1 ], - 3 * SIZE
4873	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
4874	nop	__LINE__
4875	}
4876	;;
/* L is decremented alongside ar.lc so the p4/p5/p3 compares above see the
   remaining pass count. */
4877	{ .mfi
4878	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
4879	(p3) FMA	f81   = f41, f58, f81	// A2 * B3
4880	adds	L = -1, L
4881	}
4882	{ .mfb
4883	(p5) LDFD	f15 = [C2 ], - 3 * SIZE
4884	(p3) FMA	f89   = f41, f59, f89	// A2 * B4
4885	br.cloop.sptk.few .L072
4886	}
4887	;;
/* Write-back.  Each accumulator is combined with two consecutive C values:
   once scaled by ALPHA_R and once by ALPHA_I (presumably the real and
   imaginary parts of one complex element - TODO confirm against the kernel
   contract).  Columns C3/C4 are read here (f16..f19, f20..f23) while the
   C1/C2 updates are computed. */
4888	{ .mmf
4889	LDFD	f16 = [C3], SIZE
4890	LDFD	f20 = [C4], SIZE
4891	FMA	f6   = ALPHA_R, f64, f6
4892	}
4893	{ .mmf
4894	nop	__LINE__
4895	nop	__LINE__
4896	FMA	f12  = ALPHA_R, f72, f12
4897	}
4898	;;
4899	{ .mmf
4900	LDFD	f17 = [C3], SIZE
4901	LDFD	f21 = [C4], SIZE
4902	FMA	f7   = ALPHA_I, f64, f7
4903	}
4904	{ .mmf
4905	nop	__LINE__
4906	nop	__LINE__
4907	FMA	f13  = ALPHA_I, f72, f13
4908	}
4909	;;
4910	{ .mmf
4911	LDFD	f18 = [C3], SIZE
4912	LDFD	f22 = [C4], SIZE
4913	FMA	f10  = ALPHA_R, f65, f10
4914	}
4915	{ .mmf
4916	nop	__LINE__
4917	nop	__LINE__
4918	FMA	f14  = ALPHA_R, f73, f14
4919	}
4920	;;
/* The fourth load of each column rewinds C3/C4 by 3 * SIZE so the stores
   below start at the same element the loads started from. */
4921	{ .mmf
4922	LDFD	f19 = [C3], - 3 * SIZE
4923	LDFD	f23 = [C4], - 3 * SIZE
4924	FMA	f11  = ALPHA_I, f65, f11
4925	}
4926	{ .mmf
4927	nop	__LINE__
4928	nop	__LINE__
4929	FMA	f15  = ALPHA_I, f73, f15
4930	}
4931	;;
/* Store the four updated values of C1 and C2 (each pointer ends four
   elements further on) while the C3/C4 updates are computed. */
4932	{ .mmf
4933	STFD	[C1] = f6,  SIZE
4934	STFD	[C2] = f12, SIZE
4935	FMA	f16  = ALPHA_R, f80, f16
4936	}
4937	{ .mmf
4938	nop	__LINE__
4939	nop	__LINE__
4940	FMA	f20  = ALPHA_R, f88, f20
4941	}
4942	;;
4943	{ .mmf
4944	STFD	[C1] = f7,  SIZE
4945	STFD	[C2] = f13, SIZE
4946	FMA	f17  = ALPHA_I, f80, f17
4947	}
4948	{ .mmf
4949	nop	__LINE__
4950	nop	__LINE__
4951	FMA	f21  = ALPHA_I, f88, f21
4952	}
4953	;;
4954	{ .mmf
4955	STFD	[C1] = f10, SIZE
4956	STFD	[C2] = f14, SIZE
4957	FMA	f18  = ALPHA_R, f81, f18
4958	}
4959	{ .mmf
4960	nop	__LINE__
4961	nop	__LINE__
4962	FMA	f22  = ALPHA_R, f89, f22
4963	}
4964	;;
4965	{ .mmf
4966	STFD	[C1] = f11, SIZE
4967	STFD	[C2] = f15, SIZE
4968	FMA	f19  = ALPHA_I, f81, f19
4969	}
4970	{ .mmf
4971	nop	__LINE__
4972	nop	__LINE__
4973	FMA	f23  = ALPHA_I, f89, f23
4974	}
4975	;;
/* Store C3/C4 and clear f64, f72, f80 and f88, the accumulators that the
   following .L080 section adds into. */
4976	{ .mmf
4977	STFD	[C3] = f16, SIZE
4978	STFD	[C4] = f20, SIZE
4979	mov	f64  = f0
4980	}
4981	;;
4982	{ .mmf
4983	STFD	[C3] = f17, SIZE
4984	STFD	[C4] = f21, SIZE
4985	mov	f72  = f0
4986	}
4987	;;
4988	{ .mmf
4989	STFD	[C3] = f18, SIZE
4990	STFD	[C4] = f22, SIZE
4991	mov	f80  = f0
4992	}
4993	;;
4994	{ .mmf
4995	STFD	[C3] = f19, SIZE
4996	STFD	[C4] = f23, SIZE
4997	mov	f88  = f0
4998	}
4999	;;
5000	.align 32
5001
/* .L080: single-row remainder of M (bit 0 of M).  When that bit is clear,
   branch to .L089. */
5002.L080:
5003	{ .mib
5004	nop	__LINE__
5005	tbit.z	p6,p7  = M, 0
5006	(p6)	br.cond.dptk .L089
5007	}
5008	;;
/* First B pair into f48/f49, BOFFSET two elements past B, L = K + 1. */
5009	{ .mmi
5010	LDFPD	f48, f49 = [B]
5011	adds	BOFFSET = 2 * SIZE, B
5012	adds	L =  1, K
5013	}
5014	;;
/* One A value into f32; p12 is set when bit 0 of K + 1 is zero (K odd);
   L = (K + 1) >> 1. */
5015	{ .mii
5016	LDFD	f32 = [AOFFSET], 1 * SIZE
5017	tbit.z	p12, p0 = L, 0
5018	shr	L = L, 1
5019	}
5020	;;
5021	{ .mmi
5022	nop	__LINE__
5023	nop	__LINE__
5024	adds	L =  -1, L
5025	}
5026	;;
/* Second B pair into f50/f51; p3 starts true; ar.lc gets the adjusted L. */
5027	{ .mmi
5028	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE
5029	cmp.eq	p3, p0 = r0, r0
5030	mov	ar.lc = L
5031	}
5032	;;
5033	.align 32
5034
/* .L082: K loop of the single-row remainder.  A value f32 (and, under p3,
   f40) is multiplied by four B values and accumulated into f64, f72, f80
   and f88.  p4 (L != 0) guards loads for the next pass; p5 (L == 0) guards
   the C1/C2 loads on the final pass; with p12 set, p3 becomes L != 0 so the
   second half is skipped on the final pass. */
5035.L082:
5036	{ .mfb
5037	cmp.ne	p4, p5 =  0, L
5038	FMA	f64   = f32, f48, f64	// A1 * B1
5039	nop	__LINE__
5040	}
5041	{ .mfi
5042	(p12) cmp.ne p3, p0 =  0, L
5043	FMA	f72   = f32, f49, f72	// A1 * B2
5044	nop	__LINE__
5045	}
5046	;;
5047	{ .mfb
5048	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
5049	FMA	f80   = f32, f50, f80	// A1 * B3
5050	nop	__LINE__
5051	}
5052	{ .mfb
5053	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE
5054	FMA	f88   = f32, f51, f88	// A1 * B4
5055	nop	__LINE__
5056	}
5057	;;
/* Final pass only (p5): two values from each of C1 (f6, f7) and C2
   (f10, f11); the second load steps the pointer back by SIZE. */
5058	{ .mfb
5059	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE
5060	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
5061	nop	__LINE__
5062	}
5063	{ .mmf
5064	(p5) LDFD	f6   = [C1], SIZE
5065	(p5) LDFD	f10  = [C2], SIZE
5066	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
5067	}
5068	;;
5069	{ .mmf
5070	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
5071	(p4) LDFD	f32 = [AOFFSET],   1 * SIZE
5072	(p3) FMA	f80   = f40, f58, f80	// A1 * B3
5073	}
5074	{ .mmf
5075	(p5) LDFD	f7  = [C1], -SIZE
5076	(p5) LDFD	f11 = [C2], -SIZE
5077	(p3) FMA	f88   = f40, f59, f88	// A1 * B4
5078	}
5079	;;
5080	{ .mib
5081	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE
5082	adds	L = -1, L
5083	br.cloop.sptk.few .L082
5084	}
5085	;;
/* Write-back.  Each accumulator updates two consecutive C values, one
   scaled by ALPHA_R and one by ALPHA_I (presumably real/imaginary parts -
   TODO confirm).  C3/C4 values are loaded here into f12/f13 and f14/f15,
   again stepping back by SIZE on the second load. */
5086	{ .mmf
5087	LDFD	f12 = [C3], SIZE
5088	LDFD	f14 = [C4], SIZE
5089	FMA	f6   = ALPHA_R, f64, f6
5090	}
5091	{ .mmf
5092	nop	__LINE__
5093	nop	__LINE__
5094	FMA	f10  = ALPHA_R, f72, f10
5095	}
5096	;;
5097	{ .mmf
5098	LDFD	f13 = [C3], -SIZE
5099	LDFD	f15 = [C4], -SIZE
5100	FMA	f7   = ALPHA_I, f64, f7
5101	}
5102	{ .mmf
5103	nop	__LINE__
5104	nop	__LINE__
5105	FMA	f11  = ALPHA_I, f72, f11
5106	}
5107	;;
5108	{ .mmf
5109	nop	__LINE__
5110	nop	__LINE__
5111	FMA	f12  = ALPHA_R, f80, f12
5112	}
5113	{ .mmf
5114	nop	__LINE__
5115	nop	__LINE__
5116	FMA	f14  = ALPHA_R, f88, f14
5117	}
5118	;;
5119	{ .mmf
5120	nop	__LINE__
5121	nop	__LINE__
5122	FMA	f13  = ALPHA_I, f80, f13
5123	}
5124	{ .mmf
5125	nop	__LINE__
5126	nop	__LINE__
5127	FMA	f15  = ALPHA_I, f88, f15
5128	}
5129	;;
/* Store two values to each of C1..C4; every pointer advances by 2 * SIZE
   in total. */
5130	{ .mmi
5131	STFD	[C1] = f6,  SIZE
5132	STFD	[C2] = f10, SIZE
5133	nop	__LINE__
5134	}
5135	;;
5136	{ .mmi
5137	STFD	[C1] = f7,  SIZE
5138	STFD	[C2] = f11, SIZE
5139	nop	__LINE__
5140	}
5141	;;
5142	{ .mmi
5143	STFD	[C3] = f12, SIZE
5144	STFD	[C4] = f14, SIZE
5145	nop	__LINE__
5146	}
5147	;;
5148	{ .mmi
5149	STFD	[C3] = f13, SIZE
5150	STFD	[C4] = f15, SIZE
5151	nop	__LINE__
5152	}
5153	;;
5154	.align 32
5155
/* .L089: end of the preceding column panel.  B takes the value BOFFSET
   reached while reading B, and AOFFSET is reset to the start of A. */
5156.L089:
5157	{ .mmi
5158	mov	B = BOFFSET
5159	mov	AOFFSET = A
5160	nop	__LINE__
5161	}
5162	;;
5163	.align 16
5164
/* .L090: prologue of the two-column panel (bit 1 of N).  Sets C1 = C and
   C2 = C + LDC, clears accumulators, and computes I = M >> 3, the number of
   eight-row tiles.  Branches to .L130 when bit 1 of N is clear. */
5165.L090:
5166	{ .mfi
5167 	mov	C1 = C
5168	mov	f64  = f0
5169	tbit.z	p6, p0 = N, 1
5170	}
5171	{ .mfi
5172	add	C2 = LDC, C
5173	mov	f72  = f0
5174	shr	I  = M, 3
5175	}
5176	;;
/* f66 is cleared through setf.d from r0 (an M-slot instruction) rather than
   an F-slot mov. */
5177	{ .mfi
5178	setf.d	f66  = r0
5179	mov	f65  = f0
5180	nop	__LINE__
5181	}
5182	{ .mfb
5183	mov	AOFFSET = A
5184	mov	f73  = f0
5185	(p6)	br.cond.dpnt .L130
5186	}
5187	;;
/* Panel is present: advance C by two columns (C += 2 * LDC) for the next
   panel, and go straight to the remainder code at .L100 when there are no
   full eight-row tiles (I == 0). */
5188	{ .mfi
5189	nop	__LINE__
5190	mov	f67  = f0
5191	shladd	C = LDC, 1, C
5192	}
5193	{ .mfb
5194	cmp.eq	p6, p7 = 0, I
5195	mov	f74  = f0
5196	(p6)	br.cond.dpnt .L100
5197	}
5198	;;
5199	.align 32
5200
/* .L092: per-tile setup for an eight-row by two-column tile.  Loads the
   first B pair (f48/f49) and eight A values (f32..f39), clears the
   accumulators not already cleared on entry (f68..f71, f75..f79), issues
   CPREFETCH on the C columns and derives the K-loop control:
   p12 set when bit 0 of K + 1 is zero (K odd), ar.lc = ((K + 1) >> 1) - 1,
   p3 initially true. */
5201.L092:
5202	{ .mfb
5203	LDFPD	f48, f49 = [B]
5204	mov	f68  = f0
5205	nop	__LINE__
5206	}
5207	{ .mfb
5208	adds	BOFFSET = 2 * SIZE, B
5209	mov	f79  = f0
5210	nop	__LINE__
5211	}
5212	;;
5213	{ .mfi
5214	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
5215	mov	f75  = f0
5216	nop	__LINE__
5217	}
5218	;;
5219	{ .mfi
5220	adds	PREC = CPREFETCHSIZE * SIZE, C1
5221	mov	f76  = f0
5222	adds	L =  1, K
5223	}
5224	;;
5225	{ .mfi
5226	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
5227	mov	f69  = f0
5228	tbit.z	p12, p0 = L, 0
5229	}
5230	{ .mfi
5231	cmp.eq	p3, p0 = r0, r0
5232	mov	f77  = f0
5233	shr	L = L, 1
5234	}
5235	;;
5236	{ .mfi
5237	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
5238	adds	L =  -1, L
5239	}
/* First CPREFETCH post-increments PREC by LDC, so the second one below
   touches the next column. */
5240	{ .mmf
5241	LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
5242	CPREFETCH [PREC], LDC
5243	mov	f70  = f0
5244	}
5245	;;
5246	{ .mfi
5247	LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
5248	mov	f78  = f0
5249	mov	ar.lc = L
5250	}
5251	{ .mfi
5252	CPREFETCH [PREC]
5253	mov	f71  = f0
5254	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
5255	}
5256	;;
5257	.align 32
5258
/* .L093: K loop and write-back for the eight-row by two-column tile.
   Unconditional half: A values f32..f39 times B pair f48/f49 into
   f64..f71 (B1) and f72..f79 (B2).  (p3) half: A values f40..f47 times B
   pair f56/f57 into the same accumulators.  p4 (L != 0) guards loads for
   the next pass, p5 (L == 0) guards the C1/C9 loads on the final pass, and
   with p12 set p3 becomes L != 0 so the second half is skipped on the final
   pass. */
5259.L093:
5260/*  1 */
5261	{ .mfi
5262	lfetch.nt1	[PREA],  16 * SIZE
5263	FMA	f64   = f32, f48, f64	// A1 * B1
5264	cmp.ne	p4, p5 =  0, L
5265	}
5266	{ .mfi
5267	nop	__LINE__
5268	FMA	f72   = f32, f49, f72	// A1 * B2
5269	(p12) cmp.ne p3, p0 =  0, L
5270	}
5271	;;
/* C9/C10 address the second group of four values in columns C1/C2. */
5272	{ .mfi
5273	lfetch.nt1	[PREB],   4 * SIZE
5274	FMA	f65   = f33, f48, f65	// A2 * B1
5275	adds	C9  = 4 * SIZE, C1
5276	}
5277	{ .mfi
5278	nop	__LINE__
5279	FMA	f73   = f33, f49, f73	// A2 * B2
5280	adds	C10 = 4 * SIZE, C2
5281	}
5282	;;
/* NOTE(review): C11/C12 are derived from C3/C4 here but are not referenced
   again anywhere in this two-column section - possibly a leftover from the
   four-column code; confirm before removing. */
5283	{ .mfi
5284	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
5285	FMA	f66   = f34, f48, f66	// A3 * B1
5286	adds	C11 = 4 * SIZE, C3
5287	}
5288	{ .mfi
5289	nop	__LINE__
5290	FMA	f74   = f34, f49, f74	// A3 * B2
5291	adds	C12 = 4 * SIZE, C4
5292	}
5293	;;
/* Final pass only (p5): sixteen values of column C1 are read, alternating
   between C1 and C9.  Each pointer reads four values, skips ahead by
   5 * SIZE, reads four more and is rewound by 11 * SIZE to its start. */
5294	{ .mfb
5295	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
5296	FMA	f67   = f35, f48, f67	// A4 * B1
5297	nop	__LINE__
5298	}
5299	{ .mfb
5300	(p5) LDFD	f6  = [C1 ], SIZE
5301	FMA	f75   = f35, f49, f75	// A4 * B2
5302	nop	__LINE__
5303	}
5304	;;
5305	{ .mfb
5306	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
5307	FMA	f68   = f36, f48, f68	// A5 * B1
5308	nop	__LINE__
5309	}
5310	{ .mfb
5311	(p5) LDFD	f7 = [C9 ], SIZE
5312	FMA	f76   = f36, f49, f76	// A5 * B2
5313	nop	__LINE__
5314	}
5315	;;
5316	{ .mfb
5317	(p3) LDFPD	f44, f45 = [AOFFSET], 2 * SIZE
5318	FMA	f69   = f37, f48, f69	// A6 * B1
5319	nop	__LINE__
5320	}
5321	{ .mfb
5322	(p5) LDFD	f10 = [C1 ], SIZE
5323	FMA	f77   = f37, f49, f77	// A6 * B2
5324	nop	__LINE__
5325	}
5326	;;
5327	{ .mfb
5328	(p3) LDFPD	f46, f47 = [AOFFSET], 2 * SIZE
5329	FMA	f70   = f38, f48, f70	// A7 * B1
5330	nop	__LINE__
5331	}
5332	{ .mfb
5333	(p5) LDFD	f11 = [C9 ], SIZE
5334	FMA	f78   = f38, f49, f78	// A7 * B2
5335	nop	__LINE__
5336	}
5337	;;
5338	{ .mfb
5339	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
5340	FMA	f71   = f39, f48, f71	// A8 * B1
5341	nop	__LINE__
5342	}
5343	{ .mfb
5344	(p5) LDFD	f12 = [C1 ], SIZE
5345	FMA	f79   = f39, f49, f79	// A8 * B2
5346	nop	__LINE__
5347	}
5348	;;
/* Second half of the pass (p3): same products with f40..f47 and f56/f57. */
5349	{ .mfb
5350	(p4) LDFPD	f48, f49 = [BOFFSET],  2 * SIZE
5351	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
5352	nop	__LINE__
5353	}
5354	{ .mfb
5355	(p5) LDFD	f13 = [C9 ], SIZE
5356	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
5357	nop	__LINE__
5358	}
5359	;;
5360	{ .mfb
5361	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
5362	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
5363	nop	__LINE__
5364	}
5365	{ .mfb
5366	(p5) LDFD	f14 = [C1 ], 5 * SIZE
5367	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
5368	nop	__LINE__
5369	}
5370	;;
5371	{ .mfb
5372	(p4) LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
5373	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
5374	nop	__LINE__
5375	}
5376	{ .mfb
5377	(p5) LDFD	f15 = [C9 ], 5 * SIZE
5378	(p3) FMA	f74   = f42, f57, f74	// A3 * B2
5379	nop	__LINE__
5380	}
5381	;;
5382	{ .mfb
5383	(p4) LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
5384	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
5385	nop	__LINE__
5386	}
5387	{ .mfb
5388	nop	__LINE__
5389	(p3) FMA	f75   = f43, f57, f75	// A4 * B2
5390	nop	__LINE__
5391	}
5392	;;
5393	{ .mfb
5394	(p5) LDFD	f16 = [C1 ], SIZE
5395	(p3) FMA	f68   = f44, f56, f68	// A5 * B1
5396	nop	__LINE__
5397	}
5398	{ .mfb
5399	(p5) LDFD	f17 = [C9 ], SIZE
5400	(p3) FMA	f76   = f44, f57, f76	// A5 * B2
5401	nop	__LINE__
5402	}
5403	;;
5404	{ .mfb
5405	(p5) LDFD	f18 = [C1 ], SIZE
5406	(p3) FMA	f69   = f45, f56, f69	// A6 * B1
5407	nop	__LINE__
5408	}
5409	{ .mfb
5410	(p5) LDFD	f19 = [C9 ], SIZE
5411	(p3) FMA	f77   = f45, f57, f77	// A6 * B2
5412	nop	__LINE__
5413	}
5414	;;
5415	{ .mfb
5416	(p5) LDFD	f20 = [C1 ], SIZE
5417	(p3) FMA	f70   = f46, f56, f70	// A7 * B1
5418	nop	__LINE__
5419	}
5420	{ .mfb
5421	(p5) LDFD	f21 = [C9 ], SIZE
5422	(p3) FMA	f78   = f46, f57, f78	// A7 * B2
5423	nop	__LINE__
5424	}
5425	;;
5426	{ .mfi
5427	(p5) LDFD	f22 = [C1 ], -11 * SIZE
5428	(p3) FMA	f71   = f47, f56, f71	// A8 * B1
5429	adds	L = -1, L
5430	}
5431	{ .mfb
5432	(p5) LDFD	f23 = [C9 ], -11 * SIZE
5433	(p3) FMA	f79   = f47, f57, f79	// A8 * B2
5434	br.cloop.sptk.few .L093
5435	}
5436	;;
/* Write-back for column C1.  Each accumulator updates two consecutive C
   values, one scaled by ALPHA_R and one by ALPHA_I (presumably real and
   imaginary parts - TODO confirm).  The loads of column C2 (f24..f39, via
   C2/C10) are interleaved; f32..f39 are reused for C data now that the A
   operands are no longer needed. */
5437	{ .mmf
5438	LDFD	f24 = [C2 ], SIZE
5439	LDFD	f25 = [C10], SIZE
5440	FMA	f6   = ALPHA_R, f64, f6
5441	}
5442	{ .mmf
5443	nop	__LINE__
5444	nop	__LINE__
5445	FMA	f7   = ALPHA_R, f66, f7
5446	}
5447	;;
5448	{ .mmf
5449	LDFD	f26 = [C2 ], SIZE
5450	LDFD	f27 = [C10], SIZE
5451	FMA	f10  = ALPHA_I, f64, f10
5452	}
5453	{ .mmf
5454	nop	__LINE__
5455	nop	__LINE__
5456	FMA	f11  = ALPHA_I, f66, f11
5457	}
5458	;;
5459	{ .mmf
5460	LDFD	f28 = [C2 ], SIZE
5461	LDFD	f29 = [C10], SIZE
5462	FMA	f12  = ALPHA_R, f65, f12
5463	}
5464	{ .mmf
5465	nop	__LINE__
5466	nop	__LINE__
5467	FMA	f13  = ALPHA_R, f67, f13
5468	}
5469	;;
5470	{ .mmf
5471	LDFD	f30 = [C2 ], 5 * SIZE
5472	LDFD	f31 = [C10], 5 * SIZE
5473	FMA	f14  = ALPHA_I, f65, f14
5474	}
5475	{ .mmf
5476	nop	__LINE__
5477	nop	__LINE__
5478	FMA	f15  = ALPHA_I, f67, f15
5479	}
5480	;;
/* Stores to column C1 begin while the second half of column C2 is still
   being loaded. */
5481	{ .mmf
5482	STFD	[C1 ] = f6, SIZE
5483	STFD	[C9 ] = f7, SIZE
5484	FMA	f16  = ALPHA_R, f68, f16
5485	}
5486	{ .mmf
5487	LDFD	f32 = [C2 ], SIZE
5488	LDFD	f33 = [C10], SIZE
5489	FMA	f17  = ALPHA_R, f70, f17
5490	}
5491	;;
5492	{ .mmf
5493	STFD	[C1 ] = f10, SIZE
5494	STFD	[C9 ] = f11, SIZE
5495	FMA	f18  = ALPHA_I, f68, f18
5496	}
5497	{ .mmf
5498	LDFD	f34 = [C2 ], SIZE
5499	LDFD	f35 = [C10], SIZE
5500	FMA	f19  = ALPHA_I, f70, f19
5501	}
5502	;;
5503	{ .mmf
5504	STFD	[C1 ] = f12, SIZE
5505	STFD	[C9 ] = f13, SIZE
5506	FMA	f20  = ALPHA_R, f69, f20
5507	}
5508	{ .mmf
5509	LDFD	f36 = [C2 ], SIZE
5510	LDFD	f37 = [C10], SIZE
5511	FMA	f21  = ALPHA_R, f71, f21
5512	}
5513	;;
5514	{ .mmf
5515	STFD	[C1 ] = f14, 5 * SIZE
5516	STFD	[C9 ] = f15, 5 * SIZE
5517	FMA	f22  = ALPHA_I, f69, f22
5518	}
5519	{ .mmf
5520	LDFD	f38 = [C2 ], - 11 * SIZE
5521	LDFD	f39 = [C10], - 11 * SIZE
5522	FMA	f23  = ALPHA_I, f71, f23
5523	}
5524	;;
/* Second half of the C1 stores, overlapped with the column-C2 updates
   computed from accumulators f72..f79. */
5525	{ .mmf
5526	STFD	[C1 ] = f16, SIZE
5527	STFD	[C9 ] = f17, SIZE
5528	FMA	f24  = ALPHA_R, f72, f24
5529	}
5530	{ .mmf
5531	nop	__LINE__
5532	nop	__LINE__
5533	FMA	f25  = ALPHA_R, f74, f25
5534	}
5535	;;
5536	{ .mmf
5537	STFD	[C1 ] = f18, SIZE
5538	STFD	[C9 ] = f19, SIZE
5539	FMA	f26  = ALPHA_I, f72, f26
5540	}
5541	{ .mmf
5542	nop	__LINE__
5543	nop	__LINE__
5544	FMA	f27  = ALPHA_I, f74, f27
5545	}
5546	;;
5547	{ .mmf
5548	STFD	[C1 ] = f20, SIZE
5549	STFD	[C9 ] = f21, SIZE
5550	FMA	f28  = ALPHA_R, f73, f28
5551	}
5552	{ .mmf
5553	nop	__LINE__
5554	nop	__LINE__
5555	FMA	f29  = ALPHA_R, f75, f29
5556	}
5557	;;
/* After this store C1 has advanced sixteen elements in total, i.e. to the
   start of the next tile in this column. */
5558	{ .mmf
5559	STFD	[C1 ] = f22, 5 * SIZE
5560	STFD	[C9 ] = f23, 5 * SIZE
5561	FMA	f30  = ALPHA_I, f73, f30
5562	}
5563	{ .mmf
5564	nop	__LINE__
5565	nop	__LINE__
5566	FMA	f31  = ALPHA_I, f75, f31
5567	}
5568	;;
/* Stores to column C2 (via C2/C10). */
5569	{ .mmf
5570	STFD	[C2 ] = f24, SIZE
5571	STFD	[C10] = f25, SIZE
5572	FMA	f32  = ALPHA_R, f76, f32
5573	}
5574	{ .mmf
5575	nop	__LINE__
5576	nop	__LINE__
5577	FMA	f33  = ALPHA_R, f78, f33
5578	}
5579	;;
5580	{ .mmf
5581	STFD	[C2 ] = f26, SIZE
5582	STFD	[C10] = f27, SIZE
5583	FMA	f34  = ALPHA_I, f76, f34
5584	}
5585	{ .mmf
5586	nop	__LINE__
5587	nop	__LINE__
5588	FMA	f35  = ALPHA_I, f78, f35
5589	}
5590	;;
5591	{ .mmf
5592	STFD	[C2 ] = f28, SIZE
5593	STFD	[C10] = f29, SIZE
5594	FMA	f36  = ALPHA_R, f77, f36
5595	}
5596	{ .mmf
5597	nop	__LINE__
5598	nop	__LINE__
5599	FMA	f37  = ALPHA_R, f79, f37
5600	}
5601	;;
5602	{ .mmf
5603	STFD	[C2 ] = f30, 5 * SIZE
5604	STFD	[C10] = f31, 5 * SIZE
5605	FMA	f38  = ALPHA_I, f77, f38
5606	}
5607	{ .mmf
5608	nop	__LINE__
5609	nop	__LINE__
5610	FMA	f39  = ALPHA_I, f79, f39
5611	}
5612	;;
/* Remaining C2 stores, overlapped with clearing f64..f67 and f72..f75 for
   the next tile.  p6 = (I != 1) is evaluated before I is decremented, so
   the branch back to .L092 is taken while tiles remain. */
5613	{ .mmf
5614	STFD	[C2 ] = f32, SIZE
5615	STFD	[C10] = f33, SIZE
5616	mov	f64  = f0
5617	}
5618	{ .mmf
5619	cmp.ne	p6, p0 = 1, I
5620	nop	__LINE__
5621	mov	f72  = f0
5622	}
5623	;;
5624	{ .mmf
5625	STFD	[C2 ] = f34, SIZE
5626	STFD	[C10] = f35, SIZE
5627	mov	f65  = f0
5628	}
5629	{ .mmf
5630	nop	__LINE__
5631	nop	__LINE__
5632	mov	f73  = f0
5633	}
5634	;;
5635	{ .mmf
5636	STFD	[C2 ] = f36, SIZE
5637	STFD	[C10] = f37, SIZE
5638	mov	f66  = f0
5639	}
5640	{ .mmf
5641	nop	__LINE__
5642	nop	__LINE__
5643	mov	f74  = f0
5644	}
5645	;;
5646	{ .mmf
5647	STFD	[C2 ] = f38, 5 * SIZE
5648	STFD	[C10] = f39, 5 * SIZE
5649	mov	f67  = f0
5650	}
5651	{ .mfb
5652	adds	I = -1, I
5653	mov	f75  = f0
5654	(p6)	br.cond.dptk .L092
5655	}
5656	;;
5657	.align 32
5658
/* .L100: four-row remainder of M (bit 2 of M) for the two-column panel.
   When that bit is clear, branch to .L110. */
5659.L100:
5660	{ .mib
5661	nop	__LINE__
5662	tbit.z	p6, p7 = M, 2
5663	(p6)	br.cond.dptk .L110
5664	}
5665	;;
/* First B pair into f48/f49, BOFFSET two elements past B, f75 cleared,
   L = K + 1. */
5666	{ .mmf
5667	LDFPD	f48, f49 = [B]
5668	adds	BOFFSET = 2 * SIZE, B
5669	mov	f75  = f0
5670	}
5671	{ .mii
5672	nop	__LINE__
5673	adds	L =  1, K
5674	}
5675	;;
/* p12 is set when bit 0 of K + 1 is zero (K odd); L = (K + 1) >> 1. */
5676	{ .mii
5677	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
5678	tbit.z	p12, p0 = L, 0
5679	shr	L = L, 1
5680	}
5681	;;
/* Four A values into f32..f35; ar.lc = L - 1; p3 starts true. */
5682	{ .mmi
5683	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
5684	nop	__LINE__
5685	adds	L =  -1, L
5686	}
5687	;;
5688	{ .mmi
5689	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
5690	cmp.eq	p3, p0 = r0, r0
5691	mov	ar.lc = L
5692	}
5693	;;
5694	.align 32
5695
/* .L102: K loop and write-back for the four-row by two-column remainder.
   Unconditional half: f32..f35 times f48/f49 into f64..f67 (B1) and
   f72..f75 (B2); (p3) half: f40..f43 times f56/f57.  p4 (L != 0) guards
   next-pass loads, p5 (L == 0) guards the C loads on the final pass, and
   with p12 set p3 becomes L != 0 on each pass. */
5696.L102:
5697	{ .mfi
5698	lfetch.nt1	[PREA],  8 * SIZE
5699	FMA	f64   = f32, f48, f64	// A1 * B1
5700	cmp.ne	p4, p5 =  0, L
5701	}
/* PREB is recomputed from the current BOFFSET on every pass before the
   lfetch in the next group uses it. */
5702	{ .mfi
5703	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
5704	FMA	f72   = f32, f49, f72	// A1 * B2
5705	(p12) cmp.ne p3, p0 =  0, L
5706	}
5707	;;
/* C9/C10 address the second group of four values in columns C1/C2. */
5708	{ .mfi
5709	lfetch.nt1	[PREB],  4 * SIZE
5710	FMA	f65   = f33, f48, f65	// A2 * B1
5711	adds	C9  = 4 * SIZE, C1
5712	}
5713	{ .mfi
5714	nop	__LINE__
5715	FMA	f73   = f33, f49, f73	// A2 * B2
5716	adds	C10 = 4 * SIZE, C2
5717	}
5718	;;
/* Final pass only (p5): three values each from C1 (f6, f10, f12) and C9
   (f7, f11, f13); the fourth of each is loaded after the loop. */
5719	{ .mfb
5720	(p3) LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
5721	FMA	f66   = f34, f48, f66	// A3 * B1
5722	nop	__LINE__
5723	}
5724	{ .mfb
5725	(p5) LDFD	f6 = [C1 ], SIZE
5726	FMA	f74   = f34, f49, f74	// A3 * B2
5727	nop	__LINE__
5728	}
5729	;;
5730	{ .mfb
5731	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
5732	FMA	f67   = f35, f48, f67	// A4 * B1
5733	nop	__LINE__
5734	}
5735	{ .mfb
5736	(p5) LDFD	f7 = [C9 ], SIZE
5737	FMA	f75   = f35, f49, f75	// A4 * B2
5738	nop	__LINE__
5739	}
5740	;;
5741	{ .mfb
5742	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
5743	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
5744	nop	__LINE__
5745	}
5746	{ .mfb
5747	(p5) LDFD	f10 = [C1 ], SIZE
5748	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
5749	nop	__LINE__
5750	}
5751	;;
5752	{ .mfb
5753	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
5754	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
5755	nop	__LINE__
5756	}
5757	{ .mfb
5758	(p5) LDFD	f11 = [C9 ], SIZE
5759	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
5760	nop	__LINE__
5761	}
5762	;;
5763	{ .mfb
5764	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
5765	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
5766	nop	__LINE__
5767	}
5768	{ .mfb
5769	(p5) LDFD	f12 = [C1], SIZE
5770	(p3) FMA	f74   = f42, f57, f74	// A3 * B2
5771	nop	__LINE__
5772	}
5773	;;
5774	{ .mfi
5775	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
5776	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
5777	adds	L = -1, L
5778	}
5779	{ .mfb
5780	(p5) LDFD	f13 = [C9], SIZE
5781	(p3) FMA	f75   = f43, f57, f75	// A4 * B2
5782	br.cloop.sptk.few .L102
5783	}
5784	;;
/* Write-back.  The fourth C1/C9 values are loaded with a 3 * SIZE rewind so
   the stores start where the loads started.  Each accumulator updates two
   consecutive C values, one scaled by ALPHA_R and one by ALPHA_I
   (presumably real and imaginary parts - TODO confirm). */
5785	{ .mmf
5786	LDFD	f14 = [C1], - 3 * SIZE
5787	LDFD	f15 = [C9], - 3 * SIZE
5788	FMA	f6   = ALPHA_R, f64, f6
5789	}
5790	{ .mmf
5791	nop	__LINE__
5792	nop	__LINE__
5793	FMA	f7   = ALPHA_R, f66, f7
5794	}
5795	;;
/* Column C2 values are loaded into f16..f23 via C2/C10. */
5796	{ .mmf
5797	LDFD	f16 = [C2 ], SIZE
5798	LDFD	f17 = [C10], SIZE
5799	FMA	f10  = ALPHA_I, f64, f10
5800	}
5801	{ .mmf
5802	nop	__LINE__
5803	nop	__LINE__
5804	FMA	f11  = ALPHA_I, f66, f11
5805	}
5806	;;
5807	{ .mmf
5808	LDFD	f18 = [C2 ], SIZE
5809	LDFD	f19 = [C10], SIZE
5810	FMA	f12  = ALPHA_R, f65, f12
5811	}
5812	{ .mmf
5813	nop	__LINE__
5814	nop	__LINE__
5815	FMA	f13  = ALPHA_R, f67, f13
5816	}
5817	;;
5818	{ .mmf
5819	LDFD	f20 = [C2 ], SIZE
5820	LDFD	f21 = [C10], SIZE
5821	FMA	f14  = ALPHA_I, f65, f14
5822	}
5823	{ .mmf
5824	nop	__LINE__
5825	nop	__LINE__
5826	FMA	f15  = ALPHA_I, f67, f15
5827	}
5828	;;
/* Column C1 stores, overlapped with the column C2 updates. */
5829	{ .mmf
5830	STFD	[C1 ] = f6, SIZE
5831	STFD	[C9 ] = f7, SIZE
5832	FMA	f16  = ALPHA_R, f72, f16
5833	}
5834	{ .mmf
5835	LDFD	f22 = [C2 ], - 3 * SIZE
5836	LDFD	f23 = [C10], - 3 * SIZE
5837	FMA	f17  = ALPHA_R, f74, f17
5838	}
5839	;;
5840	{ .mmf
5841	STFD	[C1 ] = f10, SIZE
5842	STFD	[C9 ] = f11, SIZE
5843	FMA	f18  = ALPHA_I, f72, f18
5844	}
5845	{ .mmf
5846	nop	__LINE__
5847	nop	__LINE__
5848	FMA	f19  = ALPHA_I, f74, f19
5849	}
5850	;;
5851	{ .mmf
5852	STFD	[C1 ] = f12, SIZE
5853	STFD	[C9 ] = f13, SIZE
5854	FMA	f20  = ALPHA_R, f73, f20
5855	}
5856	{ .mmf
5857	nop	__LINE__
5858	nop	__LINE__
5859	FMA	f21  = ALPHA_R, f75, f21
5860	}
5861	;;
/* The last store of each group advances by 5 * SIZE, leaving C1 eight
   elements past its starting position. */
5862	{ .mmf
5863	STFD	[C1 ] = f14, 5 * SIZE
5864	STFD	[C9 ] = f15, 5 * SIZE
5865	FMA	f22  = ALPHA_I, f73, f22
5866	}
5867	{ .mmf
5868	nop	__LINE__
5869	nop	__LINE__
5870	FMA	f23  = ALPHA_I, f75, f23
5871	}
5872	;;
/* Column C2 stores, overlapped with clearing f64, f65, f72 and f73 for the
   following section. */
5873	{ .mmf
5874	STFD	[C2 ] = f16, SIZE
5875	STFD	[C10] = f17, SIZE
5876	mov	f64  = f0
5877	}
5878	;;
5879	{ .mmf
5880	STFD	[C2 ] = f18, SIZE
5881	STFD	[C10] = f19, SIZE
5882	mov	f65  = f0
5883	}
5884	;;
5885	{ .mmf
5886	STFD	[C2 ] = f20, SIZE
5887	STFD	[C10] = f21, SIZE
5888	mov	f72  = f0
5889	}
5890	;;
5891	{ .mmf
5892	STFD	[C2 ] = f22, 5 * SIZE
5893	STFD	[C10] = f23, 5 * SIZE
5894	mov	f73  = f0
5895	}
5896	;;
5897	.align 32
5898
/* .L110: two-row remainder of M (bit 1 of M) for the two-column panel.
   When that bit is clear, branch to .L120. */
5899.L110:
5900	{ .mib
5901	nop	__LINE__
5902	tbit.z	p6, p7 = M, 1
5903	(p6)	br.cond.dptk .L120
5904	}
5905	;;
/* First B pair into f48/f49, BOFFSET two elements past B, L = K + 1. */
5906	{ .mmi
5907	LDFPD	f48, f49 = [B]
5908	adds	BOFFSET = 2 * SIZE, B
5909	adds	L =  1, K
5910	}
5911	;;
/* p12 is set when bit 0 of K + 1 is zero (K odd); L = (K + 1) >> 1. */
5912	{ .mii
5913	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
5914	tbit.z	p12, p0 = L, 0
5915	shr	L = L, 1
5916	}
5917	;;
/* A pair into f32/f33; ar.lc = L - 1; p3 starts true. */
5918	{ .mmi
5919	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
5920	nop	__LINE__
5921	adds	L =  -1, L
5922	}
5923	;;
5924	{ .mmi
5925	cmp.eq	p3, p0 = r0, r0
5926	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
5927	mov	ar.lc = L
5928	}
5929	;;
5930	.align 32
5931
/* .L112: K loop and write-back for the two-row by two-column remainder.
   Unconditional half: f32/f33 times f48/f49 into f64/f65 (B1) and f72/f73
   (B2); (p3) half: f40/f41 times f56/f57.  p4 (L != 0) guards next-pass
   loads, p5 (L == 0) guards the C loads on the final pass, and with p12 set
   p3 becomes L != 0 on each pass. */
5932.L112:
5933	{ .mfi
5934	lfetch.nt1	[PREA],  4 * SIZE
5935	FMA	f64   = f32, f48, f64	// A1 * B1
5936	cmp.ne	p4, p5 =  0, L
5937	}
5938	{ .mfi
5939	lfetch.nt1	[PREB],   4 * SIZE
5940	FMA	f72   = f32, f49, f72	// A1 * B2
5941	(p12) cmp.ne p3, p0 =  0, L
5942	}
5943	;;
/* Final pass only (p5): first two values of C1 (f6, f10) and of C2
   (f7, f11); the remaining two of each column are loaded after the loop. */
5944	{ .mmf
5945	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
5946	(p3) LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
5947	FMA	f65   = f33, f48, f65	// A2 * B1
5948	}
5949	{ .mmf
5950	(p5) LDFD	f6 = [C1 ], SIZE
5951	(p5) LDFD	f7 = [C2 ], SIZE
5952	FMA	f73   = f33, f49, f73	// A2 * B2
5953	}
5954	;;
5955	{ .mfb
5956	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
5957	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
5958	nop	__LINE__
5959	}
5960	{ .mfb
5961	(p5) LDFD	f10 = [C1 ], SIZE
5962	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
5963	nop	__LINE__
5964	}
5965	;;
5966	{ .mfi
5967	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
5968	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
5969	adds	L = -1, L
5970	}
5971	{ .mfb
5972	(p5) LDFD	f11 = [C2 ],  SIZE
5973	(p3) FMA	f73   = f41, f57, f73	// A2 * B2
5974	br.cloop.sptk.few .L112
5975	}
5976	;;
/* Write-back.  Third and fourth values of each column are loaded (the
   fourth load rewinds by 3 * SIZE).  Each accumulator updates two
   consecutive C values, one scaled by ALPHA_R and one by ALPHA_I
   (presumably real and imaginary parts - TODO confirm). */
5977	{ .mmf
5978	LDFD	f12 = [C1], SIZE
5979	LDFD	f13 = [C2], SIZE
5980	FMA	f6   = ALPHA_R, f64, f6
5981	}
5982	{ .mmf
5983	nop	__LINE__
5984	nop	__LINE__
5985	FMA	f7   = ALPHA_R, f72, f7
5986	}
5987	;;
5988	{ .mmf
5989	LDFD	f14 = [C1], - 3 * SIZE
5990	LDFD	f15 = [C2], - 3 * SIZE
5991	FMA	f10  = ALPHA_I, f64, f10
5992	}
5993	{ .mmf
5994	nop	__LINE__
5995	nop	__LINE__
5996	FMA	f11  = ALPHA_I, f72, f11
5997	}
5998	;;
5999	{ .mmf
6000	nop	__LINE__
6001	nop	__LINE__
6002	FMA	f12  = ALPHA_R, f65, f12
6003	}
6004	{ .mmf
6005	nop	__LINE__
6006	nop	__LINE__
6007	FMA	f13  = ALPHA_R, f73, f13
6008	}
6009	;;
6010	{ .mmf
6011	nop	__LINE__
6012	nop	__LINE__
6013	FMA	f14  = ALPHA_I, f65, f14
6014	}
6015	{ .mmf
6016	nop	__LINE__
6017	nop	__LINE__
6018	FMA	f15  = ALPHA_I, f73, f15
6019	}
6020	;;
/* Store four values per column (C1 and C2 each advance by 4 * SIZE) and
   clear f64, f72, f65 and f73 for the following section. */
6021	{ .mmf
6022	STFD	[C1] = f6, SIZE
6023	STFD	[C2] = f7, SIZE
6024	mov	f64  = f0
6025	}
6026	;;
6027	{ .mmf
6028	STFD	[C1] = f10, SIZE
6029	STFD	[C2] = f11, SIZE
6030	mov	f72  = f0
6031	}
6032	;;
6033	{ .mmf
6034	STFD	[C1] = f12, SIZE
6035	STFD	[C2] = f13, SIZE
6036	mov	f65  = f0
6037	}
6038	;;
6039	{ .mmf
6040	STFD	[C1] = f14, SIZE
6041	STFD	[C2] = f15, SIZE
6042	mov	f73  = f0
6043	}
6044	;;
6045	.align 32
6046
/* .L120: single-row remainder of M (bit 0 of M) for the two-column panel.
   When that bit is clear, branch to .L129. */
6047.L120:
6048	{ .mib
6049	nop	__LINE__
6050	tbit.z	p6, p7 = M, 0
6051	(p6)	br.cond.dptk .L129
6052	}
6053	;;
/* First B pair into f48/f49, BOFFSET two elements past B, L = K + 1. */
6054	{ .mmi
6055	LDFPD	f48, f49 = [B]
6056	adds	BOFFSET = 2 * SIZE, B
6057	adds	L =  1, K
6058	}
6059	;;
/* p12 is set when bit 0 of K + 1 is zero (K odd); L = (K + 1) >> 1. */
6060	{ .mii
6061	nop	__LINE__
6062	tbit.z	p12, p0 = L, 0
6063	shr	L = L, 1
6064	}
6065	;;
/* One A value into f32; ar.lc = L - 1; p3 starts true. */
6066	{ .mmi
6067	LDFD	f32 = [AOFFSET], 1 * SIZE
6068	nop	__LINE__
6069	adds	L =  -1, L
6070	}
6071	;;
6072	{ .mmi
6073	cmp.eq	p3, p0 = r0, r0
6074	nop	__LINE__
6075	mov	ar.lc = L
6076	}
6077	;;
6078	.align 32
6079
/* .L122: K loop for the single-row by two-column remainder.  f32 (and,
   under p3, f40) is multiplied by the B pair and accumulated into f64 (B1)
   and f72 (B2).  p4 (L != 0) guards next-pass loads; p5 (L == 0) guards the
   first C1/C2 loads on the final pass; with p12 set p3 becomes L != 0. */
6080.L122:
6081	{ .mfi
6082	FMA	f64   = f32, f48, f64	// A1 * B1
6083	cmp.ne	p4, p5 =  0, L
6084	}
6085	{ .mfi
6086	nop	__LINE__
6087	FMA	f72   = f32, f49, f72	// A1 * B2
6088	(p12) cmp.ne p3, p0 =  0, L
6089	}
6090	;;
6091	{ .mmi
6092	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE
6093	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE
6094	nop  __LINE__
6095	}
6096	{ .mmi
6097	(p5) LDFD	f6 = [C1], SIZE
6098	(p5) LDFD	f7 = [C2], SIZE
6099	}
6100	;;
6101	{ .mfi
6102	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE
6103	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
6104	adds	L = -1, L
6105	}
6106	{ .mfb
6107	(p4) LDFD	f32 = [AOFFSET],   1 * SIZE
6108	(p3) FMA	f72   = f40, f57, f72	// A1 * B2
6109	br.cloop.sptk.few .L122
6110	}
6111	;;
6112
/* .L128: write-back for the single-row by two-column remainder.  The second
   value of each column is loaded under p5 (as left by the last compare in
   the .L122 loop) with a -SIZE step so the stores start at the first value.
   f64 and f72 each update two consecutive C values, one scaled by ALPHA_R
   and one by ALPHA_I (presumably real and imaginary parts - TODO confirm). */
6113.L128:
6114	{ .mmf
6115	(p5) LDFD	f10  = [C1], -SIZE
6116	(p5) LDFD	f11  = [C2], -SIZE
6117	FMA	f6   = ALPHA_R, f64, f6
6118	}
6119	{ .mmf
6120	nop	__LINE__
6121	nop	__LINE__
6122	FMA	f7   = ALPHA_R, f72, f7
6123	}
6124	;;
6125	{ .mmf
6126	nop	__LINE__
6127	nop	__LINE__
6128	FMA	f10  = ALPHA_I, f64, f10
6129	}
6130	{ .mmf
6131	nop	__LINE__
6132	nop	__LINE__
6133	FMA	f11  = ALPHA_I, f72, f11
6134	}
6135	;;
/* Two stores per column; C1 and C2 each advance by 2 * SIZE. */
6136	{ .mmi
6137	STFD	[C1 ] = f6, SIZE
6138	STFD	[C2 ] = f7, SIZE
6139	nop	__LINE__
6140	}
6141	;;
6142	{ .mmi
6143	STFD	[C1 ] = f10, SIZE
6144	STFD	[C2 ] = f11, SIZE
6145	nop	__LINE__
6146	}
6147	;;
6148	.align 32
6149
/* .L129: end of the two-column panel.  B takes the value BOFFSET reached
   while reading B, and AOFFSET is reset to the start of A. */
6150.L129:
6151	{ .mmi
6152	mov	B = BOFFSET
6153	mov	AOFFSET = A
6154	nop	__LINE__
6155	}
6156	;;
6157	.align 16
6158
/* .L130: prologue of the single-column panel (bit 0 of N).  Branches to
   .L999 when that bit is clear.  Otherwise resets AOFFSET to A, sets
   C1 = C, clears f64..f67 and computes I = M >> 3, the number of eight-row
   tiles; with I == 0 control goes straight to the remainder code at .L140. */
6159.L130:
6160	{ .mfi
6161	nop	__LINE__
6162	mov	f64  = f0
6163	tbit.z	p6, p0 = N, 0
6164	}
6165	{ .mib
6166	mov	AOFFSET = A
6167	shr	I  = M, 3
6168	(p6)	br.cond.dpnt .L999
6169	}
6170	;;
6171	{ .mfi
6172	mov	C1 = C
6173	mov	f65  = f0
6174	nop	__LINE__
6175	}
6176	;;
6177	{ .mfi
6178	nop	__LINE__
6179	mov	f66  = f0
6180	nop	__LINE__
6181	}
6182	{ .mfb
6183	cmp.eq	p7, p0 = 0, I
6184 	mov	f67  = f0
6185	(p7)	br.cond.dpnt .L140
6186	}
6187	;;
6188	.align 32
6189
/* .L132: per-tile setup for an eight-row by one-column tile.  Loads one B
   value (f48) and eight A values (f32..f39), clears f68..f71, issues a
   CPREFETCH on the C column and derives the K-loop control: p12 set when
   bit 0 of K + 1 is zero (K odd), ar.lc = ((K + 1) >> 1) - 1, p3 initially
   true. */
6190.L132:
6191	{ .mfb
6192	LDFD	f48 = [B]
6193	mov	f68  = f0
6194	nop	__LINE__
6195	}
6196	{ .mfi
6197	adds	BOFFSET = 1 * SIZE, B
6198	mov	f69  = f0
6199	nop	__LINE__
6200	}
6201	;;
6202	{ .mfi
6203	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
6204	mov	f70  = f0
6205	adds	L =  1, K
6206	}
6207	;;
6208	{ .mii
6209	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
6210	tbit.z	p12, p0 = L, 0
6211	shr	L = L, 1
6212	}
6213	;;
6214	{ .mfi
6215	LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
6216	mov	f71  = f0
6217	adds	L =  -1, L
6218	}
6219	;;
6220	{ .mmi
6221	LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
6222	adds	PREC = CPREFETCHSIZE * SIZE, C1
6223	cmp.eq	p3, p0 = r0, r0
6224	}
6225	;;
6226	{ .mmi
6227	CPREFETCH [PREC]
6228	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
6229	mov	ar.lc = L
6230	}
6231	;;
6232	.align 32
6233
/* .L133: K loop for the eight-row by one-column tile.  Unconditional half:
   f32..f39 times f48 into f64..f71; (p3) half: f40..f47 times f56.
   p4 (L != 0) guards next-pass loads, p5 (L == 0) guards the C loads on the
   final pass, and with p12 set p3 becomes L != 0 on each pass. */
6234.L133:
6235	{ .mfi
6236	lfetch.nt1	[PREA],  16 * SIZE
6237	FMA	f64   = f32, f48, f64	// A1 * B1
6238	cmp.ne	p4, p5 =  0, L
6239	}
/* NOTE(review): PREB is recomputed on every pass, but no lfetch in this
   loop uses it - looks like dead work; confirm before removing. */
6240	{ .mfi
6241	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
6242	FMA	f65   = f33, f48, f65	// A2 * B1
6243	(p12) cmp.ne p3, p0 =  0, L
6244	}
6245	;;
/* C9 addresses the second group of four values in column C1. */
6246	{ .mfi
6247	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
6248	FMA	f66   = f34, f48, f66	// A3 * B1
6249	adds	C9  = 4 * SIZE, C1
6250	}
/* Final pass only (p5): the first four values behind C1 (f6, f10, f12,
   f14) and behind C9 (f7, f11, f13, f15); the fourth load of each skips
   ahead by 5 * SIZE to the next group of four. */
6251	{ .mmf
6252	(p3) LDFD	f56 = [BOFFSET],   1 * SIZE
6253	(p5) LDFD	f6  = [C1 ], SIZE
6254	FMA	f67   = f35, f48, f67	// A4 * B1
6255	}
6256	;;
6257	{ .mfb
6258	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
6259	FMA	f68   = f36, f48, f68	// A5 * B1
6260	nop	__LINE__
6261	}
6262	{ .mfb
6263	(p5) LDFD	f7  = [C9 ], SIZE
6264	FMA	f69   = f37, f48, f69	// A6 * B1
6265	nop	__LINE__
6266	}
6267	;;
6268	{ .mfb
6269	(p3) LDFPD	f44, f45 = [AOFFSET], 2 * SIZE
6270	FMA	f70   = f38, f48, f70	// A7 * B1
6271	nop	__LINE__
6272	}
6273	{ .mfb
6274	(p5) LDFD	f10 = [C1 ], SIZE
6275	FMA	f71   = f39, f48, f71	// A8 * B1
6276	nop	__LINE__
6277	}
6278	;;
6279	{ .mfb
6280	(p3) LDFPD	f46, f47 = [AOFFSET], 2 * SIZE
6281	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
6282	nop	__LINE__
6283	}
6284	{ .mfb
6285	(p5) LDFD	f11 = [C9 ], SIZE
6286	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
6287	nop	__LINE__
6288	}
6289	;;
6290	{ .mfb
6291	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
6292	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
6293	nop	__LINE__
6294	}
6295	{ .mmf
6296	(p4) LDFD	f48 = [BOFFSET],  1 * SIZE
6297	(p5) LDFD	f12 = [C1 ], SIZE
6298	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
6299	}
6300	;;
6301	{ .mfb
6302	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
6303	(p3) FMA	f68   = f44, f56, f68	// A5 * B1
6304	nop	__LINE__
6305	}
6306	{ .mfb
6307	(p5) LDFD	f13 = [C9 ], SIZE
6308	(p3) FMA	f69   = f45, f56, f69	// A6 * B1
6309	nop	__LINE__
6310	}
6311	;;
6312	{ .mfi
6313	(p4) LDFPD	f36, f37 = [AOFFSET], 2 * SIZE
6314	(p3) FMA	f70   = f46, f56, f70	// A7 * B1
6315	adds	L = -1, L
6316	}
6317	{ .mfb
6318	(p5) LDFD	f14 = [C1 ], 5 * SIZE
6319	(p3) FMA	f71   = f47, f56, f71	// A8 * B1
6320	nop	__LINE__
6321	}
6322	;;
6323	{ .mfb
6324	(p4) LDFPD	f38, f39 = [AOFFSET], 2 * SIZE
6325	nop	__LINE__
6326	nop	__LINE__
6327	}
6328	{ .mfb
6329	(p5) LDFD	f15 = [C9 ], 5 * SIZE
6330	nop	__LINE__
6331	br.cloop.sptk.few .L133
6332	}
6333	;;
6334
/* .L138: write-back for the eight-row by one-column tile and loop over
   tiles.  The second group of values behind C1/C9 is loaded into f16..f23;
   the last load of each rewinds the pointer by 11 * SIZE to its start.
   Each accumulator f64..f71 updates two consecutive C values, one scaled by
   ALPHA_R and one by ALPHA_I (presumably real and imaginary parts - TODO
   confirm). */
6335.L138:
6336	{ .mmf
6337	LDFD	f16 = [C1 ], SIZE
6338	LDFD	f17 = [C9 ], SIZE
6339	FMA	f6   = ALPHA_R, f64, f6
6340	}
6341	{ .mmf
6342	nop	__LINE__
6343	nop	__LINE__
6344	FMA	f7   = ALPHA_R, f66, f7
6345	}
6346	;;
6347	{ .mmf
6348	LDFD	f18 = [C1 ], SIZE
6349	LDFD	f19 = [C9 ], SIZE
6350	FMA	f10  = ALPHA_I, f64, f10
6351	}
6352	{ .mmf
6353	nop	__LINE__
6354	nop	__LINE__
6355	FMA	f11  = ALPHA_I, f66, f11
6356	}
6357	;;
6358	{ .mmf
6359	LDFD	f20 = [C1 ], SIZE
6360	LDFD	f21 = [C9 ], SIZE
6361	FMA	f12  = ALPHA_R, f65, f12
6362	}
6363	{ .mmf
6364	nop	__LINE__
6365	nop	__LINE__
6366	FMA	f13  = ALPHA_R, f67, f13
6367	}
6368	;;
6369	{ .mmf
6370	LDFD	f22 = [C1 ], - 11 * SIZE
6371	LDFD	f23 = [C9 ], - 11 * SIZE
6372	FMA	f14  = ALPHA_I, f65, f14
6373	}
6374	{ .mmf
6375	nop	__LINE__
6376	nop	__LINE__
6377	FMA	f15  = ALPHA_I, f67, f15
6378	}
6379	;;
/* First group of stores, overlapped with the updates of the second group. */
6380	{ .mmf
6381	STFD	[C1 ] = f6, SIZE
6382	STFD	[C9 ] = f7, SIZE
6383	FMA	f16  = ALPHA_R, f68, f16
6384	}
6385	{ .mmf
6386	nop	__LINE__
6387	nop	__LINE__
6388	FMA	f17  = ALPHA_R, f70, f17
6389	}
6390	;;
6391	{ .mmf
6392	STFD	[C1 ] = f10, SIZE
6393	STFD	[C9 ] = f11, SIZE
6394	FMA	f18  = ALPHA_I, f68, f18
6395	}
6396	{ .mmf
6397	nop	__LINE__
6398	nop	__LINE__
6399	FMA	f19  = ALPHA_I, f70, f19
6400	}
6401	;;
/* p6 = (I != 1) is evaluated in the same instruction group in which I is
   decremented, so the compare reads the pre-decrement value; p6 is set
   while further tiles remain. */
6402	{ .mmf
6403	STFD	[C1 ] = f12, SIZE
6404	STFD	[C9 ] = f13, SIZE
6405	FMA	f20  = ALPHA_R, f69, f20
6406	}
6407	{ .mmf
6408	cmp.ne	p6, p0 = 1, I
6409	adds	I = -1, I
6410	FMA	f21  = ALPHA_R, f71, f21
6411	}
6412	;;
6413	{ .mmf
6414	STFD	[C1 ] = f14, 5 * SIZE
6415	STFD	[C9 ] = f15, 5 * SIZE
6416	FMA	f22  = ALPHA_I, f69, f22
6417	}
6418	{ .mmf
6419	nop	__LINE__
6420	nop	__LINE__
6421	FMA	f23  = ALPHA_I, f71, f23
6422	}
6423	;;
/* Second group of stores, overlapped with clearing f64..f67 for the next
   tile.  C1 ends sixteen elements past its starting position. */
6424	{ .mmf
6425	STFD	[C1 ] = f16, SIZE
6426	STFD	[C9 ] = f17, SIZE
6427	mov	f64  = f0
6428	}
6429	;;
6430	{ .mmf
6431	STFD	[C1 ] = f18, SIZE
6432	STFD	[C9 ] = f19, SIZE
6433	mov	f65  = f0
6434	}
6435	;;
6436	{ .mmf
6437	STFD	[C1 ] = f20, SIZE
6438	STFD	[C9 ] = f21, SIZE
6439	mov	f66  = f0
6440	}
6441	;;
6442	{ .mmf
6443	STFD	[C1 ] = f22, 5 * SIZE
6444	STFD	[C9 ] = f23, 5 * SIZE
6445	mov	f67  = f0
6446	}
/* Loop back to .L132 for the next eight-row tile while p6 is set. */
6447	{ .mmb
6448	nop	__LINE__
6449	nop	__LINE__
6450	(p6)	br.cond.dptk .L132
6451	}
6452	;;
6453	.align 32
6454
/*
 * .L140: remainder step selected by bit 2 of M.
 *
 * If bit 2 of M is clear the whole step is skipped (branch to .L150).
 * Otherwise this block primes the .L142 loop:
 *   - f48        <- first value at B, BOFFSET <- B + 1 element
 *   - f32-f35    <- first four values at AOFFSET (post-incremented)
 *   - L, ar.lc   <- ((K + 1) >> 1) - 1
 *   - p12        <- bit 0 of (K + 1) is zero
 *   - p3         <- true
 *   - PREA       <- AOFFSET + PREFETCHSIZE elements (after the two loads above)
 */
6455.L140:
6456	{ .mib
6457	nop	__LINE__
6458	tbit.z	p6, p7 = M, 2	// p6 = bit 2 of M is 0, p7 = bit 2 of M is 1
6459	(p6)	br.cond.dptk .L150	// nothing to do for this remainder size
6460	}
6461	;;
6462	{ .mmi
6463	LDFD	f48 = [B]
6464	adds	BOFFSET = 1 * SIZE, B
6465	adds	L =  1, K
6466	}
6467	;;
6468	{ .mii
6469	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	// p7 is true on this path (set by the tbit.z above)
6470	tbit.z	p12, p0 = L, 0	// p12 = bit 0 of (K + 1) is zero; tested before the shift in the same group
6471	shr	L = L, 1	// L = (K + 1) >> 1
6472	}
6473	;;
6474	{ .mmi
6475	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
6476	adds	L =  -1, L	// loop counter is trip count minus one
6477	nop	__LINE__
6478	}
6479	;;
6480	{ .mmi
6481	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET	// prefetch pointer used by lfetch in .L142
6482	cmp.eq	p3, p0 = r0, r0	// p3 = true: second half of the unrolled body enabled by default
6483	mov	ar.lc = L
6484	}
6485	;;
6486	.align 32
6487
/*
 * .L142: inner loop for the bit-2 remainder, two multiply-accumulate steps
 * per pass, four accumulators (f64-f67).
 *
 *   first step  (always):    f64-f67 += f32-f35 * f48
 *   second step (under p3):  f64-f67 += f40-f43 * f56
 *
 * Predicates, recomputed at the top of every pass from the counter L:
 *   p4 = (L != 0)  - another pass follows: reload f32-f35 and f48
 *   p5 = (L == 0)  - final pass: set C9 = C1 + 4 elements and start loading
 *                    the destination values f6, f7, f10-f13 through C1 / C9
 *   p3 = (L != 0)  - only rewritten when p12 is set; otherwise it keeps the
 *                    "true" value assigned before the loop, so the second
 *                    step is dropped only on the final pass of the p12 case
 *
 * L is decremented once per pass alongside the br.cloop counter ar.lc, which
 * was initialised from the same value.
 */
6488.L142:
6489	{ .mfi
6490	lfetch.nt1	[PREA],  8 * SIZE
6491	FMA	f64   = f32, f48, f64	// A1 * B1
6492	cmp.ne	p4, p5 =  0, L	// p4 = more passes follow, p5 = this is the final pass
6493	}
6494	{ .mfi
6495	nop	__LINE__
6496	FMA	f65   = f33, f48, f65	// A2 * B1
6497	(p12) cmp.ne p3, p0 =  0, L	// p12 case only: disable the second step on the final pass
6498	}
6499	;;
6500	{ .mfi
6501	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
6502	FMA	f66   = f34, f48, f66	// A3 * B1
6503	(p5) adds	C9  = 4 * SIZE, C1	// second output pointer, 4 elements past C1
6504	}
6505	{ .mmf	// NOTE(review): only two instructions listed for a three-slot template - presumably the assembler pads the middle slot; confirm
6506	(p3) LDFD	f56 = [BOFFSET],   1 * SIZE
6507	FMA	f67   = f35, f48, f67	// A4 * B1
6508	}
6509	;;
6510	{ .mfi
6511	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
6512	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
6513	(p5) adds	C10 = 2 * SIZE, C2	// NOTE(review): C10 is not read in the visible remainder code - confirm it is needed
6514	}
6515	{ .mmf
6516	(p5) LDFD	f6  = [C1 ], SIZE
6517	(p5) LDFD	f7  = [C9 ], SIZE
6518	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
6519	}
6520	;;
6521	{ .mmf
6522	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
6523	(p4) LDFD	f48 = [BOFFSET],   1 * SIZE
6524	(p3) FMA	f66   = f42, f56, f66	// A3 * B1
6525	}
6526	{ .mmf
6527	(p5) LDFD	f10  = [C1 ], SIZE
6528	(p5) LDFD	f11  = [C9 ], SIZE
6529	(p3) FMA	f67   = f43, f56, f67	// A4 * B1
6530	}
6531	;;
6532	{ .mfi
6533	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
6534	nop	__LINE__
6535	adds	L = -1, L	// keep L in step with ar.lc for the predicate tests above
6536	}
6537	{ .mmb
6538	(p5) LDFD	f12  = [C1 ], SIZE
6539	(p5) LDFD	f13  = [C9 ], SIZE
6540	br.cloop.sptk.few .L142	// counted loop on ar.lc
6541	}
6542	;;
6543
/*
 * .L148: write-back for the four accumulators f64-f67 produced by .L142.
 *
 * f6, f7, f10-f13 were loaded through C1 / C9 on the final pass of .L142;
 * the two loads here fetch the last pair (f14, f15) and rewind both pointers
 * by 3 elements so that the stores start where the loads started.
 *
 * Each accumulator yields two outputs, one scaled by ALPHA_R and one by
 * ALPHA_I, each added to the value loaded from C:
 *   through C1: f6, f10 (from f64), f12, f14 (from f65)
 *   through C9: f7, f11 (from f66), f13, f15 (from f67)
 *
 * The fourth store advances each pointer by 5 elements, so C1 finishes
 * 8 elements past its starting position.  f64-f67 are cleared for the next
 * remainder step.
 */
6544.L148:
6545	{ .mmf
6546	LDFD	f14  = [C1 ], - 3 * SIZE	// last load: rewind C1 to the first element of this group
6547	LDFD	f15  = [C9 ], - 3 * SIZE	// last load: rewind C9 to the first element of this group
6548	FMA	f6   = ALPHA_R, f64, f6
6549	}
6550	{ .mmf
6551	nop	__LINE__
6552	nop	__LINE__
6553	FMA	f7   = ALPHA_R, f66, f7
6554	}
6555	;;
6556	{ .mmf
6557	nop	__LINE__
6558	nop	__LINE__
6559	FMA	f10  = ALPHA_I, f64, f10
6560	}
6561	{ .mmf
6562	nop	__LINE__
6563	nop	__LINE__
6564	FMA	f11  = ALPHA_I, f66, f11
6565	}
6566	;;
6567	{ .mmf
6568	nop	__LINE__
6569	nop	__LINE__
6570	FMA	f12  = ALPHA_R, f65, f12
6571	}
6572	{ .mmf
6573	nop	__LINE__
6574	nop	__LINE__
6575	FMA	f13  = ALPHA_R, f67, f13
6576	}
6577	;;
6578	{ .mmf
6579	nop	__LINE__
6580	nop	__LINE__
6581	FMA	f14  = ALPHA_I, f65, f14
6582	}
6583	{ .mmf
6584	nop	__LINE__
6585	nop	__LINE__
6586	FMA	f15  = ALPHA_I, f67, f15
6587	}
6588	;;
6589	{ .mmf
6590	STFD	[C1 ] = f6, SIZE
6591	STFD	[C9 ] = f7, SIZE
6592	mov	f64  = f0	// clear accumulator (f0 reads as 0.0)
6593	}
6594	;;
6595	{ .mmf
6596	STFD	[C1 ] = f10, SIZE
6597	STFD	[C9 ] = f11, SIZE
6598	mov	f65  = f0	// clear accumulator
6599	}
6600	;;
6601	{ .mmf
6602	STFD	[C1 ] = f12, SIZE
6603	STFD	[C9 ] = f13, SIZE
6604	mov	f66  = f0	// clear accumulator
6605	}
6606	;;
6607	{ .mmf
6608	STFD	[C1 ] = f14, 5 * SIZE	// step over the 4 elements written through C9
6609	STFD	[C9 ] = f15, 5 * SIZE
6610	mov	f67  = f0	// clear accumulator
6611	}
6612	;;
6613	.align 32
6614
/*
 * .L150: remainder step selected by bit 1 of M.
 *
 * If bit 1 of M is clear the step is skipped (branch to .L160).  Otherwise
 * this block primes the .L152 loop:
 *   - f48        <- first value at B, BOFFSET <- B + 1 element
 *   - f32, f33   <- first two values at AOFFSET (post-incremented)
 *   - L, ar.lc   <- ((K + 1) >> 1) - 1
 *   - p12        <- bit 0 of (K + 1) is zero
 *   - p3         <- true
 */
6615.L150:
6616	{ .mib
6617	nop	__LINE__
6618	tbit.z	p6, p7 = M, 1	// p6 = bit 1 of M is 0
6619	(p6)	br.cond.dptk .L160	// nothing to do for this remainder size
6620	}
6621	;;
6622	{ .mmi
6623	LDFD	f48 = [B]
6624	adds	BOFFSET = 1 * SIZE, B
6625	adds	L =  1, K
6626	}
6627	;;
6628	{ .mii
6629	cmp.eq	p3, p0 = r0, r0	// p3 = true: second half of the unrolled body enabled by default
6630	tbit.z	p12, p0 = L, 0	// p12 = bit 0 of (K + 1) is zero; tested before the shift in the same group
6631	shr	L = L, 1	// L = (K + 1) >> 1
6632	}
6633	;;
6634	{ .mii
6635	 LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
6636	adds	L =  -1, L	// loop counter is trip count minus one
6637	;;
6638	mov	ar.lc = L	// after the stop above so that it sees the decremented L
6639	}
6640	;;
6641	.align 32
6642
/*
 * .L152: inner loop for the bit-1 remainder, two multiply-accumulate steps
 * per pass, two accumulators (f64, f65).
 *
 *   first step  (always):    f64 += f32 * f48,  f65 += f33 * f48
 *   second step (under p3):  f64 += f40 * f56,  f65 += f41 * f56
 *
 *   p4 = (L != 0): another pass follows, reload f32, f33 and f48
 *   p5 = (L == 0): written by the compare but not used in this loop
 *   p3 = (L != 0): only rewritten when p12 is set; otherwise stays true
 *
 * L is decremented once per pass alongside the br.cloop counter ar.lc.
 */
6643.L152:
6644	{ .mfi
6645	cmp.ne	p4, p5 =  0, L	// p4 = more passes follow
6646	FMA	f64   = f32, f48, f64	// A1 * B1
6647	(p12) cmp.ne p3, p0 =  0, L	// p12 case only: disable the second step on the final pass
6648	}
6649	;;
6650	{ .mmf
6651	(p3) LDFD	f56 = [BOFFSET],   1 * SIZE
6652	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
6653	FMA	f65   = f33, f48, f65	// A2 * B1
6654	}
6655	;;
6656	{ .mfi
6657	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
6658	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
6659	adds	L = -1, L	// keep L in step with ar.lc for the predicate tests above
6660	}
6661	;;
6662	{ .mfb
6663	(p4) LDFD	f48 = [BOFFSET],   1 * SIZE
6664	(p3) FMA	f65   = f41, f56, f65	// A2 * B1
6665	br.cloop.sptk.few .L152	// counted loop on ar.lc
6666	}
6667	;;
6668
/*
 * .L158: write-back for the two accumulators f64, f65 produced by .L152.
 *
 * Loads four consecutive destination values through C1 into f68-f71 (the
 * last load rewinds C1 by 3 elements), then forms
 *   f68 = ALPHA_R * f64 + f68      f69 = ALPHA_I * f64 + f69
 *   f70 = ALPHA_R * f65 + f70      f71 = ALPHA_I * f65 + f71
 * and stores them back in the same order, leaving C1 advanced by
 * 4 elements.  f64 and f65 are cleared for the next remainder step.
 * This block is not written with explicit bundle braces, unlike the
 * neighbouring write-back code.
 */
6669.L158:
6670	LDFD	f68 = [C1 ], 1 * SIZE
6671	;;
6672	LDFD	f69 = [C1 ], 1 * SIZE
6673	;;
6674	LDFD	f70 = [C1 ], 1 * SIZE
6675	;;
6676	LDFD	f71 = [C1 ], - 3 * SIZE	// last load: rewind C1 to the first element
6677	;;
6678	FMA	f68  = ALPHA_R, f64, f68
6679	FMA	f69  = ALPHA_I, f64, f69
6680	FMA	f70  = ALPHA_R, f65, f70
6681	FMA	f71  = ALPHA_I, f65, f71
6682	;;
6683	STFD	[C1 ] = f68, SIZE
6684	;;
6685	STFD	[C1 ] = f69, SIZE
6686	;;
6687	STFD	[C1 ] = f70, SIZE
6688	mov	f64  = f0	// clear accumulator (f0 reads as 0.0)
6689	;;
6690	STFD	[C1 ] = f71, SIZE
6691	mov	f65  = f0	// clear accumulator
6692	;;
6693	.align 32
6694
/*
 * .L160: remainder step selected by bit 0 of M.
 *
 * If bit 0 of M is clear the step is skipped (branch to .L169).  Otherwise
 * this block primes the .L162 loop:
 *   - f48        <- first value at B, BOFFSET <- B + 1 element
 *   - f32        <- first value at AOFFSET (post-incremented)
 *   - L, ar.lc   <- ((K + 1) >> 1) - 1
 *   - p12        <- bit 0 of (K + 1) is zero
 *   - p3         <- true
 */
6695.L160:
6696	{ .mib
6697	nop	__LINE__
6698	tbit.z	p6, p7 = M, 0	// p6 = bit 0 of M is 0
6699	(p6)	br.cond.dptk .L169	// nothing to do for this remainder size
6700	}
6701	;;
6702	{ .mmi
6703	LDFD	f48 = [B]
6704	adds	BOFFSET = 1 * SIZE, B
6705	adds	L =  1, K
6706	}
6707	;;
6708	{ .mii
6709	LDFD f32 = [AOFFSET], 1 * SIZE
6710	tbit.z	p12, p0 = L, 0	// p12 = bit 0 of (K + 1) is zero; tested before the shift in the same group
6711	shr	L = L, 1	// L = (K + 1) >> 1
6712	}
6713	;;
6714	{ .mii
6715	adds	L =  -1, L	// loop counter is trip count minus one
6716	cmp.eq	p3, p0 = r0, r0	// p3 = true: second half of the unrolled body enabled by default
6717	;;
6718	mov	ar.lc = L	// after the stop above so that it sees the decremented L
6719	}
6720	;;
6721	.align 32
6722
/*
 * .L162: inner loop and write-back for the bit-0 remainder, a single
 * accumulator (f64).
 *
 *   first step  (always):    f64 += f32 * f48
 *   second step (under p3):  f64 += f40 * f56
 *
 *   p4 = (L != 0): another pass follows, reload f32 and f48
 *   p5 = (L == 0): final pass, load the two destination values f68, f69
 *                  through C1 (+1 element, then -1, so C1 is unchanged)
 *   p3 = (L != 0): only rewritten when p12 is set; otherwise stays true
 *
 * After the loop:
 *   f68 = ALPHA_R * f64 + f68,  f69 = ALPHA_I * f64 + f69
 * and both are stored through C1, advancing it by 2 elements.
 * f64 is not cleared afterwards.
 */
6723.L162:
6724	{ .mmf
6725	cmp.ne	p4, p5 =  0, L	// p4 = more passes follow, p5 = this is the final pass
6726	(p12) cmp.ne p3, p0 =  0, L	// p12 case only: disable the second step on the final pass
6727	FMA	f64   = f32, f48, f64	// A1 * B1
6728	}
6729	;;
6730	{ .mmi
6731	(p3) LDFD	f56 = [BOFFSET], 1 * SIZE
6732	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE
6733	nop	__LINE__
6734	}
6735	;;
6736	{ .mmi
6737	(p4) LDFD	f32 = [AOFFSET],   1 * SIZE
6738	(p5) LDFD	f68 = [C1], 1 * SIZE
6739	adds	L = -1, L	// keep L in step with ar.lc for the predicate tests above
6740	}
6741	;;
6742	{ .mmf
6743	(p4) LDFD	f48 = [BOFFSET],   1 * SIZE
6744	(p5) LDFD	f69 = [C1], - 1 * SIZE	// step back so the stores below start at the first element
6745	(p3) FMA	f64   = f40, f56, f64	// A1 * B1
6746	}
6747	{ .mib
6748	nop	__LINE__
6749	nop	__LINE__
6750	br.cloop.sptk.few .L162	// counted loop on ar.lc
6751	}
6752	;;
6753	FMA	f68  = ALPHA_R, f64, f68
6754	FMA	f69  = ALPHA_I, f64, f69
6755	;;
6756	STFD	[C1 ] = f68, SIZE
6757	;;
6758	STFD	[C1 ] = f69, SIZE
6759	;;
6760	.align 32
6761
/*
 * .L169: end of the remainder handling.  Copies BOFFSET into B and resets
 * AOFFSET to A, then falls through to the exit code at .L999.
 * NOTE(review): presumably this advances B past the data just consumed and
 * rewinds the A pointer - confirm against the loop structure outside this view.
 */
6762.L169:
6763	{ .mmi
6764	mov	B = BOFFSET
6765	mov	AOFFSET = A
6766	nop	__LINE__
6767	}
6768	;;
6769	.align 16
6770
/*
 * .L999: function exit.
 *
 *   - r8 is set to zero (r8 is the integer return register in the IA-64
 *     software conventions)
 *   - f16-f31 are reloaded with ldf.fill from two interleaved streams:
 *     even-numbered registers through SP, odd-numbered through r9 = SP + 16,
 *     each stream stepping 32 bytes per load; SP finishes 256 bytes higher.
 *     Presumably this mirrors the spill sequence of the prologue, which is
 *     outside this view.
 *   - ar.lc, the predicate registers and ar.pfs are restored from ARLC, PR
 *     and ARPFS, which are defined outside this view
 *   - control returns through b0
 */
6771.L999:
6772	mov	r8 = r0	// return value 0
6773	adds	r9 = 1 * 16, SP	// second restore pointer, one 16-byte slot above SP
6774	;;
6775	ldf.fill  f16 = [SP], 32
6776	ldf.fill  f17 = [r9], 32
6777	;;
6778	ldf.fill  f18 = [SP], 32
6779	ldf.fill  f19 = [r9], 32
6780	;;
6781	ldf.fill  f20 = [SP], 32
6782	ldf.fill  f21 = [r9], 32
6783	;;
6784	ldf.fill  f22 = [SP], 32
6785	ldf.fill  f23 = [r9], 32
6786	mov	 ar.lc = ARLC	// restore the loop counter clobbered by the br.cloop loops
6787	;;
6788	ldf.fill  f24 = [SP], 32
6789	ldf.fill  f25 = [r9], 32
6790	mov pr    = PR, -1	// mask -1: restore all predicate registers
6791	;;
6792	ldf.fill  f26 = [SP], 32
6793	ldf.fill  f27 = [r9], 32
6794	mov	ar.pfs = ARPFS
6795	;;
6796	ldf.fill  f28 = [SP], 32
6797	ldf.fill  f29 = [r9], 32
6798	;;
6799	ldf.fill  f30 = [SP], 32	// final post-increment leaves SP 256 bytes above its value on entry to .L999
6800	ldf.fill  f31 = [r9]
6801	br.ret.sptk.many b0
6802	EPILOGUE
6803
6804