1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Register-role macros for the TRSM kernel (AT&T syntax, scalar SSE2).
   On SysV AMD64 the first six integer args arrive in rdi/rsi/rdx/rcx/r8/r9;
   under WINDOWS_ABI the prologue reloads A/B/C from the stack home area
   (see OLD_A/OLD_B/OLD_C below) before these names are used.             */

#define M	%rdi	/* rows of A / C (outer loop, processed 4/2/1 at a time) */
#define N	%rsi	/* columns of B / C (processed 2 at a time, then 1)      */
#define K	%rdx	/* shared inner dimension (trip count of rank-1 loops)   */

#define A	%rcx	/* base pointer of packed A                              */
#define B	%r8	/* base pointer of packed B                              */
#define C	%r9	/* base pointer of C                                     */
#define LDC	%r10	/* leading dim of C; rescaled to bytes in the prologue   */

#define I	%r11	/* loop counter over M panels                            */
#define AO	%r13	/* current position inside the A panel                   */
#define BO	%r14	/* current position inside the B panel                   */
#define	CO1	%r15	/* pointer to C, column 1 of the current pair            */
#define CO2	%rbx	/* pointer to C, column 2 of the current pair            */
#define KK	%rbp	/* running diagonal offset (LN/LT/RN/RT boundary)        */
#define BB	%r12	/* look-ahead pointer into B used only for prefetching   */

#ifndef WINDOWS_ABI

#define STACKSIZE 128

/* SysV: 7th/8th stack arguments, addressed above our frame.              */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

/* Local spill slots inside the 128-byte frame (slots 0..40 hold the six
   saved callee-saved GPRs pushed in the prologue).                       */
#define OFFSET	 48(%rsp)	/* saved copy of the OFFSET argument       */
#define J	 56(%rsp)	/* outer loop counter over column pairs    */
#define KKK	 64(%rsp)	/* scratch slot                            */
#define AORIG	 72(%rsp)	/* saved A panel origin for LN/RT walks    */

#else

#define STACKSIZE 256	/* larger frame: must also preserve xmm6-xmm15     */

/* Win64: args 5+ live on the stack above the 32-byte shadow space.       */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

/* Local spill slots, placed after the saved GPR/XMM area (0..223).       */
#define OFFSET	 224(%rsp)	/* saved copy of the OFFSET argument       */
#define J	 232(%rsp)	/* outer loop counter over column pairs    */
#define KKK	 240(%rsp)	/* scratch slot                            */
#define AORIG	 248(%rsp)	/* saved A panel origin for LN/RT walks    */

#endif

/* Software prefetch tuning: fetch A about (8*8+3)=67 elements ahead of
   the current read position in the inner loops.                          */
#define PREFETCH     prefetcht0
#define PREFETCHSIZE (8 * 8 + 3)
90
91	PROLOGUE
92	PROFCODE
93
94	subq	$STACKSIZE, %rsp
95	movq	%rbx,  0(%rsp)
96	movq	%rbp,  8(%rsp)
97	movq	%r12, 16(%rsp)
98	movq	%r13, 24(%rsp)
99	movq	%r14, 32(%rsp)
100	movq	%r15, 40(%rsp)
101
102#ifdef WINDOWS_ABI
103	movq	%rdi,    48(%rsp)
104	movq	%rsi,    56(%rsp)
105	movups	%xmm6,   64(%rsp)
106	movups	%xmm7,   80(%rsp)
107	movups	%xmm8,   96(%rsp)
108	movups	%xmm9,  112(%rsp)
109	movups	%xmm10, 128(%rsp)
110	movups	%xmm11, 144(%rsp)
111	movups	%xmm12, 160(%rsp)
112	movups	%xmm13, 176(%rsp)
113	movups	%xmm14, 192(%rsp)
114	movups	%xmm15, 208(%rsp)
115
116	movq	ARG1,      M
117	movq	ARG2,      N
118	movq	ARG3,      K
119	movq	OLD_A,     A
120	movq	OLD_B,     B
121	movq	OLD_C,     C
122#endif
123
124	movq	OLD_LDC,   LDC
125	movq	OLD_OFFSET, KK
126
127	movq	KK, OFFSET
128
129	leaq	(, LDC, SIZE), LDC
130
131#ifdef LN
132       leaq	(, M, SIZE), %rax
133       addq	%rax, C
134       imulq	K, %rax
135       addq	%rax, A
136#endif
137
138#ifdef RT
139       leaq	(, N, SIZE), %rax
140       imulq	K, %rax
141       addq	%rax, B
142       movq	N, %rax
143       imulq	LDC, %rax
144       addq	%rax, C
145#endif
146
147#ifdef RN
148	negq	KK
149#endif
150
151#ifdef RT
152       movq	N, %rax
153       subq	OFFSET, %rax
154       movq	%rax, KK
155#endif
156
157	testq	$1, N
158	je	.L40
159	ALIGN_4
160
161#if defined(LT) || defined(RN)
162	movq	A, AO
163#else
164	movq	A, AORIG
165#endif
166
167#ifdef RT
168       movq	K, %rax
169       salq	$0 + BASE_SHIFT, %rax
170       subq	%rax, B
171
172       subq	LDC, C
173#endif
174
175	movq	C, CO1
176#ifndef RT
177	addq	LDC, C
178#endif
179
180#ifdef LN
181	movq	OFFSET, %rax
182	addq	M, %rax
183	movq	%rax, KK
184#endif
185
186#ifdef LT
187	movq	OFFSET, %rax
188	movq	%rax, KK
189#endif
190
191	movq	M,  I
192	sarq	$2, I
193	jle	.L50
194	ALIGN_4
195
196.L41:
197#ifdef LN
198       movq	K, %rax
199       salq	$2 + BASE_SHIFT, %rax
200       subq	%rax, AORIG
201#endif
202
203#if defined(LN) || defined(RT)
204	movq	KK, %rax
205	leaq	(, %rax, SIZE), %rax
206	movq	AORIG, AO
207	leaq	(AO, %rax, 4), AO
208	leaq	(B,  %rax, 1), BO
209#else
210	movq	B, BO
211#endif
212
213	movsd	 0 * SIZE(AO), %xmm0
214	xorps	%xmm9,   %xmm9
215	movsd	 1 * SIZE(AO), %xmm1
216	xorps	%xmm11,  %xmm11
217	movsd	 2 * SIZE(AO), %xmm2
218	xorps	%xmm13,  %xmm13
219	movsd	 3 * SIZE(AO), %xmm3
220	xorps	%xmm15,  %xmm15
221
222	movsd	 0 * SIZE(BO), %xmm4
223	xorps	%xmm8,  %xmm8
224	movsd	 1 * SIZE(BO), %xmm5
225	xorps	%xmm10, %xmm10
226	prefetcht0     3 * SIZE(CO1)
227	xorps	%xmm12, %xmm12
228	xorps	%xmm14, %xmm14
229
230#if defined(LT) || defined(RN)
231	movq	KK, %rax
232#else
233	movq	K, %rax
234	subq	KK, %rax
235#endif
236	sarq	$2, %rax
237	je	.L45
238	ALIGN_4
239
240.L42:
241	addsd	 %xmm9,  %xmm8
242	movsd	 4 * SIZE(AO), %xmm9
243	mulsd	 %xmm4, %xmm0
244	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
245
246	addsd	 %xmm11, %xmm10
247	movsd	 5 * SIZE(AO), %xmm11
248	mulsd	 %xmm4, %xmm1
249
250	addsd	 %xmm13, %xmm12
251	movsd	 6 * SIZE(AO), %xmm13
252	mulsd	 %xmm4, %xmm2
253
254	addsd	 %xmm15, %xmm14
255	movsd	 7 * SIZE(AO), %xmm15
256	mulsd	 %xmm4, %xmm3
257	movsd	 2 * SIZE(BO), %xmm4
258
259	addsd	 %xmm0, %xmm8
260	movsd	 8 * SIZE(AO), %xmm0
261	mulsd	 %xmm5, %xmm9
262
263	addsd	 %xmm1, %xmm10
264	movsd	 9 * SIZE(AO), %xmm1
265	mulsd	 %xmm5, %xmm11
266
267	addsd	 %xmm2, %xmm12
268	movsd	10 * SIZE(AO), %xmm2
269	mulsd	 %xmm5, %xmm13
270
271	addsd	 %xmm3, %xmm14
272	movsd	11 * SIZE(AO), %xmm3
273	mulsd	 %xmm5, %xmm15
274	movsd	 3 * SIZE(BO), %xmm5
275
276	addsd	 %xmm9,  %xmm8
277	movsd	12 * SIZE(AO), %xmm9
278	mulsd	 %xmm4, %xmm0
279	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
280
281	addsd	 %xmm11, %xmm10
282	movsd	13 * SIZE(AO), %xmm11
283	mulsd	 %xmm4, %xmm1
284
285	addsd	 %xmm13, %xmm12
286	movsd	14 * SIZE(AO), %xmm13
287	mulsd	 %xmm4, %xmm2
288
289	addsd	 %xmm15, %xmm14
290	movsd	15 * SIZE(AO), %xmm15
291	mulsd	 %xmm4, %xmm3
292	movsd	 4 * SIZE(BO), %xmm4
293	subq	$-16 * SIZE, AO
294
295	addsd	 %xmm0, %xmm8
296	movsd	 0 * SIZE(AO), %xmm0
297	mulsd	 %xmm5, %xmm9
298
299	addsd	 %xmm1, %xmm10
300	movsd	 1 * SIZE(AO), %xmm1
301	mulsd	 %xmm5, %xmm11
302	addq	$  4 * SIZE, BO
303
304	addsd	 %xmm2, %xmm12
305	movsd	 2 * SIZE(AO), %xmm2
306	mulsd	 %xmm5, %xmm13
307	decq	%rax
308
309	addsd	 %xmm3, %xmm14
310	movsd	 3 * SIZE(AO), %xmm3
311	mulsd	 %xmm5, %xmm15
312	movsd	 1 * SIZE(BO), %xmm5
313
314	jne    .L42
315	ALIGN_4
316
317.L45:
318#if defined(LT) || defined(RN)
319	movq	KK, %rax
320#else
321	movq	K, %rax
322	subq	KK, %rax
323#endif
324
325	addsd	 %xmm9,  %xmm8
326	addsd	 %xmm11, %xmm10
327	addsd	 %xmm13, %xmm12
328	addsd	 %xmm15, %xmm14
329
330	andq	$3, %rax
331	BRANCH
332	BRANCH
333	je	.L49
334	ALIGN_4
335
336.L46:
337	mulsd	 %xmm4, %xmm0
338	mulsd	 %xmm4, %xmm1
339	mulsd	 %xmm4, %xmm2
340	mulsd	 %xmm4, %xmm3
341	movsd	 1 * SIZE(BO), %xmm4
342
343	addsd	 %xmm0, %xmm8
344	movsd	 4 * SIZE(AO), %xmm0
345	addsd	 %xmm1, %xmm10
346	movsd	 5 * SIZE(AO), %xmm1
347	addsd	 %xmm2, %xmm12
348	movsd	 6 * SIZE(AO), %xmm2
349	addsd	 %xmm3, %xmm14
350	movsd	 7 * SIZE(AO), %xmm3
351
352	addq	$4 * SIZE, AO
353	addq	$1 * SIZE, BO
354	decq	%rax
355	BRANCH
356	jg	.L46
357	ALIGN_4
358
359.L49:
360#if defined(LN) || defined(RT)
361	movq	KK, %rax
362#ifdef LN
363	subq	$4, %rax
364#else
365	subq	$1, %rax
366#endif
367	leaq	(, %rax, SIZE), %rax
368
369	movq	AORIG, AO
370	leaq	(AO, %rax, 4), AO
371	leaq	(B,  %rax, 1), BO
372#endif
373
374#if defined(LN) || defined(LT)
375	movsd	 0 * SIZE(BO), %xmm0
376	movsd	 1 * SIZE(BO), %xmm2
377	movsd	 2 * SIZE(BO), %xmm4
378	movsd	 3 * SIZE(BO), %xmm6
379
380	subsd	%xmm8,  %xmm0
381	subsd	%xmm10, %xmm2
382	subsd	%xmm12, %xmm4
383	subsd	%xmm14, %xmm6
384#else
385	movsd	 0 * SIZE(AO), %xmm0
386	movsd	 1 * SIZE(AO), %xmm2
387	movsd	 2 * SIZE(AO), %xmm4
388	movsd	 3 * SIZE(AO), %xmm6
389
390	subsd	%xmm8,  %xmm0
391	subsd	%xmm10, %xmm2
392	subsd	%xmm12, %xmm4
393	subsd	%xmm14, %xmm6
394#endif
395
396#ifdef LN
397	movsd	15 * SIZE(AO), %xmm8
398	mulsd	 %xmm8, %xmm6
399	movsd	14 * SIZE(AO), %xmm9
400	mulsd	%xmm6,  %xmm9
401	movsd	13 * SIZE(AO), %xmm11
402	subsd	%xmm9,  %xmm4
403	movsd	12 * SIZE(AO), %xmm13
404	mulsd	%xmm6,  %xmm11
405	movsd	10 * SIZE(AO), %xmm8
406	subsd	%xmm11, %xmm2
407	movsd	 9 * SIZE(AO), %xmm9
408	mulsd	%xmm6,  %xmm13
409	movsd	 8 * SIZE(AO), %xmm11
410	subsd	%xmm13, %xmm0
411
412	mulsd	 %xmm8, %xmm4
413	movsd	 5 * SIZE(AO), %xmm8
414	mulsd	%xmm4,  %xmm9
415	subsd	%xmm9,  %xmm2
416	movsd	 4 * SIZE(AO), %xmm9
417	mulsd	%xmm4,  %xmm11
418	subsd	%xmm11, %xmm0
419	movsd	 0 * SIZE(AO), %xmm11
420	mulsd	 %xmm8, %xmm2
421	mulsd	%xmm2,  %xmm9
422	subsd	%xmm9,  %xmm0
423	mulsd	 %xmm11, %xmm0
424#endif
425
426#ifdef LT
427	movsd	 0 * SIZE(AO), %xmm8
428	mulsd	 %xmm8, %xmm0
429	movsd	 1 * SIZE(AO), %xmm9
430	mulsd	%xmm0,  %xmm9
431	movsd	 2 * SIZE(AO), %xmm11
432	subsd	%xmm9,  %xmm2
433	movsd	 3 * SIZE(AO), %xmm13
434	mulsd	%xmm0,  %xmm11
435	movsd	 5 * SIZE(AO), %xmm8
436	subsd	%xmm11, %xmm4
437	movsd	 6 * SIZE(AO), %xmm9
438	mulsd	%xmm0,  %xmm13
439	movsd	 7 * SIZE(AO), %xmm11
440	subsd	%xmm13, %xmm6
441
442	mulsd	 %xmm8, %xmm2
443	movsd	10 * SIZE(AO), %xmm8
444	mulsd	%xmm2,  %xmm9
445	subsd	%xmm9,  %xmm4
446	movsd	11 * SIZE(AO), %xmm9
447	mulsd	%xmm2,  %xmm11
448	subsd	%xmm11, %xmm6
449	mulsd	 %xmm8, %xmm4
450	movsd	15 * SIZE(AO), %xmm8
451	mulsd	%xmm4,  %xmm9
452	subsd	%xmm9,  %xmm6
453	mulsd	%xmm8,  %xmm6
454#endif
455
456#if defined(RN) || defined(RT)
457	movsd	 0 * SIZE(BO), %xmm8
458	mulsd	 %xmm8, %xmm0
459	mulsd	 %xmm8, %xmm2
460	mulsd	 %xmm8, %xmm4
461	mulsd	 %xmm8, %xmm6
462#endif
463
464#ifdef LN
465	subq	$4 * SIZE, CO1
466#endif
467
468	movsd	%xmm0,  0 * SIZE(CO1)
469	movsd	%xmm2,  1 * SIZE(CO1)
470	movsd	%xmm4,  2 * SIZE(CO1)
471	movsd	%xmm6,  3 * SIZE(CO1)
472
473#if defined(LN) || defined(LT)
474	movsd	%xmm0,   0 * SIZE(BO)
475	movsd	%xmm2,   1 * SIZE(BO)
476	movsd	%xmm4,   2 * SIZE(BO)
477	movsd	%xmm6,   3 * SIZE(BO)
478#else
479	movsd	%xmm0,   0 * SIZE(AO)
480	movsd	%xmm2,   1 * SIZE(AO)
481	movsd	%xmm4,   2 * SIZE(AO)
482	movsd	%xmm6,   3 * SIZE(AO)
483#endif
484
485#ifndef LN
486	addq	$4 * SIZE, CO1
487#endif
488
489#if defined(LT) || defined(RN)
490	movq	K,  %rax
491	subq	KK, %rax
492	leaq	(,%rax, SIZE), %rax
493	leaq	(AO, %rax, 4), AO
494	leaq	(BO, %rax, 1), BO
495#endif
496
497#ifdef LN
498	subq	$4, KK
499#endif
500
501#ifdef LT
502	addq	$4, KK
503#endif
504
505#ifdef RT
506       movq	K, %rax
507       salq	$2 + BASE_SHIFT, %rax
508       addq	%rax, AORIG
509#endif
510
511	decq	I			# i --
512	jg	.L41
513	ALIGN_4
514
515.L50:
516	testq	$2, M
517	je	.L60
518
519#ifdef LN
520       movq	K, %rax
521       salq	$1 + BASE_SHIFT, %rax
522       subq	%rax, AORIG
523#endif
524
525#if defined(LN) || defined(RT)
526	movq	KK, %rax
527	leaq	(, %rax, SIZE), %rax
528	movq	AORIG, AO
529	leaq	(AO, %rax, 2), AO
530	leaq	(B,  %rax, 1), BO
531#else
532	movq	B, BO
533#endif
534
535	movsd	 0 * SIZE(AO), %xmm0
536	xorps	%xmm2,   %xmm2
537	movsd	 1 * SIZE(AO), %xmm1
538	xorps	%xmm3,   %xmm3
539
540	movsd	 0 * SIZE(BO), %xmm4
541	xorps	%xmm8,  %xmm8
542	movsd	 1 * SIZE(BO), %xmm5
543	xorps	%xmm10, %xmm10
544
545#if defined(LT) || defined(RN)
546	movq	KK, %rax
547#else
548	movq	K, %rax
549	subq	KK, %rax
550#endif
551	sarq	$2, %rax
552	je	.L55
553	ALIGN_4
554
555.L52:
556	addsd	 %xmm2, %xmm8
557	movsd	 2 * SIZE(AO), %xmm2
558	mulsd	 %xmm4, %xmm0
559	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
560
561	addsd	 %xmm3, %xmm10
562	movsd	 3 * SIZE(AO), %xmm3
563	mulsd	 %xmm4, %xmm1
564	movsd	 2 * SIZE(BO), %xmm4
565
566	addsd	 %xmm0, %xmm8
567	movsd	 4 * SIZE(AO), %xmm0
568	mulsd	 %xmm5, %xmm2
569	addq	$8 * SIZE, AO
570
571	addsd	 %xmm1, %xmm10
572	movsd	-3 * SIZE(AO), %xmm1
573	mulsd	 %xmm5, %xmm3
574	movsd	 3 * SIZE(BO), %xmm5
575
576	addsd	 %xmm2, %xmm8
577	movsd	-2 * SIZE(AO), %xmm2
578	mulsd	 %xmm4, %xmm0
579	addq	$4 * SIZE, BO
580
581	addsd	 %xmm3, %xmm10
582	movsd	-1 * SIZE(AO), %xmm3
583	mulsd	 %xmm4, %xmm1
584	movsd	 0 * SIZE(BO), %xmm4
585
586	addsd	 %xmm0, %xmm8
587	movsd	 0 * SIZE(AO), %xmm0
588	mulsd	 %xmm5, %xmm2
589	decq	%rax
590
591	addsd	 %xmm1, %xmm10
592	movsd	 1 * SIZE(AO), %xmm1
593	mulsd	 %xmm5, %xmm3
594	movsd	 1 * SIZE(BO), %xmm5
595
596	jne    .L52
597	ALIGN_4
598
599.L55:
600#if defined(LT) || defined(RN)
601	movq	KK, %rax
602#else
603	movq	K, %rax
604	subq	KK, %rax
605#endif
606	addsd	 %xmm2, %xmm8
607	addsd	 %xmm3, %xmm10
608
609	andq	$3, %rax
610	BRANCH
611	je .L59
612	ALIGN_4
613
614.L56:
615	mulsd	 %xmm4, %xmm0
616	mulsd	 %xmm4, %xmm1
617	movsd	 1 * SIZE(BO), %xmm4
618
619	addsd	 %xmm0, %xmm8
620	movsd	 2 * SIZE(AO), %xmm0
621	addsd	 %xmm1, %xmm10
622	movsd	 3 * SIZE(AO), %xmm1
623
624	addq	$2 * SIZE, AO
625	addq	$1 * SIZE, BO
626	decq	%rax
627	BRANCH
628	jg	.L56
629	ALIGN_4
630
631.L59:
632#if defined(LN) || defined(RT)
633	movq	KK, %rax
634#ifdef LN
635	subq	$2, %rax
636#else
637	subq	$1, %rax
638#endif
639	leaq	(, %rax, SIZE), %rax
640
641	movq	AORIG, AO
642	leaq	(AO, %rax, 2), AO
643	leaq	(B,  %rax, 1), BO
644#endif
645
646#if defined(LN) || defined(LT)
647	movsd	 0 * SIZE(BO), %xmm0
648	movsd	 1 * SIZE(BO), %xmm2
649
650	subsd	%xmm8,  %xmm0
651	subsd	%xmm10, %xmm2
652#else
653	movsd	 0 * SIZE(AO), %xmm0
654	movsd	 1 * SIZE(AO), %xmm2
655
656	subsd	%xmm8,  %xmm0
657	subsd	%xmm10, %xmm2
658#endif
659
660#ifdef LN
661	movsd	 3 * SIZE(AO), %xmm8
662	movsd	 2 * SIZE(AO), %xmm9
663	movsd	 0 * SIZE(AO), %xmm11
664	mulsd	%xmm8, %xmm2
665	mulsd	%xmm2, %xmm9
666	subsd	%xmm9, %xmm0
667	mulsd	%xmm11,%xmm0
668#endif
669
670#ifdef LT
671	movsd	 0 * SIZE(AO), %xmm8
672	movsd	 1 * SIZE(AO), %xmm9
673	movsd	 3 * SIZE(AO), %xmm11
674	mulsd	%xmm8, %xmm0
675	mulsd	%xmm0, %xmm9
676	subsd	%xmm9, %xmm2
677	mulsd	%xmm11,%xmm2
678#endif
679
680#if defined(RN) || defined(RT)
681	movsd	 0 * SIZE(BO), %xmm8
682	mulsd	 %xmm8, %xmm0
683	mulsd	 %xmm8, %xmm2
684#endif
685
686#ifdef LN
687	subq	$2 * SIZE, CO1
688#endif
689
690	movsd	%xmm0,  0 * SIZE(CO1)
691	movsd	%xmm2,  1 * SIZE(CO1)
692
693#if defined(LN) || defined(LT)
694	movsd	%xmm0,   0 * SIZE(BO)
695	movsd	%xmm2,   1 * SIZE(BO)
696#else
697	movsd	%xmm0,   0 * SIZE(AO)
698	movsd	%xmm2,   1 * SIZE(AO)
699#endif
700
701#ifndef LN
702	addq	$2 * SIZE, CO1
703#endif
704
705#if defined(LT) || defined(RN)
706	movq	K,  %rax
707	subq	KK, %rax
708	leaq	(,%rax, SIZE), %rax
709	leaq	(AO, %rax, 2), AO
710	leaq	(BO, %rax, 1), BO
711#endif
712
713#ifdef LN
714	subq	$2, KK
715#endif
716
717#ifdef LT
718	addq	$2, KK
719#endif
720
721#ifdef RT
722       movq	K, %rax
723       salq	$1 + BASE_SHIFT, %rax
724       addq	%rax, AORIG
725#endif
726	ALIGN_4
727
728.L60:
729	testq	$1, M
730	je	.L69
731
732#ifdef LN
733       movq	K, %rax
734       salq	$0 + BASE_SHIFT, %rax
735       subq	%rax, AORIG
736#endif
737
738#if defined(LN) || defined(RT)
739	movq	KK, %rax
740	leaq	(, %rax, SIZE), %rax
741	movq	AORIG, AO
742	leaq	(AO, %rax, 1), AO
743	leaq	(B,  %rax, 1), BO
744#else
745	movq	B, BO
746#endif
747
748	movsd	 0 * SIZE(AO), %xmm0
749	xorps	%xmm5,  %xmm5
750	movsd	 1 * SIZE(AO), %xmm2
751	xorps	%xmm7,  %xmm7
752
753	movsd	 0 * SIZE(BO), %xmm1
754	xorps	%xmm8,  %xmm8
755	movsd	 1 * SIZE(BO), %xmm3
756	xorps	%xmm9,  %xmm9
757	movsd	 2 * SIZE(AO), %xmm4
758	movsd	 3 * SIZE(AO), %xmm6
759
760#if defined(LT) || defined(RN)
761	movq	KK, %rax
762#else
763	movq	K, %rax
764	subq	KK, %rax
765#endif
766	sarq	$2, %rax
767	je	.L65
768	ALIGN_4
769
770.L62:
771	addsd	 %xmm5, %xmm8
772	movsd	 2 * SIZE(BO), %xmm5
773	mulsd	 %xmm0, %xmm1
774	movsd	 4 * SIZE(AO), %xmm0
775
776	addsd	 %xmm7, %xmm9
777	movsd	 3 * SIZE(BO), %xmm7
778	mulsd	 %xmm2, %xmm3
779	movsd	 5 * SIZE(AO), %xmm2
780
781	addsd	 %xmm1, %xmm8
782	movsd	 4 * SIZE(BO), %xmm1
783	mulsd	 %xmm4, %xmm5
784	movsd	 6 * SIZE(AO), %xmm4
785
786	addsd	 %xmm3, %xmm9
787	movsd	 5 * SIZE(BO), %xmm3
788	mulsd	 %xmm6, %xmm7
789	movsd	 7 * SIZE(AO), %xmm6
790
791	addq	$4 * SIZE, AO
792	addq	$4 * SIZE, BO
793
794	decq	%rax
795	jne    .L62
796
797	addsd	 %xmm5, %xmm8
798	addsd	 %xmm7, %xmm9
799	ALIGN_4
800
801.L65:
802#if defined(LT) || defined(RN)
803	movq	KK, %rax
804#else
805	movq	K, %rax
806	subq	KK, %rax
807#endif
808	andq	$3, %rax
809	BRANCH
810	je .L68
811	ALIGN_4
812
813.L66:
814	movsd	 0 * SIZE(AO), %xmm0
815	movsd	 0 * SIZE(BO), %xmm1
816
817	mulsd	 %xmm0, %xmm1
818	addsd	 %xmm1, %xmm8
819
820	addq	$1 * SIZE, AO
821	addq	$1 * SIZE, BO
822
823	decq	%rax
824	BRANCH
825	jg	.L66
826	ALIGN_4
827
828.L68:
829	addsd	%xmm9, %xmm8
830
831#if defined(LN) || defined(RT)
832	movq	KK, %rax
833#ifdef LN
834	subq	$1, %rax
835#else
836	subq	$1, %rax
837#endif
838	leaq	(, %rax, SIZE), %rax
839
840	movq	AORIG, AO
841	leaq	(AO, %rax, 1), AO
842	leaq	(B,  %rax, 1), BO
843#endif
844
845#if defined(LN) || defined(LT)
846	movsd	 0 * SIZE(BO), %xmm0
847	subsd	%xmm8,  %xmm0
848#else
849	movsd	 0 * SIZE(AO), %xmm0
850	subsd	%xmm8,  %xmm0
851#endif
852
853#if defined(LN) || defined(LT)
854	movsd	 0 * SIZE(AO), %xmm8
855	mulsd	 %xmm8, %xmm0
856#endif
857
858#if defined(RN) || defined(RT)
859	movsd	 0 * SIZE(BO), %xmm8
860	mulsd	 %xmm8, %xmm0
861#endif
862
863#ifdef LN
864	subq	$1 * SIZE, CO1
865#endif
866
867	movsd	%xmm0,  0 * SIZE(CO1)
868
869#if defined(LN) || defined(LT)
870	movsd	%xmm0,   0 * SIZE(BO)
871#else
872	movsd	%xmm0,   0 * SIZE(AO)
873#endif
874
875#ifndef LN
876	addq	$1 * SIZE, CO1
877#endif
878
879#if defined(LT) || defined(RN)
880	movq	K,  %rax
881	subq	KK, %rax
882	leaq	(,%rax, SIZE), %rax
883	leaq	(AO, %rax, 1), AO
884	leaq	(BO, %rax, 1), BO
885#endif
886
887#ifdef LN
888	subq	$1, KK
889#endif
890
891#ifdef LT
892	addq	$1, KK
893#endif
894
895#ifdef RT
896       movq	K, %rax
897       salq	$0 + BASE_SHIFT, %rax
898       addq	%rax, AORIG
899#endif
900	ALIGN_4
901
902.L69:
903#ifdef LN
904       leaq	(, K, SIZE), %rax
905       leaq	(B, %rax, 1), B
906#endif
907
908#if defined(LT) || defined(RN)
909	movq	BO, B
910#endif
911
912#ifdef RN
913	addq	$1, KK
914#endif
915
916#ifdef RT
917	subq	$1, KK
918#endif
919	ALIGN_2
920
921.L40:
922	movq	N,  J
923	sarq	$1, J
924	jle	.L999
925	ALIGN_4
926
927.L10:
928#if defined(LT) || defined(RN)
929	movq	A, AO
930#else
931	movq	A, AORIG
932#endif
933
934#ifdef RT
935       movq	K, %rax
936       salq	$1 + BASE_SHIFT, %rax
937       subq	%rax, B
938
939       leaq	(, LDC, 2), %rax
940       subq	%rax, C
941#endif
942
943	movq	C, CO1
944	leaq	(C, LDC, 1), CO2
945#ifndef RT
946	leaq	(C, LDC, 2), C
947#endif
948
949#ifdef LN
950	movq	OFFSET, %rax
951	addq	M, %rax
952	movq	%rax, KK
953#endif
954
955	movq	K, %rax
956	salq	$BASE_SHIFT + 1, %rax
957	leaq	(B, %rax), BB
958
959#ifdef LT
960	movq	OFFSET, %rax
961	movq	%rax, KK
962#endif
963
964	movq	M,  I
965	sarq	$2, I
966	jle	.L20
967	ALIGN_4
968
969.L11:
970#ifdef LN
971       movq	K, %rax
972       salq	$2 + BASE_SHIFT, %rax
973       subq	%rax, AORIG
974#endif
975
976#if defined(LN) || defined(RT)
977	movq	KK, %rax
978	leaq	(, %rax, SIZE), %rax
979	movq	AORIG, AO
980	leaq	(AO, %rax, 4), AO
981	leaq	(B,  %rax, 2), BO
982#else
983	movq	B, BO
984#endif
985
986	prefetcht0	  0 * SIZE(BB)
987	subq	   $-8 * SIZE, BB
988
989	movsd	 0 * SIZE(AO), %xmm0
990	xorps	%xmm2,  %xmm2
991	movsd	 1 * SIZE(AO), %xmm4
992	xorps	%xmm5,  %xmm5
993	movsd	 2 * SIZE(AO), %xmm5
994	xorps	%xmm6,  %xmm6
995	xorps	%xmm7,  %xmm7
996
997	movsd	 0 * SIZE(BO), %xmm1
998	xorps	%xmm8,  %xmm8
999	xorps	%xmm9,  %xmm9
1000	movsd	 1 * SIZE(BO), %xmm3
1001	xorps	%xmm10, %xmm10
1002	xorps	%xmm11, %xmm11
1003
1004	prefetcht0     3 * SIZE(CO1)
1005	xorps	%xmm12, %xmm12
1006	xorps	%xmm13, %xmm13
1007	prefetcht0     3 * SIZE(CO2)
1008	xorps	%xmm14, %xmm14
1009	xorps	%xmm15, %xmm15
1010
1011#if defined(LT) || defined(RN)
1012	movq	KK, %rax
1013#else
1014	movq	K, %rax
1015	subq	KK, %rax
1016#endif
1017	sarq	$2, %rax
1018	je	.L15
1019	ALIGN_4
1020
1021.L12:
1022	addsd	 %xmm2, %xmm13
1023	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
1024	movaps	 %xmm0, %xmm2
1025	mulsd	 %xmm1, %xmm0
1026
1027	addsd	 %xmm7, %xmm14
1028	movsd	 3 * SIZE(AO), %xmm7
1029	mulsd	 %xmm3, %xmm2
1030
1031	addsd	 %xmm6, %xmm15
1032	PREFETCH (PREFETCHSIZE + 0) * SIZE(BO)
1033	movaps	 %xmm4, %xmm6
1034	mulsd	 %xmm1, %xmm4
1035
1036	addsd	 %xmm0, %xmm8
1037	movsd	 4 * SIZE(AO), %xmm0
1038	mulsd	 %xmm3, %xmm6
1039
1040	addsd	 %xmm2, %xmm9
1041	movaps	 %xmm5, %xmm2
1042	mulsd	 %xmm1, %xmm5
1043
1044	addsd	 %xmm4, %xmm10
1045	movsd	 5 * SIZE(AO), %xmm4
1046	mulsd	 %xmm3, %xmm2
1047
1048	addsd	 %xmm6, %xmm11
1049	movaps	 %xmm7, %xmm6
1050	mulsd	 %xmm1, %xmm7
1051	movsd	 2 * SIZE(BO), %xmm1
1052
1053	addsd	 %xmm5, %xmm12
1054	movsd	 6 * SIZE(AO), %xmm5
1055	mulsd	 %xmm3, %xmm6
1056	movsd	 3 * SIZE(BO), %xmm3
1057
1058	addsd	 %xmm2, %xmm13
1059	movaps	 %xmm0, %xmm2
1060	mulsd	 %xmm1, %xmm0
1061
1062	addsd	 %xmm7, %xmm14
1063	movsd	 7 * SIZE(AO), %xmm7
1064	mulsd	 %xmm3, %xmm2
1065
1066	addsd	 %xmm6, %xmm15
1067	movaps	 %xmm4, %xmm6
1068	mulsd	 %xmm1, %xmm4
1069
1070	addsd	 %xmm0, %xmm8
1071	movsd	 8 * SIZE(AO), %xmm0
1072	mulsd	 %xmm3, %xmm6
1073
1074	addsd	 %xmm2, %xmm9
1075	movaps	 %xmm5, %xmm2
1076	mulsd	 %xmm1, %xmm5
1077
1078	addsd	 %xmm4, %xmm10
1079	movsd	 9 * SIZE(AO), %xmm4
1080	mulsd	 %xmm3, %xmm2
1081
1082	addsd	 %xmm6, %xmm11
1083	movaps	 %xmm7, %xmm6
1084	mulsd	 %xmm1, %xmm7
1085	movsd	 4 * SIZE(BO), %xmm1
1086
1087	addsd	 %xmm5, %xmm12
1088	movsd	10 * SIZE(AO), %xmm5
1089	mulsd	 %xmm3, %xmm6
1090	movsd	 5 * SIZE(BO), %xmm3
1091
1092	addsd	 %xmm2, %xmm13
1093	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
1094	movaps	 %xmm0, %xmm2
1095	mulsd	 %xmm1, %xmm0
1096
1097	addsd	 %xmm7, %xmm14
1098	movsd	11 * SIZE(AO), %xmm7
1099	mulsd	 %xmm3, %xmm2
1100
1101	addsd	 %xmm6, %xmm15
1102	movaps	 %xmm4, %xmm6
1103	mulsd	 %xmm1, %xmm4
1104
1105	addsd	 %xmm0, %xmm8
1106	movsd	12 * SIZE(AO), %xmm0
1107	mulsd	 %xmm3, %xmm6
1108
1109	addsd	 %xmm2, %xmm9
1110	movaps	 %xmm5, %xmm2
1111	mulsd	 %xmm1, %xmm5
1112
1113	addsd	 %xmm4, %xmm10
1114	movsd	13 * SIZE(AO), %xmm4
1115	mulsd	 %xmm3, %xmm2
1116
1117	addsd	 %xmm6, %xmm11
1118	movaps	 %xmm7, %xmm6
1119	mulsd	 %xmm1, %xmm7
1120	movsd	 6 * SIZE(BO), %xmm1
1121
1122	addsd	 %xmm5, %xmm12
1123	movsd	14 * SIZE(AO), %xmm5
1124	mulsd	 %xmm3, %xmm6
1125	movsd	 7 * SIZE(BO), %xmm3
1126
1127	addsd	 %xmm2, %xmm13
1128	movaps	 %xmm0, %xmm2
1129	mulsd	 %xmm1, %xmm0
1130
1131	addsd	 %xmm7, %xmm14
1132	movsd	15 * SIZE(AO), %xmm7
1133	mulsd	 %xmm3, %xmm2
1134	subq   $-16 * SIZE, AO
1135
1136	addsd	 %xmm6, %xmm15
1137	movaps	 %xmm4, %xmm6
1138	mulsd	 %xmm1, %xmm4
1139
1140	addsd	 %xmm0, %xmm8
1141	movsd	 0 * SIZE(AO), %xmm0
1142	mulsd	 %xmm3, %xmm6
1143
1144	addsd	 %xmm2, %xmm9
1145	movaps	 %xmm5, %xmm2
1146	mulsd	 %xmm1, %xmm5
1147	addq   $  8 * SIZE, BO
1148
1149	addsd	 %xmm4, %xmm10
1150	movsd	 1 * SIZE(AO), %xmm4
1151	mulsd	 %xmm3, %xmm2
1152	decq   %rax
1153
1154	addsd	 %xmm6, %xmm11
1155	movaps	 %xmm7, %xmm6
1156	mulsd	 %xmm1, %xmm7
1157	movsd	 0 * SIZE(BO), %xmm1
1158
1159	addsd	 %xmm5, %xmm12
1160	movsd	 2 * SIZE(AO), %xmm5
1161	mulsd	 %xmm3, %xmm6
1162	movsd	 1 * SIZE(BO), %xmm3
1163
1164	jne    .L12
1165	ALIGN_4
1166
1167.L15:
1168#if defined(LT) || defined(RN)
1169	movq	KK, %rax
1170#else
1171	movq	K, %rax
1172	subq	KK, %rax
1173#endif
1174	andq	$3, %rax
1175	BRANCH
1176	je .L19
1177	ALIGN_4
1178
1179.L16:
1180	addsd	 %xmm2, %xmm13
1181	movaps	 %xmm0, %xmm2
1182	mulsd	 %xmm1, %xmm0
1183
1184	addsd	 %xmm7, %xmm14
1185	movsd	 3 * SIZE(AO), %xmm7
1186	mulsd	 %xmm3, %xmm2
1187
1188	addsd	 %xmm6, %xmm15
1189	movaps	 %xmm4, %xmm6
1190	mulsd	 %xmm1, %xmm4
1191
1192	addsd	 %xmm0, %xmm8
1193	movsd	 4 * SIZE(AO), %xmm0
1194	mulsd	 %xmm3, %xmm6
1195
1196	addsd	 %xmm2, %xmm9
1197	movaps	 %xmm5, %xmm2
1198	mulsd	 %xmm1, %xmm5
1199
1200	addsd	 %xmm4, %xmm10
1201	movsd	 5 * SIZE(AO), %xmm4
1202	mulsd	 %xmm3, %xmm2
1203
1204	addsd	 %xmm6, %xmm11
1205	movaps	 %xmm7, %xmm6
1206	mulsd	 %xmm1, %xmm7
1207	movsd	 2 * SIZE(BO), %xmm1
1208
1209	addsd	 %xmm5, %xmm12
1210	movsd	 6 * SIZE(AO), %xmm5
1211	mulsd	 %xmm3, %xmm6
1212	movsd	 3 * SIZE(BO), %xmm3
1213
1214	addq	$4 * SIZE, AO
1215	addq	$2 * SIZE, BO
1216	decq	%rax
1217	BRANCH
1218	jg	.L16
1219	ALIGN_4
1220
1221.L19:
1222	addsd	 %xmm2, %xmm13
1223	addsd	 %xmm7, %xmm14
1224	addsd	 %xmm6, %xmm15
1225
1226#if defined(LN) || defined(RT)
1227	movq	KK, %rax
1228#ifdef LN
1229	subq	$4, %rax
1230#else
1231	subq	$2, %rax
1232#endif
1233
1234	leaq	(, %rax, SIZE), %rax
1235
1236	movq	AORIG, AO
1237	leaq	(AO, %rax, 4), AO
1238	leaq	(B,  %rax, 2), BO
1239#endif
1240
1241#if defined(LN) || defined(LT)
1242	movsd	 0 * SIZE(BO), %xmm0
1243	movsd	 1 * SIZE(BO), %xmm1
1244	movsd	 2 * SIZE(BO), %xmm2
1245	movsd	 3 * SIZE(BO), %xmm3
1246	movsd	 4 * SIZE(BO), %xmm4
1247	movsd	 5 * SIZE(BO), %xmm5
1248	movsd	 6 * SIZE(BO), %xmm6
1249	movsd	 7 * SIZE(BO), %xmm7
1250
1251	subsd	%xmm8,  %xmm0
1252	subsd	%xmm9,  %xmm1
1253	subsd	%xmm10, %xmm2
1254	subsd	%xmm11, %xmm3
1255	subsd	%xmm12, %xmm4
1256	subsd	%xmm13, %xmm5
1257	subsd	%xmm14, %xmm6
1258	subsd	%xmm15, %xmm7
1259#else
1260	movsd	 0 * SIZE(AO), %xmm0
1261	movsd	 1 * SIZE(AO), %xmm2
1262	movsd	 2 * SIZE(AO), %xmm4
1263	movsd	 3 * SIZE(AO), %xmm6
1264
1265	movsd	 4 * SIZE(AO), %xmm1
1266	movsd	 5 * SIZE(AO), %xmm3
1267	movsd	 6 * SIZE(AO), %xmm5
1268	movsd	 7 * SIZE(AO), %xmm7
1269
1270	subsd	%xmm8,  %xmm0
1271	subsd	%xmm10, %xmm2
1272	subsd	%xmm12, %xmm4
1273	subsd	%xmm14, %xmm6
1274	subsd	%xmm9,  %xmm1
1275	subsd	%xmm11, %xmm3
1276	subsd	%xmm13, %xmm5
1277	subsd	%xmm15, %xmm7
1278#endif
1279
1280#ifdef LN
1281	movsd	15 * SIZE(AO), %xmm8
1282	mulsd	 %xmm8, %xmm6
1283	movsd	14 * SIZE(AO), %xmm9
1284	mulsd	 %xmm8, %xmm7
1285	movsd	13 * SIZE(AO), %xmm11
1286
1287	movaps	%xmm9,  %xmm10
1288	movsd	12 * SIZE(AO), %xmm13
1289	mulsd	%xmm6,  %xmm9
1290	movsd	10 * SIZE(AO), %xmm8
1291	mulsd	%xmm7,  %xmm10
1292	subsd	%xmm9,  %xmm4
1293	movsd	 9 * SIZE(AO), %xmm9
1294	subsd	%xmm10, %xmm5
1295
1296	movaps	%xmm11,  %xmm12
1297	mulsd	%xmm6,  %xmm11
1298	mulsd	%xmm7,  %xmm12
1299	subsd	%xmm11, %xmm2
1300	movsd	 8 * SIZE(AO), %xmm11
1301	subsd	%xmm12, %xmm3
1302
1303	movaps	%xmm13,  %xmm14
1304	mulsd	%xmm6,  %xmm13
1305	mulsd	%xmm7,  %xmm14
1306	subsd	%xmm13, %xmm0
1307	subsd	%xmm14, %xmm1
1308
1309	mulsd	 %xmm8, %xmm4
1310	mulsd	 %xmm8, %xmm5
1311	movsd	 5 * SIZE(AO), %xmm8
1312
1313	movaps	%xmm9,  %xmm10
1314	mulsd	%xmm4,  %xmm9
1315	mulsd	%xmm5,  %xmm10
1316	subsd	%xmm9,  %xmm2
1317	movsd	 4 * SIZE(AO), %xmm9
1318	subsd	%xmm10, %xmm3
1319
1320	movaps	%xmm11,  %xmm12
1321	mulsd	%xmm4,  %xmm11
1322	mulsd	%xmm5,  %xmm12
1323	subsd	%xmm11, %xmm0
1324	movsd	 0 * SIZE(AO), %xmm11
1325	subsd	%xmm12, %xmm1
1326
1327	mulsd	 %xmm8, %xmm2
1328	mulsd	 %xmm8, %xmm3
1329
1330	movaps	%xmm9,  %xmm10
1331	mulsd	%xmm2,  %xmm9
1332	mulsd	%xmm3,  %xmm10
1333	subsd	%xmm9,  %xmm0
1334	subsd	%xmm10, %xmm1
1335
1336	mulsd	 %xmm11, %xmm0
1337	mulsd	 %xmm11, %xmm1
1338#endif
1339
1340#ifdef LT
1341	movsd	 0 * SIZE(AO), %xmm8
1342	mulsd	 %xmm8, %xmm0
1343	movsd	 1 * SIZE(AO), %xmm9
1344	mulsd	 %xmm8, %xmm1
1345
1346	movsd	 2 * SIZE(AO), %xmm11
1347	movaps	%xmm9,  %xmm10
1348	movsd	 3 * SIZE(AO), %xmm13
1349	mulsd	%xmm0,  %xmm9
1350	movsd	 5 * SIZE(AO), %xmm8
1351	mulsd	%xmm1,  %xmm10
1352	subsd	%xmm9,  %xmm2
1353	movsd	 6 * SIZE(AO), %xmm9
1354	subsd	%xmm10, %xmm3
1355
1356	movaps	%xmm11, %xmm12
1357	mulsd	%xmm0,  %xmm11
1358	mulsd	%xmm1,  %xmm12
1359	subsd	%xmm11, %xmm4
1360	movsd	 7 * SIZE(AO), %xmm11
1361	subsd	%xmm12, %xmm5
1362
1363	movaps	%xmm13, %xmm14
1364	mulsd	%xmm0,  %xmm13
1365	mulsd	%xmm1,  %xmm14
1366	subsd	%xmm13, %xmm6
1367	subsd	%xmm14, %xmm7
1368
1369	mulsd	 %xmm8, %xmm2
1370	mulsd	 %xmm8, %xmm3
1371	movsd	10 * SIZE(AO), %xmm8
1372
1373	movaps	%xmm9,  %xmm10
1374	mulsd	%xmm2,  %xmm9
1375	mulsd	%xmm3,  %xmm10
1376	subsd	%xmm9,  %xmm4
1377	movsd	11 * SIZE(AO), %xmm9
1378	subsd	%xmm10, %xmm5
1379
1380	movaps	%xmm11, %xmm12
1381	mulsd	%xmm2,  %xmm11
1382	mulsd	%xmm3,  %xmm12
1383	subsd	%xmm11, %xmm6
1384	subsd	%xmm12, %xmm7
1385
1386	mulsd	 %xmm8, %xmm4
1387	mulsd	 %xmm8, %xmm5
1388	movsd	15 * SIZE(AO), %xmm8
1389
1390	movaps	%xmm9,  %xmm10
1391	mulsd	%xmm4,  %xmm9
1392	mulsd	%xmm5,  %xmm10
1393	subsd	%xmm9,  %xmm6
1394	subsd	%xmm10, %xmm7
1395
1396	mulsd	 %xmm8, %xmm6
1397	mulsd	 %xmm8, %xmm7
1398#endif
1399
1400#ifdef RN
1401	movsd	 0 * SIZE(BO), %xmm8
1402	mulsd	 %xmm8, %xmm0
1403	movsd	 1 * SIZE(BO), %xmm9
1404	mulsd	 %xmm8, %xmm2
1405	movsd	 3 * SIZE(BO), %xmm13
1406	mulsd	 %xmm8, %xmm4
1407	mulsd	 %xmm8, %xmm6
1408
1409	movaps	 %xmm9, %xmm10
1410	movaps	 %xmm9, %xmm11
1411	movaps	 %xmm9, %xmm12
1412
1413	mulsd	 %xmm0, %xmm9
1414	mulsd	 %xmm2, %xmm10
1415	mulsd	 %xmm4, %xmm11
1416	mulsd	 %xmm6, %xmm12
1417
1418	subsd	 %xmm9,  %xmm1
1419	subsd	 %xmm10, %xmm3
1420	subsd	 %xmm11, %xmm5
1421	subsd	 %xmm12, %xmm7
1422
1423	mulsd	 %xmm13, %xmm1
1424	mulsd	 %xmm13, %xmm3
1425	mulsd	 %xmm13, %xmm5
1426	mulsd	 %xmm13, %xmm7
1427#endif
1428
1429#ifdef RT
1430	movsd	 3 * SIZE(BO), %xmm8
1431	mulsd	 %xmm8, %xmm1
1432	movsd	 2 * SIZE(BO), %xmm9
1433	mulsd	 %xmm8, %xmm3
1434	movsd	 0 * SIZE(BO), %xmm13
1435	mulsd	 %xmm8, %xmm5
1436	mulsd	 %xmm8, %xmm7
1437
1438	movaps	 %xmm9, %xmm10
1439	movaps	 %xmm9, %xmm11
1440	movaps	 %xmm9, %xmm12
1441
1442	mulsd	 %xmm1, %xmm9
1443	mulsd	 %xmm3, %xmm10
1444	mulsd	 %xmm5, %xmm11
1445	mulsd	 %xmm7, %xmm12
1446
1447	subsd	 %xmm9,  %xmm0
1448	subsd	 %xmm10, %xmm2
1449	subsd	 %xmm11, %xmm4
1450	subsd	 %xmm12, %xmm6
1451
1452	mulsd	 %xmm13, %xmm0
1453	mulsd	 %xmm13, %xmm2
1454	mulsd	 %xmm13, %xmm4
1455	mulsd	 %xmm13, %xmm6
1456#endif
1457
/*
 * Write the solved 4x2 block to C and back into the packed buffer,
 * then update the pointers/offsets for the next 4-row block.
 */
#ifdef LN
	/* LN walks C backward: step back over the 4 rows just solved */
	subq	$4 * SIZE, CO1
	subq	$4 * SIZE, CO2
#endif

	movsd	%xmm0,  0 * SIZE(CO1)
	movsd	%xmm2,  1 * SIZE(CO1)
	movsd	%xmm4,  2 * SIZE(CO1)
	movsd	%xmm6,  3 * SIZE(CO1)

	movsd	%xmm1,  0 * SIZE(CO2)
	movsd	%xmm3,  1 * SIZE(CO2)
	movsd	%xmm5,  2 * SIZE(CO2)
	movsd	%xmm7,  3 * SIZE(CO2)

	/* Store the solved values back into the packed buffer (BO for the
	   left-side cases, AO for the right-side cases, matching the layout
	   they were loaded from).  NOTE(review): presumably so subsequent
	   rank-k updates read the solved values — standard TRSM bookkeeping. */
#if defined(LN) || defined(LT)
	movsd	%xmm0,   0 * SIZE(BO)
	movsd	%xmm1,   1 * SIZE(BO)
	movsd	%xmm2,   2 * SIZE(BO)
	movsd	%xmm3,   3 * SIZE(BO)
	movsd	%xmm4,   4 * SIZE(BO)
	movsd	%xmm5,   5 * SIZE(BO)
	movsd	%xmm6,   6 * SIZE(BO)
	movsd	%xmm7,   7 * SIZE(BO)
#else
	movsd	%xmm0,   0 * SIZE(AO)
	movsd	%xmm2,   1 * SIZE(AO)
	movsd	%xmm4,   2 * SIZE(AO)
	movsd	%xmm6,   3 * SIZE(AO)
	movsd	%xmm1,   4 * SIZE(AO)
	movsd	%xmm3,   5 * SIZE(AO)
	movsd	%xmm5,   6 * SIZE(AO)
	movsd	%xmm7,   7 * SIZE(AO)
#endif

#ifndef LN
	/* forward variants advance C over the 4 rows just written */
	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* advance AO/BO past the remaining K-KK panel:
	   AO by (K-KK)*4 elements (M unroll), BO by (K-KK)*2 (N width) */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$4, KK			/* diagonal offset moves back 4 rows */
#endif

#ifdef LT
	addq	$4, KK			/* diagonal offset moves forward 4 rows */
#endif

#ifdef RT
	/* RT: advance AORIG over this block's A panel (4*K elements) */
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L11
	ALIGN_4
1523
/*
 * .L20: handle a remaining 2-row strip of M (M & 2) against the current
 * pair of B columns.  Sets up AO/BO, preloads the first A/B elements,
 * clears the accumulators, and computes the unrolled trip count.
 */
.L20:
	testq	$2, M
	BRANCH
	je	.L30			/* no 2-row strip -> try the 1-row strip */

#ifdef LN
	/* LN walks A backward: step AORIG back over this strip (2*K elems) */
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* skip the first KK iterations: AO += KK*2, BO = B + KK*2 */
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* preload A[0..3] and clear the software-pipeline temporaries
	   (xmm2/xmm6 must be zero: the loop folds them in at its top) */
	movsd	 0 * SIZE(AO), %xmm0
	xorps	%xmm2,  %xmm2
	movsd	 1 * SIZE(AO), %xmm4
	/* NOTE(review): dead clear — xmm5 is overwritten by the load on the
	   next line (movsd from memory zero-extends); harmless. */
	xorps	%xmm5,  %xmm5
	movsd	 2 * SIZE(AO), %xmm5
	xorps	%xmm6,  %xmm6
	movsd	 3 * SIZE(AO), %xmm7

	/* preload B[0..1]; xmm8..11 are the four dot-product accumulators */
	movsd	 0 * SIZE(BO), %xmm1
	xorps	%xmm8,  %xmm8
	xorps	%xmm9,  %xmm9
	movsd	 1 * SIZE(BO), %xmm3
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

	/* trip count: KK (LT/RN) or K-KK (LN/RT), then /4 for the unroll */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4
1569
/*
 * .L22: main k-loop for the 2x2 micro-block, four k iterations per pass,
 * software-pipelined: each stage's second-column products (xmm2/xmm6) are
 * folded into the accumulators at the top of the NEXT stage, hiding the
 * mulsd latency.  Accumulators:
 *   xmm8  += A[2k+0]*B[2k+0]   xmm9  += A[2k+0]*B[2k+1]
 *   xmm10 += A[2k+1]*B[2k+0]   xmm11 += A[2k+1]*B[2k+1]
 */
.L22:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addsd	 %xmm2, %xmm9		/* fold previous stage's a0*b1 */
	movaps	 %xmm0, %xmm2		/* keep a copy of a0 for the b1 product */
	mulsd	 %xmm1, %xmm0

	addsd	 %xmm6, %xmm11		/* fold previous stage's a1*b1 */
	movaps	 %xmm4, %xmm6
	mulsd	 %xmm1, %xmm4
	movsd	 2 * SIZE(BO), %xmm1	/* next iteration's b0 */

	addsd	 %xmm0, %xmm8
	movsd	 4 * SIZE(AO), %xmm0	/* next iteration's a0 */
	mulsd	 %xmm3, %xmm2

	addsd	 %xmm4, %xmm10
	movsd	 5 * SIZE(AO), %xmm4
	mulsd	 %xmm3, %xmm6
	movsd	 3 * SIZE(BO), %xmm3	/* next iteration's b1 */

	/* --- k iteration 2 (operands preloaded in xmm5/xmm7) --- */
	addsd	 %xmm2, %xmm9
	movaps	 %xmm5, %xmm2
	mulsd	 %xmm1, %xmm5

	addsd	 %xmm6, %xmm11
	movaps	 %xmm7, %xmm6
	mulsd	 %xmm1, %xmm7
	movsd	 4 * SIZE(BO), %xmm1

	addsd	 %xmm5, %xmm8
	movsd	 6 * SIZE(AO), %xmm5
	mulsd	 %xmm3, %xmm2

	addsd	 %xmm7, %xmm10
	movsd	 7 * SIZE(AO), %xmm7
	mulsd	 %xmm3, %xmm6
	movsd	 5 * SIZE(BO), %xmm3

	/* --- k iteration 3 --- */
	addsd	 %xmm2, %xmm9
	movaps	 %xmm0, %xmm2
	mulsd	 %xmm1, %xmm0

	addsd	 %xmm6, %xmm11
	movaps	 %xmm4, %xmm6
	mulsd	 %xmm1, %xmm4
	movsd	 6 * SIZE(BO), %xmm1

	addsd	 %xmm0, %xmm8
	movsd	 8 * SIZE(AO), %xmm0
	mulsd	 %xmm3, %xmm2

	addsd	 %xmm4, %xmm10
	movsd	 9 * SIZE(AO), %xmm4
	mulsd	 %xmm3, %xmm6
	movsd	 7 * SIZE(BO), %xmm3

	/* --- k iteration 4 --- */
	addsd	 %xmm2, %xmm9
	movaps	 %xmm5, %xmm2
	mulsd	 %xmm1, %xmm5

	addsd	 %xmm6, %xmm11
	movaps	 %xmm7, %xmm6
	mulsd	 %xmm1, %xmm7
	movsd	 8 * SIZE(BO), %xmm1

	addsd	 %xmm5, %xmm8
	movsd	10 * SIZE(AO), %xmm5
	mulsd	 %xmm3, %xmm2

	addsd	 %xmm7, %xmm10
	movsd	11 * SIZE(AO), %xmm7
	mulsd	 %xmm3, %xmm6
	movsd	 9 * SIZE(BO), %xmm3

	/* 4 k-iterations consumed 8 A elements and 8 B elements */
	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne    .L22
	ALIGN_4
1650
/*
 * .L25/.L26: remainder k-loop (trip count mod 4) for the 2x2 block,
 * one k iteration per pass, same pipelined shape as .L22.
 * .L29 drains the pipeline: the last pending a0*b1 / a1*b1 products
 * still sit in xmm2/xmm6 and are folded in here.
 */
.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		/* remaining k iterations */
	BRANCH
	je .L29
	ALIGN_4

.L26:
	addsd	 %xmm2, %xmm9		/* fold pending a0*b1 */
	movaps	 %xmm0, %xmm2
	mulsd	 %xmm1, %xmm0

	addsd	 %xmm6, %xmm11		/* fold pending a1*b1 */
	movaps	 %xmm4, %xmm6
	mulsd	 %xmm1, %xmm4
	movsd	 2 * SIZE(BO), %xmm1

	mulsd	 %xmm3, %xmm2
	addsd	 %xmm0, %xmm8
	movsd	 2 * SIZE(AO), %xmm0

	mulsd	 %xmm3, %xmm6
	movsd	 3 * SIZE(BO), %xmm3
	addsd	 %xmm4, %xmm10
	movsd	 3 * SIZE(AO), %xmm4

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO
	decq	%rax
	BRANCH
	jg	.L26
	ALIGN_4

.L29:
	/* drain the software pipeline */
	addsd	 %xmm2, %xmm9
	addsd	 %xmm6, %xmm11
1692
#if defined(LN) || defined(RT)
	/*
	 * Re-derive AO/BO so they point at the 2x2 diagonal block used by
	 * the solve below.  Both variants back up KK by the same amount
	 * (LN: the M-unroll of this strip = 2; RT: the N width = 2), so
	 * the former  #ifdef LN / #else  duplication of an identical
	 * "subq $2" has been collapsed into one unconditional instruction.
	 */
	movq	KK, %rax
	subq	$2, %rax

	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO	/* AO = AORIG + (KK-2)*2 elements */
	leaq	(B,  %rax, 2), BO	/* BO = B     + (KK-2)*2 elements */
#endif
1706
/*
 * Load the current right-hand-side 2x2 block, subtract the GEMM
 * accumulation (xmm8..11), then run the triangular solve for whichever
 * of the four variants is compiled in.
 *
 * NOTE(review): diagonal entries are multiplied, not divided by — the
 * packed buffer presumably holds pre-inverted diagonals (usual OpenBLAS
 * TRSM convention); confirm against the packing routine.
 */
#if defined(LN) || defined(LT)
	/* left-side cases: RHS lives in BO, row-major over the 2 columns:
	   xmm0=c(0,0) xmm1=c(0,1) xmm2=c(1,0) xmm3=c(1,1) */
	movsd	 0 * SIZE(BO), %xmm0
	movsd	 1 * SIZE(BO), %xmm1
	movsd	 2 * SIZE(BO), %xmm2
	movsd	 3 * SIZE(BO), %xmm3

	subsd	%xmm8,  %xmm0
	subsd	%xmm9,  %xmm1
	subsd	%xmm10, %xmm2
	subsd	%xmm11, %xmm3
#else
	/* right-side cases: RHS lives in AO, column-major:
	   note the register order matches that layout */
	movsd	 0 * SIZE(AO), %xmm0
	movsd	 1 * SIZE(AO), %xmm2
	movsd	 2 * SIZE(AO), %xmm1
	movsd	 3 * SIZE(AO), %xmm3

	subsd	%xmm8,  %xmm0
	subsd	%xmm10, %xmm2
	subsd	%xmm9,  %xmm1
	subsd	%xmm11, %xmm3
#endif

#ifdef LN
	/* back-substitution with the 2x2 block of A, last row first:
	   AO[3]=last diag, AO[2]=off-diag, AO[0]=first diag */
	movsd	 3 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm2
	movsd	 2 * SIZE(AO), %xmm9
	mulsd	 %xmm8, %xmm3
	movsd	 0 * SIZE(AO), %xmm13

	movaps	 %xmm9, %xmm10
	mulsd	 %xmm2, %xmm9
	mulsd	 %xmm3, %xmm10

	subsd	 %xmm9,  %xmm0		/* row0 -= offdiag * row1 */
	subsd	 %xmm10, %xmm1

	mulsd	 %xmm13, %xmm0
	mulsd	 %xmm13, %xmm1
#endif

#ifdef LT
	/* forward substitution, first row first:
	   AO[0]=first diag, AO[1]=off-diag, AO[3]=last diag */
	movsd	 0 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm0
	movsd	 1 * SIZE(AO), %xmm9
	mulsd	 %xmm8, %xmm1
	movsd	 3 * SIZE(AO), %xmm13

	movaps	 %xmm9, %xmm10
	mulsd	 %xmm0, %xmm9
	mulsd	 %xmm1, %xmm10

	subsd	 %xmm9,  %xmm2		/* row1 -= offdiag * row0 */
	subsd	 %xmm10, %xmm3

	mulsd	 %xmm13, %xmm2
	mulsd	 %xmm13, %xmm3
#endif

#ifdef RN
	/* forward substitution through B, first column first:
	   BO[0]=first diag, BO[1]=off-diag, BO[3]=last diag */
	movsd	 0 * SIZE(BO), %xmm8
	mulsd	 %xmm8, %xmm0
	movsd	 1 * SIZE(BO), %xmm9
	mulsd	 %xmm8, %xmm2
	movsd	 3 * SIZE(BO), %xmm13

	movaps	 %xmm9, %xmm10
	mulsd	 %xmm0, %xmm9
	mulsd	 %xmm2, %xmm10

	subsd	 %xmm9,  %xmm1		/* col1 -= offdiag * col0 */
	subsd	 %xmm10, %xmm3

	mulsd	 %xmm13, %xmm1
	mulsd	 %xmm13, %xmm3
#endif

#ifdef RT
	/* back-substitution through B, last column first:
	   BO[3]=last diag, BO[2]=off-diag, BO[0]=first diag */
	movsd	 3 * SIZE(BO), %xmm8
	mulsd	 %xmm8, %xmm1
	movsd	 2 * SIZE(BO), %xmm9
	mulsd	 %xmm8, %xmm3
	movsd	 0 * SIZE(BO), %xmm13

	movaps	 %xmm9, %xmm10
	mulsd	 %xmm1, %xmm9
	mulsd	 %xmm3, %xmm10

	subsd	 %xmm9,  %xmm0		/* col0 -= offdiag * col1 */
	subsd	 %xmm10, %xmm2

	mulsd	 %xmm13, %xmm0
	mulsd	 %xmm13, %xmm2
#endif
1800
/*
 * Write the solved 2x2 block to C and back into the packed buffer,
 * then update the pointers/offsets (mirror of the 4x2 writeback).
 */
#ifdef LN
	subq	$2 * SIZE, CO1		/* LN walks C backward */
	subq	$2 * SIZE, CO2
#endif

	movsd	%xmm0,  0 * SIZE(CO1)
	movsd	%xmm2,  1 * SIZE(CO1)
	movsd	%xmm1,  0 * SIZE(CO2)
	movsd	%xmm3,  1 * SIZE(CO2)

	/* store solved values back into the packed buffer, same layout
	   they were loaded from above */
#if defined(LN) || defined(LT)
	movsd	%xmm0,   0 * SIZE(BO)
	movsd	%xmm1,   1 * SIZE(BO)
	movsd	%xmm2,   2 * SIZE(BO)
	movsd	%xmm3,   3 * SIZE(BO)
#else
	movsd	%xmm0,   0 * SIZE(AO)
	movsd	%xmm2,   1 * SIZE(AO)
	movsd	%xmm1,   2 * SIZE(AO)
	movsd	%xmm3,   3 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* advance AO/BO past the remaining K-KK panel (2 wide each) */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* RT: advance AORIG over this strip's A panel (2*K elements) */
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4
1850
/*
 * .L30: handle a remaining single row of M (M & 1) against the current
 * pair of B columns.  xmm8/xmm9 accumulate a_k*b(k,0) and a_k*b(k,1);
 * xmm5/xmm7 are the software-pipeline temporaries (must start at zero,
 * they are folded in at the top of .L32).
 */
.L30:
	testq	$1, M
	je	.L39

#ifdef LN
	/* LN: step AORIG back over this row's panel (1*K elements) */
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif


#if defined(LN) || defined(RT)
	/* skip the first KK iterations: AO += KK*1, BO = B + KK*2 */
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* preload A[0..1], B[0..1]; clear pipeline temps and accumulators */
	movsd	 0 * SIZE(AO), %xmm0
	xorps	%xmm7,  %xmm7
	movsd	 1 * SIZE(AO), %xmm2
	xorps	%xmm5,  %xmm5

	movsd	 0 * SIZE(BO), %xmm1
	xorps	%xmm8,  %xmm8
	xorps	%xmm9,  %xmm9
	movsd	 1 * SIZE(BO), %xmm3

	/* trip count, then /4 for the unroll */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4
1891
/*
 * .L32: main k-loop for the 1x2 block, four k iterations per pass,
 * pipelined: odd iterations' products accumulate through xmm5/xmm7,
 * which are folded at the top of the next pass (and drained at .L35).
 */
.L32:
	addsd	 %xmm5, %xmm8		/* fold pending a*b0 */
	movsd	 2 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	addsd	 %xmm7, %xmm9		/* fold pending a*b1 */
	movsd	 3 * SIZE(BO), %xmm7
	mulsd	 %xmm0, %xmm3
	movsd	 2 * SIZE(AO), %xmm0

	addsd	 %xmm1, %xmm8
	movsd	 4 * SIZE(BO), %xmm1
	mulsd	 %xmm2, %xmm5

	addsd	 %xmm3, %xmm9
	movsd	 5 * SIZE(BO), %xmm3
	mulsd	 %xmm2, %xmm7
	movsd	 3 * SIZE(AO), %xmm2

	addsd	 %xmm5, %xmm8
	movsd	 6 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm1

	addsd	 %xmm7, %xmm9
	movsd	 7 * SIZE(BO), %xmm7
	mulsd	 %xmm0, %xmm3
	movsd	 4 * SIZE(AO), %xmm0

	addsd	 %xmm1, %xmm8
	movsd	 8 * SIZE(BO), %xmm1
	mulsd	 %xmm2, %xmm5

	addsd	 %xmm3, %xmm9
	movsd	 9 * SIZE(BO), %xmm3
	mulsd	 %xmm2, %xmm7
	movsd	 5 * SIZE(AO), %xmm2

	/* 4 k-iterations consumed 4 A elements and 8 B elements */
	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne    .L32
	ALIGN_4
1936
/*
 * .L35: drain the pipeline (pending xmm5/xmm7 products), then run the
 * remainder k-loop (trip count mod 4), one iteration per pass.
 */
.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	addsd	 %xmm5, %xmm8		/* fold last pending a*b0 */
	addsd	 %xmm7, %xmm9		/* fold last pending a*b1 */

	andq	$3, %rax		/* remaining k iterations */
	BRANCH
	BRANCH
	je	.L38
	ALIGN_4

.L36:
	mulsd	 %xmm0, %xmm1
	addq	$2 * SIZE, BO
	mulsd	 %xmm0, %xmm3
	movsd	 1 * SIZE(AO), %xmm0

	addsd	 %xmm1, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	addsd	 %xmm3, %xmm9
	movsd	 1 * SIZE(BO), %xmm3

	addq	$1 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L36
	ALIGN_4
1969
/*
 * .L38: re-derive AO/BO for the 1x2 solve, subtract the accumulation,
 * and solve.  For LN/LT the "triangle" is the single (pre-inverted,
 * presumably — see note on the 2x2 solve) diagonal element of A; for
 * RN/RT it is the 2x2 diagonal block of B.
 */
.L38:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax		/* LN backs up by the M-unroll (1) */
#else
	subq	$2, %rax		/* RT backs up by the N width (2) */
#endif

	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	/* RHS from BO: xmm0=c(0,0), xmm1=c(0,1) */
	movsd	 0 * SIZE(BO), %xmm0
	movsd	 1 * SIZE(BO), %xmm1

	subsd	%xmm8,  %xmm0
	subsd	%xmm9,  %xmm1
#else
	/* RHS from AO, same element order for a single row */
	movsd	 0 * SIZE(AO), %xmm0
	movsd	 1 * SIZE(AO), %xmm1

	subsd	%xmm8,  %xmm0
	subsd	%xmm9,  %xmm1
#endif

#if defined(LN) || defined(LT)
	/* 1x1 triangle: scale both columns by the diagonal of A */
	movsd	 0 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm0
	mulsd	 %xmm8, %xmm1
#endif

#ifdef RN
	/* forward substitution through the 2x2 block of B */
	movsd	 0 * SIZE(BO), %xmm8
	mulsd	 %xmm8, %xmm0
	movsd	 1 * SIZE(BO), %xmm9
	mulsd	 %xmm0,  %xmm9
	movsd	 3 * SIZE(BO), %xmm13
	subsd	 %xmm9,  %xmm1		/* c1 -= offdiag * x0 */
	mulsd	 %xmm13, %xmm1
#endif

#ifdef RT
	/* back-substitution through the 2x2 block of B */
	movsd	 3 * SIZE(BO), %xmm8
	mulsd	 %xmm8, %xmm1
	movsd	 2 * SIZE(BO), %xmm9
	mulsd	 %xmm1,  %xmm9
	movsd	 0 * SIZE(BO), %xmm13
	subsd	 %xmm9,  %xmm0		/* c0 -= offdiag * x1 */
	mulsd	 %xmm13, %xmm0
#endif
2024
/*
 * Write the solved 1x2 block to C and back into the packed buffer,
 * then update pointers/offsets (mirror of the 4x2/2x2 writebacks).
 */
#ifdef LN
	subq	$1 * SIZE, CO1		/* LN walks C backward */
	subq	$1 * SIZE, CO2
#endif

	movsd	%xmm0,  0 * SIZE(CO1)
	movsd	%xmm1,  0 * SIZE(CO2)

	/* store the solved values back into the packed buffer */
#if defined(LN) || defined(LT)
	movsd	%xmm0,   0 * SIZE(BO)
	movsd	%xmm1,   1 * SIZE(BO)
#else
	movsd	%xmm0,   0 * SIZE(AO)
	movsd	%xmm1,   1 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* advance AO/BO past the remaining K-KK panel (1 and 2 wide) */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* RT: advance AORIG over this row's A panel (K elements) */
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4
2068
/*
 * .L39: end of the M loop for this pair of B columns — advance B to the
 * next 2-column panel and update the diagonal offset KK, then loop on J.
 */
.L39:
#ifdef LN
	/* LN: BO was not walked forward; advance B explicitly by 2*K */
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 2), B
#endif
#if defined(LT) || defined(RN)
	/* BO has already walked to the end of the panel */
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK			/* N offset moves forward 2 columns */
#endif

#ifdef RT
	subq	$2, KK			/* N offset moves back 2 columns */
#endif

	decq	J			# j --
	jg	.L10
	ALIGN_4
2089
/*
 * .L999: function epilogue.  Restore the callee-saved registers from the
 * save area established by the prologue (before this excerpt), release
 * the frame, and return.  STACKSIZE and the save-slot layout are defined
 * with the prologue.
 */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* Microsoft x64 additionally treats rdi, rsi and xmm6-xmm15 as
	   callee-saved — restore them too */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE

2117