/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%rbx
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#endif

#define POSINV	  0(%rsp)
#define J	 16(%rsp)
#define OFFSET	 24(%rsp)
#define KK	 32(%rsp)
#define KKK	 40(%rsp)
#define AORIG    48(%rsp)
#define BORIG	 56(%rsp)
#define BUFFER	128(%rsp)

#define PREFETCH_R    (8 * 4 + 0)
#define PREFETCH_W    (PREFETCH_R)

#define PREFETCHSIZE  (8 * 17 + 2)
#define PREFETCH     prefetcht0

#ifndef CONJ
#define NN
#else
#if defined(LN) || defined(LT)
#define CN
#else
#define NC
#endif
#endif

#define ADD1	  addpd
#define ADD2	  addpd

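/* Layout, as read from the code below:                             */
/*   .L01  outer loop over pairs of columns of B/C (J = N >> 1)     */
/*   .L02  expand B into the on-stack BUFFER, duplicating each      */
/*         double with movddup so the k loops can use full mulpd    */
/*   .L10  2x2 tile kernel; .L41 handles a leftover odd row of A    */
/*   .L101 second pass over the same structure when N is odd        */
/* After each k loop the accumulators are recombined via SHUFPD_1   */
/* and the POSINV sign mask, the per-tile solve for the selected    */
/* LN/LT/RN/RT variant is applied, and the tile is written back to  */
/* C and to the packed A/B buffers.                                 */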
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
#endif

	movq	OLD_LDC,    LDC
	movq	OLD_OFFSET, %rax

	movq	%rsp, %r15	# save old stack
	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	%rax, KK
	movq	%rax, OFFSET

	movq	OLD_M, M
	movq	OLD_N, N

	subq	$-16 * SIZE, A
	subq	$-16 * SIZE, B

	pcmpeqb	%xmm15, %xmm15
	psllq	$63, %xmm15	# Generate mask
	pxor	%xmm2, %xmm2

	movlpd	  %xmm2,  0 + POSINV
	movlpd	  %xmm15, 8 + POSINV

	salq	$ZBASE_SHIFT, LDC

#ifdef LN
       movq	M, %rax
       salq	$ZBASE_SHIFT, %rax
       addq	%rax, C
       imulq	K, %rax
       addq	%rax, A
#endif

#ifdef RT
       movq	N, %rax
       salq	$ZBASE_SHIFT, %rax
       imulq	K, %rax
       addq	%rax, B

       movq	N, %rax
       imulq	LDC, %rax
       addq	%rax, C
#endif

#ifdef RN
	negq	KK
#endif

#ifdef RT
       movq	N, %rax
       subq	OFFSET, %rax
       movq	%rax, KK
#endif

	movq	N,  J
	sarq	$1, J		# j = (n >> 1)
	jle	.L100
	ALIGN_4

.L01:
#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

	leaq	16 * SIZE +  BUFFER, BO

#ifdef RT
       movq	K, %rax
       salq	$1 + ZBASE_SHIFT, %rax
       subq	%rax, B
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	B, BORIG
	salq	$ZBASE_SHIFT, %rax
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

#if defined(LT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	jle	.L03

	addq	%rax, %rax
	ALIGN_4

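/* Copy a strip of B into BUFFER: movddup duplicates each double    */
/* into both lanes so the inner k loops can use full-width mulpd.   */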
.L02:
	prefetcht0	(PREFETCH_R + 0) * SIZE(B)

	movddup	 -16 * SIZE(B), %xmm8
	movddup	 -15 * SIZE(B), %xmm9
	movddup	 -14 * SIZE(B), %xmm10
	movddup	 -13 * SIZE(B), %xmm11
	movddup	 -12 * SIZE(B), %xmm12
	movddup	 -11 * SIZE(B), %xmm13
	movddup	 -10 * SIZE(B), %xmm14
	movddup	  -9 * SIZE(B), %xmm15

	prefetcht0	(PREFETCH_W + 0) * SIZE(BO)

	movapd	%xmm8,  -16 * SIZE(BO)
	movapd	%xmm9,  -14 * SIZE(BO)
	movapd	%xmm10,  -12 * SIZE(BO)
	movapd	%xmm11,  -10 * SIZE(BO)

	prefetcht0	(PREFETCH_W + 8) * SIZE(BO)

	movapd	%xmm12,   -8 * SIZE(BO)
	movapd	%xmm13,   -6 * SIZE(BO)
	movapd	%xmm14,   -4 * SIZE(BO)
	movapd	%xmm15,   -2 * SIZE(BO)

	addq	$  8 * SIZE, B
	subq	$-16 * SIZE, BO
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	BRANCH
	jle	.L05
	ALIGN_4

.L04:
	movddup	 -16 * SIZE(B), %xmm8
	movddup	 -15 * SIZE(B), %xmm9
	movddup	 -14 * SIZE(B), %xmm10
	movddup	 -13 * SIZE(B), %xmm11

	movapd	%xmm8,  -16 * SIZE(BO)
	movapd	%xmm9,  -14 * SIZE(BO)
	movapd	%xmm10,  -12 * SIZE(BO)
	movapd	%xmm11,  -10 * SIZE(BO)

	addq	$ 4 * SIZE, B
	addq	$ 8 * SIZE, BO

	decq	%rax
	jne	.L04
	ALIGN_4

.L05:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       leaq	(, LDC, 2), %rax
       subq	%rax, C
#endif

	movq	C, CO1
	leaq	(C, LDC, 1), CO2

#ifndef RT
	leaq	(C, LDC, 2), C
#endif

	testq	$1, M
	jle	.L30

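/* M odd: handle the single leftover row of A (a 1x2 tile of C)     */
/* before falling through to the main 2x2 loop at .L30.             */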
#ifdef LN
       movq	K, %rax
       salq	$0 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	addq	%rax, AO
#endif

	leaq	16 * SIZE + BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm8, %xmm8
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	pxor	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L42

.L41:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO)

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	movapd	-12 * SIZE(BO), %xmm4
	movapd	-10 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm0, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	movapd	-14 * SIZE(AO), %xmm0
	movapd	 -8 * SIZE(BO), %xmm2
	movapd	 -6 * SIZE(BO), %xmm3
	movapd	 -4 * SIZE(BO), %xmm4
	movapd	 -2 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm0, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	movapd	-12 * SIZE(AO), %xmm0
	movapd	  0 * SIZE(BO), %xmm2
	movapd	  2 * SIZE(BO), %xmm3
	movapd	  4 * SIZE(BO), %xmm4
	movapd	  6 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm0, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	movapd	-10 * SIZE(AO), %xmm0
	movapd	  8 * SIZE(BO), %xmm2
	movapd	 10 * SIZE(BO), %xmm3
	movapd	 12 * SIZE(BO), %xmm4
	movapd	 14 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm0, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	subq	$ -8 * SIZE, AO
	subq	$-32 * SIZE, BO
	subq	$1, %rax
	jne    .L41

.L42:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV,  %xmm7

	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L44

.L43:
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	movapd	-12 * SIZE(BO), %xmm4
	movapd	-10 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm0, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO
	subq	$1, %rax
	jg	.L43
	ALIGN_4

.L44:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	16 * SIZE + BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm7, %xmm9
	xorpd	%xmm7, %xmm11
#else
	xorpd	%xmm7, %xmm8
	xorpd	%xmm7, %xmm10
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm11, %xmm10
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(B), %xmm9
	movapd	-14 * SIZE(B), %xmm11

	subpd	%xmm8,  %xmm9
	subpd	%xmm10,  %xmm11
#else
	movapd	-16 * SIZE(AO), %xmm9
	movapd	-14 * SIZE(AO), %xmm11

	subpd	%xmm8,  %xmm9
	subpd	%xmm10,  %xmm11
#endif

#ifndef CONJ
	SHUFPD_1 %xmm7, %xmm7
#endif

#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm11, %xmm10

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm10

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8
	mulpd	 %xmm0, %xmm11
	mulpd	 %xmm1, %xmm10

	addpd	 %xmm8, %xmm9
	addpd	 %xmm10, %xmm11
#endif

#ifdef RN
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1
	movddup	-14 * SIZE(B), %xmm2
	movddup	-13 * SIZE(B), %xmm3
	movddup	-10 * SIZE(B), %xmm4
	movddup	 -9 * SIZE(B), %xmm5

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8

	addpd	 %xmm8, %xmm9

	movapd	 %xmm9, %xmm8
	pshufd	 $0x4e, %xmm9, %xmm12

	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm3, %xmm12

	subpd	 %xmm8, %xmm11
	subpd	 %xmm12, %xmm11

	pshufd	$0x4e, %xmm11, %xmm10

	xorpd	 %xmm7, %xmm10

	mulpd	 %xmm4, %xmm11
	mulpd	 %xmm5, %xmm10

	addpd	 %xmm10, %xmm11
#endif

#ifdef RT
	movddup	-10 * SIZE(B), %xmm0
	movddup	 -9 * SIZE(B), %xmm1
	movddup	-12 * SIZE(B), %xmm2
	movddup	-11 * SIZE(B), %xmm3
	movddup	-16 * SIZE(B), %xmm4
	movddup	-15 * SIZE(B), %xmm5

	pshufd	$0x4e, %xmm11, %xmm10

	xorpd	 %xmm7, %xmm10

	mulpd	 %xmm0, %xmm11
	mulpd	 %xmm1, %xmm10

	addpd	 %xmm10, %xmm11

	movapd	 %xmm11, %xmm8
	pshufd	 $0x4e, %xmm11, %xmm12

	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm3, %xmm12

	subpd	 %xmm8, %xmm9
	subpd	 %xmm12, %xmm9

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm4, %xmm9
	mulpd	 %xmm5, %xmm8

	addpd	 %xmm8, %xmm9
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

	movsd	%xmm9,   0 * SIZE(CO1)
	movhpd	%xmm9,   1 * SIZE(CO1)

	movsd	%xmm11,  0 * SIZE(CO2)
	movhpd	%xmm11,  1 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movapd	%xmm9,  -16 * SIZE(B)
	movapd	%xmm11, -14 * SIZE(B)

	movddup	%xmm9,  %xmm8
	unpckhpd %xmm9,  %xmm9
	movddup	%xmm11, %xmm10
	unpckhpd %xmm11, %xmm11

	movapd	%xmm8,  -16 * SIZE(BO)
	movapd	%xmm9,  -14 * SIZE(BO)
	movapd	%xmm10, -12 * SIZE(BO)
	movapd	%xmm11, -10 * SIZE(BO)
#else
	movapd	%xmm9,  -16 * SIZE(AO)
	movapd	%xmm11, -14 * SIZE(AO)

#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#ifdef LT
	addq	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L30:
	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L99
	ALIGN_4

.L10:
	leaq	(PREFETCH_R +  0) * SIZE(B), BB

#ifdef LN
       movq	K, %rax
       salq	$1 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#endif

	leaq	16 * SIZE + BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	prefetcht2	0 * SIZE(BB)

#ifdef LN
	pxor	%xmm8, %xmm8
	prefetcht1    -3 * SIZE(CO1)
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	prefetcht1    -3 * SIZE(CO2)
	pxor	%xmm11, %xmm11
#else
	pxor	%xmm8, %xmm8
	prefetcht1     3 * SIZE(CO1)
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	prefetcht1     3 * SIZE(CO2)
	pxor	%xmm11, %xmm11
#endif

	pxor	%xmm12, %xmm12
	pxor	%xmm13, %xmm13
	pxor	%xmm14, %xmm14
	pxor	%xmm15, %xmm15

	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5

	subq		$-8 * SIZE, BB

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L15
	ALIGN_4

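/* Main 2x2 k loop, unrolled 4x: two rows of A (xmm0/xmm1) against  */
/* two packed columns of B, accumulating into xmm8-xmm15.           */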
.L12:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO)

	movapd	-16 * SIZE(AO), %xmm0
	ADD1	%xmm2, %xmm10
	movapd	-16 * SIZE(BO), %xmm2
	ADD1	%xmm3, %xmm14
	movapd	 %xmm2, %xmm3
	movapd	-14 * SIZE(AO), %xmm1
	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm11
	movapd	-14 * SIZE(BO), %xmm4
	ADD2	%xmm5, %xmm15
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	movapd	-12 * SIZE(BO), %xmm2
	ADD1	%xmm3, %xmm12
	movapd	 %xmm2, %xmm3
	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm9
	movapd	-10 * SIZE(BO), %xmm4
	ADD2	%xmm5, %xmm13
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	movapd	-12 * SIZE(AO), %xmm0
	ADD1	%xmm2, %xmm10
	movapd	 -8 * SIZE(BO), %xmm2
	ADD1	%xmm3, %xmm14
	movapd	 %xmm2, %xmm3
	movapd	-10 * SIZE(AO), %xmm1
	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm11
	ADD2	%xmm5, %xmm15
	movapd	-6 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	movapd	-4 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13
	movapd	-2 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5
	PREFETCH (PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	movapd	-8 * SIZE(AO), %xmm0
	ADD1	%xmm2, %xmm10
	movapd	 0 * SIZE(BO), %xmm2
	ADD1	%xmm3, %xmm14
	movapd	 %xmm2, %xmm3
	movapd	-6 * SIZE(AO), %xmm1
	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm11
	movapd	 2 * SIZE(BO), %xmm4
	ADD2	%xmm5, %xmm15
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	movapd	 4 * SIZE(BO), %xmm2
	ADD1	%xmm3, %xmm12
	movapd	 %xmm2, %xmm3
	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm9
	movapd	 6 * SIZE(BO), %xmm4
	ADD2	%xmm5, %xmm13
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	movapd	-4 * SIZE(AO), %xmm0
	ADD1	%xmm2, %xmm10
	ADD1	%xmm3, %xmm14
	movapd	 8 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	mulpd	%xmm0, %xmm2
	movapd	-2 * SIZE(AO), %xmm1
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm11
	movapd	10 * SIZE(BO), %xmm4
	ADD2	%xmm5, %xmm15
	subq	$-32 * SIZE, BO
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	movapd	-20 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	mulpd	%xmm0, %xmm2
	subq	$-16 * SIZE, AO
	mulpd	%xmm1, %xmm3
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13
	movapd	-18 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	subq	$1, %rax
	BRANCH
	BRANCH
	jg	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV,  %xmm7

	andq	$3, %rax
	BRANCH
	BRANCH
	je	.L19
	ALIGN_4

.L16:
	ADD1	%xmm2, %xmm10
	ADD1	%xmm3, %xmm14
	ADD2	%xmm4, %xmm11
	ADD2	%xmm5, %xmm15

	movapd	-16 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	-14 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	movapd	-16 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm2
	movapd	-14 * SIZE(AO), %xmm1
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	-12 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	-10 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L16
	ALIGN_4

.L19:
	ADD1	%xmm2, %xmm10
	ADD1	%xmm3, %xmm14
	ADD2	%xmm4, %xmm11
	ADD2	%xmm5, %xmm15

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	16 * SIZE + BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

	SHUFPD_1 %xmm9,  %xmm9
	SHUFPD_1 %xmm11, %xmm11
	SHUFPD_1 %xmm13, %xmm13
	SHUFPD_1 %xmm15, %xmm15

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm7, %xmm9
	xorpd	%xmm7, %xmm11
	xorpd	%xmm7, %xmm13
	xorpd	%xmm7, %xmm15
#else
	xorpd	%xmm7, %xmm8
	xorpd	%xmm7, %xmm10
	xorpd	%xmm7, %xmm12
	xorpd	%xmm7, %xmm14
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(B), %xmm9
	movapd	-14 * SIZE(B), %xmm11
	movapd	-12 * SIZE(B), %xmm13
	movapd	-10 * SIZE(B), %xmm15

	subpd	%xmm8,   %xmm9
	subpd	%xmm10,  %xmm11
	subpd	%xmm12,  %xmm13
	subpd	%xmm14,  %xmm15
#else
	movapd	-16 * SIZE(AO), %xmm9
	movapd	-14 * SIZE(AO), %xmm13
	movapd	-12 * SIZE(AO), %xmm11
	movapd	-10 * SIZE(AO), %xmm15

	subpd	%xmm8,   %xmm9
	subpd	%xmm10,  %xmm11
	subpd	%xmm12,  %xmm13
	subpd	%xmm14,  %xmm15
#endif

#ifndef CONJ
	SHUFPD_1 %xmm7, %xmm7
#endif

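/* Per-tile solve: pshufd $0x4e swaps the two doubles of a complex  */
/* value; combined with the xorpd sign mask in xmm7 and mulpd/addpd */
/* this forms the complex multiplies against the triangular factor. */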
#ifdef LN
	movddup	-10 * SIZE(AO), %xmm0
	movddup	 -9 * SIZE(AO), %xmm1
	movddup	-12 * SIZE(AO), %xmm2
	movddup	-11 * SIZE(AO), %xmm3
	movddup	-16 * SIZE(AO), %xmm4
	movddup	-15 * SIZE(AO), %xmm5

	pshufd	$0x4e, %xmm13, %xmm12
	pshufd	$0x4e, %xmm15, %xmm14

	xorpd	 %xmm7, %xmm12
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm0, %xmm13
	mulpd	 %xmm1, %xmm12
	mulpd	 %xmm0, %xmm15
	mulpd	 %xmm1, %xmm14

	addpd	 %xmm12, %xmm13
	addpd	 %xmm14, %xmm15

	movapd	 %xmm13, %xmm8
	movapd	 %xmm15, %xmm10
	pshufd	 $0x4e, %xmm13, %xmm12
	pshufd	 $0x4e, %xmm15, %xmm14

	xorpd	 %xmm7, %xmm12
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm2, %xmm10
	mulpd	 %xmm3, %xmm12
	mulpd	 %xmm3, %xmm14

	subpd	 %xmm8, %xmm9
	subpd	 %xmm10, %xmm11
	subpd	 %xmm12, %xmm9
	subpd	 %xmm14, %xmm11

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm11, %xmm10

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm10

	mulpd	 %xmm4, %xmm9
	mulpd	 %xmm5, %xmm8
	mulpd	 %xmm4, %xmm11
	mulpd	 %xmm5, %xmm10

	addpd	 %xmm8, %xmm9
	addpd	 %xmm10, %xmm11
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1
	movddup	-14 * SIZE(AO), %xmm2
	movddup	-13 * SIZE(AO), %xmm3
	movddup	-10 * SIZE(AO), %xmm4
	movddup	 -9 * SIZE(AO), %xmm5

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm11, %xmm10

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm10

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8
	mulpd	 %xmm0, %xmm11
	mulpd	 %xmm1, %xmm10

	addpd	 %xmm8, %xmm9
	addpd	 %xmm10, %xmm11

	movapd	 %xmm9, %xmm8
	movapd	 %xmm11, %xmm10
	pshufd	 $0x4e, %xmm9, %xmm12
	pshufd	 $0x4e, %xmm11, %xmm14

	xorpd	 %xmm7, %xmm12
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm2, %xmm10
	mulpd	 %xmm3, %xmm12
	mulpd	 %xmm3, %xmm14

	subpd	 %xmm8, %xmm13
	subpd	 %xmm10, %xmm15
	subpd	 %xmm12, %xmm13
	subpd	 %xmm14, %xmm15

	pshufd	$0x4e, %xmm13, %xmm12
	pshufd	$0x4e, %xmm15, %xmm14

	xorpd	 %xmm7, %xmm12
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm4, %xmm13
	mulpd	 %xmm5, %xmm12
	mulpd	 %xmm4, %xmm15
	mulpd	 %xmm5, %xmm14

	addpd	 %xmm12, %xmm13
	addpd	 %xmm14, %xmm15
#endif

#ifdef RN
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1
	movddup	-14 * SIZE(B), %xmm2
	movddup	-13 * SIZE(B), %xmm3
	movddup	-10 * SIZE(B), %xmm4
	movddup	 -9 * SIZE(B), %xmm5

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8
	mulpd	 %xmm0, %xmm13
	mulpd	 %xmm1, %xmm12

	addpd	 %xmm8, %xmm9
	addpd	 %xmm12, %xmm13

	movapd	 %xmm9, %xmm8
	movapd	 %xmm13, %xmm10
	pshufd	 $0x4e, %xmm9, %xmm12
	pshufd	 $0x4e, %xmm13, %xmm14

	xorpd	 %xmm7, %xmm12
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm2, %xmm10
	mulpd	 %xmm3, %xmm12
	mulpd	 %xmm3, %xmm14

	subpd	 %xmm8, %xmm11
	subpd	 %xmm10, %xmm15
	subpd	 %xmm12, %xmm11
	subpd	 %xmm14, %xmm15

	pshufd	$0x4e, %xmm11, %xmm10
	pshufd	$0x4e, %xmm15, %xmm14

	xorpd	 %xmm7, %xmm10
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm4, %xmm11
	mulpd	 %xmm5, %xmm10
	mulpd	 %xmm4, %xmm15
	mulpd	 %xmm5, %xmm14

	addpd	 %xmm10, %xmm11
	addpd	 %xmm14, %xmm15
#endif

#ifdef RT
	movddup	-10 * SIZE(B), %xmm0
	movddup	 -9 * SIZE(B), %xmm1
	movddup	-12 * SIZE(B), %xmm2
	movddup	-11 * SIZE(B), %xmm3
	movddup	-16 * SIZE(B), %xmm4
	movddup	-15 * SIZE(B), %xmm5

	pshufd	$0x4e, %xmm11, %xmm10
	pshufd	$0x4e, %xmm15, %xmm14

	xorpd	 %xmm7, %xmm10
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm0, %xmm11
	mulpd	 %xmm1, %xmm10
	mulpd	 %xmm0, %xmm15
	mulpd	 %xmm1, %xmm14

	addpd	 %xmm10, %xmm11
	addpd	 %xmm14, %xmm15

	movapd	 %xmm11, %xmm8
	movapd	 %xmm15, %xmm10
	pshufd	 $0x4e, %xmm11, %xmm12
	pshufd	 $0x4e, %xmm15, %xmm14

	xorpd	 %xmm7, %xmm12
	xorpd	 %xmm7, %xmm14

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm2, %xmm10
	mulpd	 %xmm3, %xmm12
	mulpd	 %xmm3, %xmm14

	subpd	 %xmm8, %xmm9
	subpd	 %xmm10, %xmm13
	subpd	 %xmm12, %xmm9
	subpd	 %xmm14, %xmm13

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm4, %xmm9
	mulpd	 %xmm5, %xmm8
	mulpd	 %xmm4, %xmm13
	mulpd	 %xmm5, %xmm12

	addpd	 %xmm8, %xmm9
	addpd	 %xmm12, %xmm13
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
	subq	$4 * SIZE, CO2
#endif

	movsd	%xmm9,    0 * SIZE(CO1)
	movhpd	%xmm9,    1 * SIZE(CO1)
	movsd	%xmm13,   2 * SIZE(CO1)
	movhpd	%xmm13,   3 * SIZE(CO1)

	movsd	%xmm11,   0 * SIZE(CO2)
	movhpd	%xmm11,   1 * SIZE(CO2)
	movsd	%xmm15,   2 * SIZE(CO2)
	movhpd	%xmm15,   3 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movapd	%xmm9,  -16 * SIZE(B)
	movapd	%xmm11, -14 * SIZE(B)
	movapd	%xmm13, -12 * SIZE(B)
	movapd	%xmm15, -10 * SIZE(B)

	movddup	%xmm9,  %xmm8
	unpckhpd %xmm9,  %xmm9
	movddup	%xmm11, %xmm10
	unpckhpd %xmm11, %xmm11
	movddup	%xmm13, %xmm12
	unpckhpd %xmm13, %xmm13
	movddup	%xmm15, %xmm14
	unpckhpd %xmm15, %xmm15

	movapd	%xmm8,  -16 * SIZE(BO)
	movapd	%xmm9,  -14 * SIZE(BO)
	movapd	%xmm10, -12 * SIZE(BO)
	movapd	%xmm11, -10 * SIZE(BO)
	movapd	%xmm12,  -8 * SIZE(BO)
	movapd	%xmm13,  -6 * SIZE(BO)
	movapd	%xmm14,  -4 * SIZE(BO)
	movapd	%xmm15,  -2 * SIZE(BO)
#else
	movapd	%xmm9,  -16 * SIZE(AO)
	movapd	%xmm13, -14 * SIZE(AO)
	movapd	%xmm11, -12 * SIZE(AO)
	movapd	%xmm15, -10 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#ifdef LT
	addq	$8 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$2, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L10
	ALIGN_4

.L99:
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 4), B
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B,  %rax, 2 * COMPSIZE), B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J			# j --
	jg	.L01

.L100:
	testq	$1, N
	jle	.L999

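/* N odd: repeat the whole pass for the single remaining column.    */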
.L101:
#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

	leaq	BUFFER, BO

#ifdef RT
       movq	K, %rax
       salq	$0 + ZBASE_SHIFT, %rax
       subq	%rax, B
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	B, BORIG
	salq	$ZBASE_SHIFT, %rax
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

#if defined(LT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	jle	.L103
	ALIGN_4

.L102:
	movddup	 -16 * SIZE(B), %xmm8
	movddup	 -15 * SIZE(B), %xmm9
	movddup	 -14 * SIZE(B), %xmm10
	movddup	 -13 * SIZE(B), %xmm11
	movddup	 -12 * SIZE(B), %xmm12
	movddup	 -11 * SIZE(B), %xmm13
	movddup	 -10 * SIZE(B), %xmm14
	movddup	  -9 * SIZE(B), %xmm15

	movapd	%xmm8,  0 * SIZE(BO)
	movapd	%xmm9,  2 * SIZE(BO)
	movapd	%xmm10,  4 * SIZE(BO)
	movapd	%xmm11,  6 * SIZE(BO)
	movapd	%xmm12,  8 * SIZE(BO)
	movapd	%xmm13, 10 * SIZE(BO)
	movapd	%xmm14, 12 * SIZE(BO)
	movapd	%xmm15, 14 * SIZE(BO)

	addq	$  8 * SIZE, B
	subq	$-16 * SIZE, BO
	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movddup	 -16 * SIZE(B), %xmm8
	movddup	 -15 * SIZE(B), %xmm9

	movapd	%xmm8,  0 * SIZE(BO)
	movapd	%xmm9,  2 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       subq	LDC, C
#endif

	movq	C, CO1
#ifndef RT
	addq	LDC, C
#endif

	testq	$1, M
	jle	.L130
	ALIGN_4

.L140:
#ifdef LN
       movq	K, %rax
       salq	$0 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#endif

	leaq	16 * SIZE + BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm8, %xmm8
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	pxor	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L142

.L141:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO)

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1
	movapd	-16 * SIZE(BO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	movapd	-12 * SIZE(BO), %xmm4
	movapd	-10 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm1, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	movapd	-12 * SIZE(AO), %xmm0
	movapd	-10 * SIZE(AO), %xmm1
	movapd	 -8 * SIZE(BO), %xmm2
	movapd	 -6 * SIZE(BO), %xmm3
	movapd	 -4 * SIZE(BO), %xmm4
	movapd	 -2 * SIZE(BO), %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm1, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9
	ADD1	%xmm4, %xmm10
	ADD2	%xmm5, %xmm11

	subq	$ -8 * SIZE, AO
	subq	$-16 * SIZE, BO
	subq	$1, %rax
	jne    .L141

.L142:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV, %xmm7

	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L144

.L143:
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3

	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3

	ADD1	%xmm2, %xmm8
	ADD2	%xmm3, %xmm9

	addq	$2 * SIZE, AO
	addq	$4 * SIZE, BO
	subq	$1, %rax
	jg	.L143
	ALIGN_4

.L144:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	16 * SIZE + BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm7, %xmm9
#else
	xorpd	%xmm7, %xmm8
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
#else
	addpd	%xmm9, %xmm8
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(B), %xmm9

	subpd	%xmm8,  %xmm9
#else
	movapd	-16 * SIZE(AO), %xmm9

	subpd	%xmm8,  %xmm9
#endif

#ifndef CONJ
	SHUFPD_1 %xmm7, %xmm7
#endif

#ifdef LN
	movddup	-16 * SIZE(AO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8
	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8

	addpd	 %xmm8, %xmm9
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8

	addpd	 %xmm8, %xmm9
#endif

#ifdef RN
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8

	addpd	 %xmm8, %xmm9
#endif

#ifdef RT
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8

	addpd	 %xmm8, %xmm9
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
#endif

	movsd	%xmm9,   0 * SIZE(CO1)
	movhpd	%xmm9,   1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm9, -16 * SIZE(B)

	movddup	%xmm9,  %xmm8
	unpckhpd %xmm9,  %xmm9

	movapd	%xmm8,  -16 * SIZE(BO)
	movapd	%xmm9,  -14 * SIZE(BO)
#else
	movapd	%xmm9, -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#ifdef LT
	addq	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L130:
	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L199
	ALIGN_4

.L110:
#ifdef LN
       movq	K, %rax
       salq	$1 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#endif

	leaq	16 * SIZE + BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm8, %xmm8
	pxor	%xmm9, %xmm9
	pxor	%xmm12, %xmm12
	pxor	%xmm13, %xmm13
	prefetcht0    -3 * SIZE(CO1)

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L112

.L111:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO)

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1

	movapd	-16 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	-14 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	-12 * SIZE(AO), %xmm0
	movapd	-10 * SIZE(AO), %xmm1

	movapd	-12 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	-10 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	 -8 * SIZE(AO), %xmm0
	movapd	 -6 * SIZE(AO), %xmm1

	movapd	 -8 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	 -6 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	 -4 * SIZE(AO), %xmm0
	movapd	 -2 * SIZE(AO), %xmm1

	movapd	 -4 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	 -2 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	subq	$-16 * SIZE, AO
	subq	$-16 * SIZE, BO
	subq	$1, %rax
	jne    .L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV,  %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L114

.L113:
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1

	movapd	-16 * SIZE(BO), %xmm2
	movapd	 %xmm2, %xmm3
	movapd	-14 * SIZE(BO), %xmm4
	movapd	 %xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO
	subq	$1, %rax
	jg	.L113
	ALIGN_4

.L114:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	16 * SIZE + BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm13, %xmm13

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm7, %xmm9
	xorpd	%xmm7, %xmm13
#else
	xorpd	%xmm7, %xmm8
	xorpd	%xmm7, %xmm12
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(B), %xmm9
	movapd	-14 * SIZE(B), %xmm13

	subpd	%xmm8,  %xmm9
	subpd	%xmm12,  %xmm13
#else
	movapd	-16 * SIZE(AO), %xmm9
	movapd	-14 * SIZE(AO), %xmm13

	subpd	%xmm8,  %xmm9
	subpd	%xmm12,  %xmm13
#endif

#ifndef CONJ
	SHUFPD_1 %xmm7, %xmm7
#endif

#ifdef LN
	movddup	-10 * SIZE(AO), %xmm0
	movddup	 -9 * SIZE(AO), %xmm1
	movddup	-12 * SIZE(AO), %xmm2
	movddup	-11 * SIZE(AO), %xmm3
	movddup	-16 * SIZE(AO), %xmm4
	movddup	-15 * SIZE(AO), %xmm5

	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm0, %xmm13
	mulpd	 %xmm1, %xmm12

	addpd	 %xmm12, %xmm13

	movapd	 %xmm13, %xmm8
	pshufd	 $0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm3, %xmm12

	subpd	 %xmm8, %xmm9
	subpd	 %xmm12, %xmm9

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm4, %xmm9
	mulpd	 %xmm5, %xmm8

	addpd	 %xmm8, %xmm9
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1
	movddup	-14 * SIZE(AO), %xmm2
	movddup	-13 * SIZE(AO), %xmm3
	movddup	-10 * SIZE(AO), %xmm4
	movddup	 -9 * SIZE(AO), %xmm5

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	 %xmm7, %xmm8

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8

	addpd	 %xmm8, %xmm9

	movapd	 %xmm9, %xmm8
	pshufd	 $0x4e, %xmm9, %xmm12

	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm2, %xmm8
	mulpd	 %xmm3, %xmm12

	subpd	 %xmm8, %xmm13
	subpd	 %xmm12, %xmm13

	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm4, %xmm13
	mulpd	 %xmm5, %xmm12

	addpd	 %xmm12, %xmm13
#endif

#ifdef RN
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8
	mulpd	 %xmm0, %xmm13
	mulpd	 %xmm1, %xmm12

	addpd	 %xmm8, %xmm9
	addpd	 %xmm12, %xmm13
#endif

#ifdef RT
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	 %xmm7, %xmm8
	xorpd	 %xmm7, %xmm12

	mulpd	 %xmm0, %xmm9
	mulpd	 %xmm1, %xmm8
	mulpd	 %xmm0, %xmm13
	mulpd	 %xmm1, %xmm12

	addpd	 %xmm8, %xmm9
	addpd	 %xmm12, %xmm13
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
#endif

	movsd	%xmm9,   0 * SIZE(CO1)
	movhpd	%xmm9,   1 * SIZE(CO1)
	movsd	%xmm13,  2 * SIZE(CO1)
	movhpd	%xmm13,  3 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm9,  -16 * SIZE(B)
	movapd	%xmm13, -14 * SIZE(B)

	movddup	%xmm9,  %xmm8
	unpckhpd %xmm9,  %xmm9
	movddup	%xmm13, %xmm12
	unpckhpd %xmm13, %xmm13

	movapd	%xmm8,  -16 * SIZE(BO)
	movapd	%xmm9,  -14 * SIZE(BO)
	movapd	%xmm12, -12 * SIZE(BO)
	movapd	%xmm13, -10 * SIZE(BO)
#else
	movapd	%xmm9,  -16 * SIZE(AO)
	movapd	%xmm13, -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#ifdef LT
	addq	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$2, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L110
	ALIGN_4

.L199:
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B,  %rax, 1 * COMPSIZE), B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

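/* Common exit: restore the caller's stack pointer and callee-saved */
/* registers (plus rdi/rsi and xmm6-xmm15 under WINDOWS_ABI).       */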
.L999:
	movq	%r15, %rsp

	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE