1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Double-precision TRSM kernel, x86-64 AT&T syntax, 2 (M) x 8 (N)
   register blocking.  Variant (LN/LT/RN/RT) is selected at compile
   time by the build system; SIZE / BASE_SHIFT come from common.h.

   Incoming System V argument registers (consumed early, then the
   GPRs below are reused for the inner loops). */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define OLD_K	%rdx

/* Problem dimensions, kept in callee-saved registers for the whole call. */
#define M	%r13
#define N	%r14
#define K	%r15

/* Matrix base pointers and leading dimension of C (LDC is rescaled to
   bytes right after entry). */
#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

/* Working registers of the panel loops. */
#define I	%r11			/* row-block counter            */
#define AO	%rdi			/* current packed-A pointer     */
#define BO	%rsi			/* current packed-B pointer     */
#define	CO1	%rbx			/* C, columns 0..3 of the panel */
#define CO2	%rbp			/* C, columns 4..7 of the panel */
#define KK	%rdx			/* TRSM diagonal offset counter */
#define BB	%r12			/* prefetch cursor into B       */

#ifndef WINDOWS_ABI

#define STACKSIZE 128

/* Stack-passed arguments (7th and 8th C arguments). */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

/* Local spill slots inside our own frame. */
#define OFFSET	   48(%rsp)
#define J	   56(%rsp)
#define KKK	   64(%rsp)
#define AORIG	   72(%rsp)

#else

#define STACKSIZE 512

/* Microsoft x64: arguments 5.. live on the stack above the 32-byte
   shadow space; xmm6-xmm15 must also be preserved (saved in prologue). */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#define OFFSET	  224(%rsp)
#define J	  232(%rsp)
#define KKK	  240(%rsp)
#define AORIG	  248(%rsp)

#endif

/* Prefetch distance into the A stream, in elements. */
#define PREFETCHSIZE  (8 * 1 - 4)
#define PREFETCH     prefetcht0
94
	PROLOGUE
	PROFCODE

	/* Build a fixed-size frame and save every callee-saved GPR we use. */
	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Windows x64: rdi/rsi and xmm6-xmm15 are callee-saved as well. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Re-load the arguments from their Microsoft-ABI locations into
	   the System V registers the rest of the kernel expects. */
	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      OLD_K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
#endif

	/* Bias A and B by +16 elements so the hot loops can address with
	   small negative displacements starting at -16 * SIZE. */
	subq	$-16 * SIZE, A
	subq	$-16 * SIZE, B

	movq	OLD_M, M
	movq	OLD_N, N
	movq	OLD_K, K

	movq	OLD_LDC,   LDC
	movq	OLD_OFFSET, KK


	leaq	(, LDC, SIZE), LDC	/* LDC is in bytes from here on */

	movq	KK, OFFSET
	negq	KK

#ifdef LN
	/* LN solves from the bottom row: point C past column-height M and
	   A past its last 1-row slab (A += M*K elements). */
       leaq	(, M, SIZE), %rax
       addq	%rax, C
       imulq	K, %rax
       addq	%rax, A
#endif

#ifdef RT
	/* RT solves from the last column: point B and C past column N. */
       leaq	(, N, SIZE), %rax
       imulq	K, %rax
       addq	%rax, B
       movq	N, %rax
       imulq	LDC, %rax
       addq	%rax, C
#endif

#ifdef RT
	/* RT starts with KK = N - OFFSET and counts down by panel width. */
       movq	N, %rax
       subq	OFFSET, %rax
       movq	%rax, KK
#endif

	/* J = number of full 8-column panels of B. */
	movq	N,  J
	sarq	$3, J
	NOBRANCH
	jle	.L30
	ALIGN_4
172
.L01:
	/* Head of the outer loop: one 8-column panel of B per iteration. */
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
	/* RT walks backwards: step B and C back by one 8-column panel. */
       movq	K, %rax
       salq	$3 + BASE_SHIFT, %rax
       subq	%rax, B

       leaq	(, LDC, 8), %rax
       subq	%rax, C
#endif

	movq	C, CO1
	leaq	(C, LDC, 4), CO2	/* CO2 = columns 4..7 of the panel */
#ifndef RT
	leaq	(C, LDC, 8), C
#endif

#ifdef LN
	/* LN: KK restarts at OFFSET + M for every panel. */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* BB = start of the NEXT B panel (K * 8 elements ahead); used
	   only to prefetch it while this panel is being consumed. */
	movq	K, %rax
	salq	$BASE_SHIFT + 3, %rax
	leaq	(B, %rax), BB

	/* I = number of full 2-row blocks of M. */
	movq	M,  I
	sarq	$1, I
	NOBRANCH
	jle	.L20
	ALIGN_4
215
.L11:
	/* One 2-row block of A against the 8-column B panel. */
#ifdef LN
	/* LN: step AORIG back one 2-row slab (2 * K elements). */
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* Skip the first KK iterations of the K loop in both streams. */
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 8), BO
#else
	movq	B, BO
#endif

	prefetcht0	 -16 * SIZE(BB)
	subq		 $-8 * SIZE, BB

	/* Clear the four staging registers and the eight accumulators,
	   interleaved with prefetches of the 8 destination C columns. */
	xorps	%xmm1, %xmm1
	movapd	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3
	xorps	%xmm4, %xmm4

	leaq	(LDC, LDC, 2), %rax	/* rax = 3 * LDC */

	xorps	%xmm8,  %xmm8
	prefetcht0     1 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	prefetcht0     2 * SIZE(CO1, LDC,  1)
	xorps	%xmm10, %xmm10
	prefetcht0     1 * SIZE(CO1, LDC,  2)
	xorps	%xmm11, %xmm11
	prefetcht0     2 * SIZE(CO1, %rax, 1)

	xorps	%xmm12, %xmm12
	prefetcht0     1 * SIZE(CO2)
	xorps	%xmm13, %xmm13
	prefetcht0     2 * SIZE(CO2, LDC,  1)
	xorps	%xmm14, %xmm14
	prefetcht0     1 * SIZE(CO2, LDC,  2)
	xorps	%xmm15, %xmm15
	prefetcht0     2 * SIZE(CO2, %rax, 1)

	/* Trip count: LT/RN run the first KK iterations, LN/RT the
	   remaining K - KK; main loop below is unrolled by 4. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L15
	ALIGN_3
272
.L12:
	/* Main GEMM accumulation loop, 4 k-iterations unrolled.
	   xmm0/xmm5 each hold a pair of A values; for every 2-wide B
	   vector, pshufd $0x4e makes a swapped copy so both cross
	   products of the 2x2 outer product are formed.  The pipeline is
	   software-skewed: each addpd retires the product computed in
	   the previous round, and .L18 flushes the final pending ones. */
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	addpd	%xmm1, %xmm12
	movaps	-16 * SIZE(BO), %xmm6
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm6, %xmm2	/* swap the two doubles */
	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm14
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm6, %xmm8
	movaps	-12 * SIZE(BO), %xmm6
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm6, %xmm2
	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm6, %xmm12
	movaps	 -8 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	movaps	-14 * SIZE(AO), %xmm5	/* next A pair */
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm14
	movaps	 -6 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	addpd	%xmm1, %xmm8
	movaps	 -4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm10
	movaps	 -2 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	movaps	-12 * SIZE(AO), %xmm0	/* next A pair */
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	addpd	%xmm1, %xmm12
	movaps	  0 * SIZE(BO), %xmm6
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm6, %xmm2
	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm14
	movaps	  2 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm6, %xmm8
	movaps	  4 * SIZE(BO), %xmm6
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm6, %xmm2
	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	  6 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm6, %xmm12
	movaps	  8 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	movaps	-10 * SIZE(AO), %xmm5	/* next A pair */
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm14
	movaps	 10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	addpd	%xmm1, %xmm8
	movaps	 12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm10
	movaps	 14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	movaps	 -8 * SIZE(AO), %xmm0	/* first A pair of next round */
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	/* Advance: 4 k-steps consumed 8 A and 32 B elements. */
	addq	$32 * SIZE, BO
	subq	$-8 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L12
	ALIGN_3
398
.L15:
	/* K-loop remainder (0..3 iterations). */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder: trip count & 3
	BRANCH
	je	.L18
	ALIGN_3

.L16:
	/* One k-iteration: one A pair against all 8 B values; keeps the
	   same one-round skew as the main loop. */
	addpd	%xmm1, %xmm12
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm14
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L16
	ALIGN_4
449
.L18:
#if defined(LN) || defined(RT)
	/* Rewind AO/BO to the diagonal block this solve step needs
	   (KK - 2 rows for LN, KK - 8 columns for RT). */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$8, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 8), BO
#endif

	/* Flush the products still pending from the skewed pipeline. */
	addpd	%xmm1, %xmm12
	addpd	%xmm2, %xmm13
	addpd	%xmm3, %xmm14
	addpd	%xmm4, %xmm15

#if defined(LN) || defined(LT)
	/* Left-side solve: un-swizzle each accumulator pair into
	   (row0|row0) / (row1|row1) order, then load the packed-B block
	   that holds the right-hand sides. */
	movaps	%xmm8, %xmm0
	shufpd	$0, %xmm9, %xmm8
	shufpd	$3, %xmm0, %xmm9

	movaps	%xmm10, %xmm0
	shufpd	$0, %xmm11, %xmm10
	shufpd	$3, %xmm0, %xmm11

	movaps	%xmm12, %xmm0
	shufpd	$0, %xmm13, %xmm12
	shufpd	$3, %xmm0, %xmm13

	movaps	%xmm14, %xmm0
	shufpd	$0, %xmm15, %xmm14
	shufpd	$3, %xmm0, %xmm15

	movapd	-16 * SIZE(BO), %xmm0
	movapd	-14 * SIZE(BO), %xmm2
	movapd	-12 * SIZE(BO), %xmm4
	movapd	-10 * SIZE(BO), %xmm6
	movapd	 -8 * SIZE(BO), %xmm1
	movapd	 -6 * SIZE(BO), %xmm3
	movapd	 -4 * SIZE(BO), %xmm5
	movapd	 -2 * SIZE(BO), %xmm7
#else
	/* Right-side solve: swap halves into column order, then load the
	   packed-A block that holds the right-hand sides. */
	movaps	%xmm8, %xmm0
	shufpd	$2, %xmm9, %xmm8
	shufpd	$2, %xmm0, %xmm9

	movaps	%xmm10, %xmm0
	shufpd	$2, %xmm11, %xmm10
	shufpd	$2, %xmm0, %xmm11

	movaps	%xmm12, %xmm0
	shufpd	$2, %xmm13, %xmm12
	shufpd	$2, %xmm0, %xmm13

	movaps	%xmm14, %xmm0
	shufpd	$2, %xmm15, %xmm14
	shufpd	$2, %xmm0, %xmm15

	movapd	 -16 * SIZE(AO), %xmm0
	movapd	 -14 * SIZE(AO), %xmm1
	movapd	 -12 * SIZE(AO), %xmm2
	movapd	 -10 * SIZE(AO), %xmm3

	movapd	  -8 * SIZE(AO), %xmm4
	movapd	  -6 * SIZE(AO), %xmm5
	movapd	  -4 * SIZE(AO), %xmm6
	movapd	  -2 * SIZE(AO), %xmm7
#endif

	/* Residual = packed right-hand side minus the accumulated product. */
	subpd	%xmm8,  %xmm0
	subpd	%xmm9,  %xmm1
	subpd	%xmm10, %xmm2
	subpd	%xmm11, %xmm3
	subpd	%xmm12, %xmm4
	subpd	%xmm13, %xmm5
	subpd	%xmm14, %xmm6
	subpd	%xmm15, %xmm7
532
#ifdef LN
	/* LN: backward substitution with the 2x2 triangular diagonal
	   block of A.  Diagonal entries appear pre-inverted, so division
	   is done with mulpd (offsets -16 and -13 are diagonal, -14 the
	   off-diagonal — NOTE(review): assumed from the access pattern,
	   confirm against the packing routine). */
	movddup	-13 * SIZE(AO), %xmm8	/* 1 / a11 */
	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm8, %xmm3
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm8, %xmm7

	movddup	-14 * SIZE(AO), %xmm12	/* off-diagonal element */
	movapd	%xmm12, %xmm13
	movapd	%xmm12, %xmm14
	movapd	%xmm12, %xmm15

	mulpd	%xmm1, %xmm12
	mulpd	%xmm3, %xmm13
	mulpd	%xmm5, %xmm14
	mulpd	%xmm7, %xmm15

	subpd	%xmm12, %xmm0
	subpd	%xmm13, %xmm2
	subpd	%xmm14, %xmm4
	subpd	%xmm15, %xmm6

	movddup	-16 * SIZE(AO), %xmm8	/* 1 / a00 */
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm4
	mulpd	 %xmm8, %xmm6
#endif

#ifdef LT
	/* LT: forward substitution — same 2x2 block, opposite order. */
	movddup	-16 * SIZE(AO), %xmm8	/* 1 / a00 */
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm4
	mulpd	 %xmm8, %xmm6

	movddup	-15 * SIZE(AO), %xmm12	/* off-diagonal element */
	movapd	%xmm12, %xmm13
	movapd	%xmm12, %xmm14
	movapd	%xmm12, %xmm15

	mulpd	%xmm0, %xmm12
	mulpd	%xmm2, %xmm13
	mulpd	%xmm4, %xmm14
	mulpd	%xmm6, %xmm15

	subpd	%xmm12, %xmm1
	subpd	%xmm13, %xmm3
	subpd	%xmm14, %xmm5
	subpd	%xmm15, %xmm7

	movddup	-13 * SIZE(AO), %xmm8	/* 1 / a11 */
	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm8, %xmm3
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm8, %xmm7
#endif
590
#ifdef RN
	/* RN: forward substitution over the 8 columns of the triangular
	   8x8 diagonal block of B.  The diagonal reciprocals sit at
	   offsets -16, -7, 2, 11, 20, 29, 38, 47 (stride 9 = row+1);
	   xmm0..xmm7 each hold one column's pair of values. */
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm0
	movddup	-15 * SIZE(BO), %xmm9
	mulpd	%xmm0,  %xmm9
	subpd	%xmm9,  %xmm1
	movddup	-14 * SIZE(BO), %xmm10
	mulpd	%xmm0,  %xmm10
	subpd	%xmm10, %xmm2
	movddup	-13 * SIZE(BO), %xmm11
	mulpd	%xmm0,  %xmm11
	subpd	%xmm11, %xmm3
	movddup	-12 * SIZE(BO), %xmm12
	mulpd	%xmm0,  %xmm12
	subpd	%xmm12, %xmm4
	movddup	-11 * SIZE(BO), %xmm13
	mulpd	%xmm0,  %xmm13
	subpd	%xmm13, %xmm5
	movddup	-10 * SIZE(BO), %xmm14
	mulpd	%xmm0,  %xmm14
	subpd	%xmm14, %xmm6
	movddup	 -9 * SIZE(BO), %xmm15
	mulpd	%xmm0,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	 -7 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm1
	movddup	 -6 * SIZE(BO), %xmm10
	mulpd	%xmm1,  %xmm10
	subpd	%xmm10, %xmm2
	movddup	 -5 * SIZE(BO), %xmm11
	mulpd	%xmm1,  %xmm11
	subpd	%xmm11, %xmm3
	movddup	 -4 * SIZE(BO), %xmm12
	mulpd	%xmm1,  %xmm12
	subpd	%xmm12, %xmm4
	movddup	 -3 * SIZE(BO), %xmm13
	mulpd	%xmm1,  %xmm13
	subpd	%xmm13, %xmm5
	movddup	 -2 * SIZE(BO), %xmm14
	mulpd	%xmm1,  %xmm14
	subpd	%xmm14, %xmm6
	movddup	 -1 * SIZE(BO), %xmm15
	mulpd	%xmm1,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	  2 * SIZE(BO), %xmm10
	mulpd	%xmm10, %xmm2
	movddup	  3 * SIZE(BO), %xmm11
	mulpd	%xmm2,  %xmm11
	subpd	%xmm11, %xmm3
	movddup	  4 * SIZE(BO), %xmm12
	mulpd	%xmm2,  %xmm12
	subpd	%xmm12, %xmm4
	movddup	  5 * SIZE(BO), %xmm13
	mulpd	%xmm2,  %xmm13
	subpd	%xmm13, %xmm5
	movddup	  6 * SIZE(BO), %xmm14
	mulpd	%xmm2,  %xmm14
	subpd	%xmm14, %xmm6
	movddup	  7 * SIZE(BO), %xmm15
	mulpd	%xmm2,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	 11 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm3
	movddup	 12 * SIZE(BO), %xmm12
	mulpd	%xmm3,  %xmm12
	subpd	%xmm12, %xmm4
	movddup	 13 * SIZE(BO), %xmm13
	mulpd	%xmm3,  %xmm13
	subpd	%xmm13, %xmm5
	movddup	 14 * SIZE(BO), %xmm14
	mulpd	%xmm3,  %xmm14
	subpd	%xmm14, %xmm6
	movddup	 15 * SIZE(BO), %xmm15
	mulpd	%xmm3,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	 20 * SIZE(BO), %xmm12
	mulpd	%xmm12, %xmm4
	movddup	 21 * SIZE(BO), %xmm13
	mulpd	%xmm4,  %xmm13
	subpd	%xmm13, %xmm5
	movddup	 22 * SIZE(BO), %xmm14
	mulpd	%xmm4,  %xmm14
	subpd	%xmm14, %xmm6
	movddup	 23 * SIZE(BO), %xmm15
	mulpd	%xmm4,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	 29 * SIZE(BO), %xmm13
	mulpd	%xmm13, %xmm5
	movddup	 30 * SIZE(BO), %xmm14
	mulpd	%xmm5,  %xmm14
	subpd	%xmm14, %xmm6
	movddup	 31 * SIZE(BO), %xmm15
	mulpd	%xmm5,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	 38 * SIZE(BO), %xmm14
	mulpd	%xmm14, %xmm6
	movddup	 39 * SIZE(BO), %xmm15
	mulpd	%xmm6,  %xmm15
	subpd	%xmm15, %xmm7

	movddup	 47 * SIZE(BO), %xmm15
	mulpd	%xmm15, %xmm7
#endif
700
#ifdef RT
	/* RT: backward substitution — mirror image of the RN case,
	   starting from column 7 (diagonal reciprocal at offset 47) and
	   eliminating towards column 0. */
	movddup	 47 * SIZE(BO), %xmm8
	mulpd	%xmm8,  %xmm7
	movddup	 46 * SIZE(BO), %xmm9
	mulpd	%xmm7,  %xmm9
	subpd	%xmm9,  %xmm6
	movddup	 45 * SIZE(BO), %xmm10
	mulpd	%xmm7,  %xmm10
	subpd	%xmm10, %xmm5
	movddup	 44 * SIZE(BO), %xmm11
	mulpd	%xmm7,  %xmm11
	subpd	%xmm11, %xmm4
	movddup	 43 * SIZE(BO), %xmm12
	mulpd	%xmm7,  %xmm12
	subpd	%xmm12, %xmm3
	movddup	 42 * SIZE(BO), %xmm13
	mulpd	%xmm7,  %xmm13
	subpd	%xmm13, %xmm2
	movddup	 41 * SIZE(BO), %xmm14
	mulpd	%xmm7,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	 40 * SIZE(BO), %xmm15
	mulpd	%xmm7,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	 38 * SIZE(BO), %xmm9
	mulpd	%xmm9,  %xmm6
	movddup	 37 * SIZE(BO), %xmm10
	mulpd	%xmm6,  %xmm10
	subpd	%xmm10, %xmm5
	movddup	 36 * SIZE(BO), %xmm11
	mulpd	%xmm6,  %xmm11
	subpd	%xmm11, %xmm4
	movddup	 35 * SIZE(BO), %xmm12
	mulpd	%xmm6,  %xmm12
	subpd	%xmm12, %xmm3
	movddup	 34 * SIZE(BO), %xmm13
	mulpd	%xmm6,  %xmm13
	subpd	%xmm13, %xmm2
	movddup	 33 * SIZE(BO), %xmm14
	mulpd	%xmm6,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	 32 * SIZE(BO), %xmm15
	mulpd	%xmm6,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	 29 * SIZE(BO), %xmm10
	mulpd	%xmm10, %xmm5
	movddup	 28 * SIZE(BO), %xmm11
	mulpd	%xmm5,  %xmm11
	subpd	%xmm11, %xmm4
	movddup	 27 * SIZE(BO), %xmm12
	mulpd	%xmm5,  %xmm12
	subpd	%xmm12, %xmm3
	movddup	 26 * SIZE(BO), %xmm13
	mulpd	%xmm5,  %xmm13
	subpd	%xmm13, %xmm2
	movddup	 25 * SIZE(BO), %xmm14
	mulpd	%xmm5,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	 24 * SIZE(BO), %xmm15
	mulpd	%xmm5,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	 20 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm4
	movddup	 19 * SIZE(BO), %xmm12
	mulpd	%xmm4,  %xmm12
	subpd	%xmm12, %xmm3
	movddup	 18 * SIZE(BO), %xmm13
	mulpd	%xmm4,  %xmm13
	subpd	%xmm13, %xmm2
	movddup	 17 * SIZE(BO), %xmm14
	mulpd	%xmm4,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	 16 * SIZE(BO), %xmm15
	mulpd	%xmm4,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	 11 * SIZE(BO), %xmm12
	mulpd	%xmm12, %xmm3
	movddup	 10 * SIZE(BO), %xmm13
	mulpd	%xmm3,  %xmm13
	subpd	%xmm13, %xmm2
	movddup	  9 * SIZE(BO), %xmm14
	mulpd	%xmm3,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	  8 * SIZE(BO), %xmm15
	mulpd	%xmm3,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	  2 * SIZE(BO), %xmm13
	mulpd	%xmm13, %xmm2
	movddup	  1 * SIZE(BO), %xmm14
	mulpd	%xmm2,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	  0 * SIZE(BO), %xmm15
	mulpd	%xmm2,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	 -7 * SIZE(BO), %xmm14
	mulpd	%xmm14, %xmm1
	movddup	 -8 * SIZE(BO), %xmm15
	mulpd	%xmm1,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	-16 * SIZE(BO), %xmm15
	mulpd	%xmm15, %xmm0
#endif
810
811
#ifdef LN
	/* LN walks C right-to-left: step back before storing. */
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

	/* Write the solved block back into the packed buffer so later
	   kernel invocations see the updated right-hand sides. */
#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)
	movapd	%xmm2,  -14 * SIZE(BO)
	movapd	%xmm4,  -12 * SIZE(BO)
	movapd	%xmm6,  -10 * SIZE(BO)
	movapd	%xmm1,   -8 * SIZE(BO)
	movapd	%xmm3,   -6 * SIZE(BO)
	movapd	%xmm5,   -4 * SIZE(BO)
	movapd	%xmm7,   -2 * SIZE(BO)
#else
	movapd	%xmm0,  -16 * SIZE(AO)
	movapd	%xmm1,  -14 * SIZE(AO)
	movapd	%xmm2,  -12 * SIZE(AO)
	movapd	%xmm3,  -10 * SIZE(AO)
	movapd	%xmm4,   -8 * SIZE(AO)
	movapd	%xmm5 ,  -6 * SIZE(AO)
	movapd	%xmm6,   -4 * SIZE(AO)
	movapd	%xmm7,   -2 * SIZE(AO)
#endif

	leaq	(LDC, LDC, 2), %rax	/* rax = 3 * LDC */

	/* Scatter the 2x8 result into the 8 C columns.  In the LN/LT
	   layout each register holds one row, so lo/hi halves go to
	   different columns; otherwise each register is one column. */
#if defined(LN) || defined(LT)
	movsd	%xmm0,  0 * SIZE(CO1)
	movsd	%xmm1,  1 * SIZE(CO1)
	movhps	%xmm0,  0 * SIZE(CO1, LDC,  1)
	movhps	%xmm1,  1 * SIZE(CO1, LDC,  1)

	movsd	%xmm2,  0 * SIZE(CO1, LDC,  2)
	movsd	%xmm3,  1 * SIZE(CO1, LDC,  2)
	movhps	%xmm2,  0 * SIZE(CO1, %rax, 1)
	movhps	%xmm3,  1 * SIZE(CO1, %rax, 1)

	movsd	%xmm4,  0 * SIZE(CO2)
	movsd	%xmm5,  1 * SIZE(CO2)
	movhps	%xmm4,  0 * SIZE(CO2, LDC,  1)
	movhps	%xmm5,  1 * SIZE(CO2, LDC,  1)

	movsd	%xmm6,  0 * SIZE(CO2, LDC,  2)
	movsd	%xmm7,  1 * SIZE(CO2, LDC,  2)
	movhps	%xmm6,  0 * SIZE(CO2, %rax, 1)
	movhps	%xmm7,  1 * SIZE(CO2, %rax, 1)
#else
	movups	%xmm0,  0 * SIZE(CO1)
	movups	%xmm1,  0 * SIZE(CO1, LDC,  1)
	movups	%xmm2,  0 * SIZE(CO1, LDC,  2)
	movups	%xmm3,  0 * SIZE(CO1, %rax, 1)
	movups	%xmm4,  0 * SIZE(CO2)
	movups	%xmm5,  0 * SIZE(CO2, LDC,  1)
	movups	%xmm6,  0 * SIZE(CO2, LDC,  2)
	movups	%xmm7,  0 * SIZE(CO2, %rax, 1)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif


	/* Advance AO/BO past the untouched tail of the K loop. */
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* RT: restore AORIG for the next 2-row block. */
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif

	decq	I
	BRANCH
	jg	.L11
	ALIGN_4
902
.L20:
	/* Handle the leftover single row when M is odd. */
	testq	$1, M
	BRANCH
	jle	.L29
	ALIGN_4

#ifdef LN
	/* LN: step AORIG back one 1-row slab (K elements). */
       movq	K, %rax
       salq	$BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* Skip the first KK iterations of the K loop in both streams. */
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 8), BO
#else
	movq	B, BO
#endif

	/* 1x8 accumulation: broadcast one A value, accumulate in xmm8-11
	   (two columns per register). */
	movddup	-16 * SIZE(AO), %xmm0
	xorps	%xmm8,  %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9,  %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L25
	ALIGN_3
942
.L22:
	/* 1x8 GEMM loop, 4 k-iterations unrolled: each round multiplies
	   one broadcast A value by the 8-wide B row. */
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	 -8 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	 -6 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	 -4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	 -2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	  0 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	  6 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-13 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	  8 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	 10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	 12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	 14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	 16 * SIZE(BO), %xmm1

	/* 4 k-steps consumed 4 A and 32 B elements. */
	subq	$ -4 * SIZE, AO
	subq	$-32 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L22
	ALIGN_3
1007
.L25:
	/* K-loop remainder (0..3 iterations) for the 1-row case. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder: trip count & 3
	BRANCH
	je	.L28
	ALIGN_3

.L26:
	/* One k-iteration: one A value against the 8-wide B row. */
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	 -8 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$8 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L26
	ALIGN_4

.L28:
#if defined(LN) || defined(RT)
	/* Rewind AO/BO to the diagonal block for the solve phase. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$8, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 8), BO
#endif
1058
	/* Residual = packed right-hand side minus the accumulated product
	   (1 row x 8 columns, two columns per register). */
#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm0
	movapd	-14 * SIZE(BO), %xmm1
	movapd	-12 * SIZE(BO), %xmm2
	movapd	-10 * SIZE(BO), %xmm3
#else
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1
	movapd	-12 * SIZE(AO), %xmm2
	movapd	-10 * SIZE(AO), %xmm3
#endif

	subpd	%xmm8,  %xmm0
	subpd	%xmm9,  %xmm1
	subpd	%xmm10, %xmm2
	subpd	%xmm11, %xmm3

#if defined(LN) || defined(LT)
	/* 1x1 diagonal block of A: just scale by its reciprocal. */
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm3
#endif

#if defined(RN) || defined(RT)
	/* Right-side solve works on scalars: spread the 8 packed values
	   across xmm0..xmm7 (one column value per register). */
	pshufd	$0xe, %xmm3, %xmm7
	movaps	%xmm3, %xmm6
	pshufd	$0xe, %xmm2, %xmm5
	movaps	%xmm2, %xmm4
	pshufd	$0xe, %xmm1, %xmm3
	movaps	%xmm1, %xmm2
	pshufd	$0xe, %xmm0, %xmm1
#endif
1093
#ifdef RN
	/* RN, single row: scalar forward substitution over the 8 columns
	   of the triangular B block — same offsets as the vector version
	   above, using movsd/mulsd/subsd. */
	movsd	-16 * SIZE(BO), %xmm8
	mulsd	%xmm8, %xmm0
	movsd	-15 * SIZE(BO), %xmm9
	mulsd	%xmm0,  %xmm9
	subsd	%xmm9,  %xmm1
	movsd	-14 * SIZE(BO), %xmm10
	mulsd	%xmm0,  %xmm10
	subsd	%xmm10, %xmm2
	movsd	-13 * SIZE(BO), %xmm11
	mulsd	%xmm0,  %xmm11
	subsd	%xmm11, %xmm3
	movsd	-12 * SIZE(BO), %xmm12
	mulsd	%xmm0,  %xmm12
	subsd	%xmm12, %xmm4
	movsd	-11 * SIZE(BO), %xmm13
	mulsd	%xmm0,  %xmm13
	subsd	%xmm13, %xmm5
	movsd	-10 * SIZE(BO), %xmm14
	mulsd	%xmm0,  %xmm14
	subsd	%xmm14, %xmm6
	movsd	 -9 * SIZE(BO), %xmm15
	mulsd	%xmm0,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	 -7 * SIZE(BO), %xmm9
	mulsd	%xmm9, %xmm1
	movsd	 -6 * SIZE(BO), %xmm10
	mulsd	%xmm1,  %xmm10
	subsd	%xmm10, %xmm2
	movsd	 -5 * SIZE(BO), %xmm11
	mulsd	%xmm1,  %xmm11
	subsd	%xmm11, %xmm3
	movsd	 -4 * SIZE(BO), %xmm12
	mulsd	%xmm1,  %xmm12
	subsd	%xmm12, %xmm4
	movsd	 -3 * SIZE(BO), %xmm13
	mulsd	%xmm1,  %xmm13
	subsd	%xmm13, %xmm5
	movsd	 -2 * SIZE(BO), %xmm14
	mulsd	%xmm1,  %xmm14
	subsd	%xmm14, %xmm6
	movsd	 -1 * SIZE(BO), %xmm15
	mulsd	%xmm1,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	  2 * SIZE(BO), %xmm10
	mulsd	%xmm10, %xmm2
	movsd	  3 * SIZE(BO), %xmm11
	mulsd	%xmm2,  %xmm11
	subsd	%xmm11, %xmm3
	movsd	  4 * SIZE(BO), %xmm12
	mulsd	%xmm2,  %xmm12
	subsd	%xmm12, %xmm4
	movsd	  5 * SIZE(BO), %xmm13
	mulsd	%xmm2,  %xmm13
	subsd	%xmm13, %xmm5
	movsd	  6 * SIZE(BO), %xmm14
	mulsd	%xmm2,  %xmm14
	subsd	%xmm14, %xmm6
	movsd	  7 * SIZE(BO), %xmm15
	mulsd	%xmm2,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	 11 * SIZE(BO), %xmm11
	mulsd	%xmm11, %xmm3
	movsd	 12 * SIZE(BO), %xmm12
	mulsd	%xmm3,  %xmm12
	subsd	%xmm12, %xmm4
	movsd	 13 * SIZE(BO), %xmm13
	mulsd	%xmm3,  %xmm13
	subsd	%xmm13, %xmm5
	movsd	 14 * SIZE(BO), %xmm14
	mulsd	%xmm3,  %xmm14
	subsd	%xmm14, %xmm6
	movsd	 15 * SIZE(BO), %xmm15
	mulsd	%xmm3,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	 20 * SIZE(BO), %xmm12
	mulsd	%xmm12, %xmm4
	movsd	 21 * SIZE(BO), %xmm13
	mulsd	%xmm4,  %xmm13
	subsd	%xmm13, %xmm5
	movsd	 22 * SIZE(BO), %xmm14
	mulsd	%xmm4,  %xmm14
	subsd	%xmm14, %xmm6
	movsd	 23 * SIZE(BO), %xmm15
	mulsd	%xmm4,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	 29 * SIZE(BO), %xmm13
	mulsd	%xmm13, %xmm5
	movsd	 30 * SIZE(BO), %xmm14
	mulsd	%xmm5,  %xmm14
	subsd	%xmm14, %xmm6
	movsd	 31 * SIZE(BO), %xmm15
	mulsd	%xmm5,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	 38 * SIZE(BO), %xmm14
	mulsd	%xmm14, %xmm6
	movsd	 39 * SIZE(BO), %xmm15
	mulsd	%xmm6,  %xmm15
	subsd	%xmm15, %xmm7

	movsd	 47 * SIZE(BO), %xmm15
	mulsd	%xmm15, %xmm7
#endif
1203
#ifdef RT
	/* RT, single row: scalar backward substitution, starting at
	   column 7 (diagonal reciprocal at offset 47) and eliminating
	   towards column 0. */
	movsd	 47 * SIZE(BO), %xmm8
	mulsd	%xmm8,  %xmm7
	movsd	 46 * SIZE(BO), %xmm9
	mulsd	%xmm7,  %xmm9
	subsd	%xmm9,  %xmm6
	movsd	 45 * SIZE(BO), %xmm10
	mulsd	%xmm7,  %xmm10
	subsd	%xmm10, %xmm5
	movsd	 44 * SIZE(BO), %xmm11
	mulsd	%xmm7,  %xmm11
	subsd	%xmm11, %xmm4
	movsd	 43 * SIZE(BO), %xmm12
	mulsd	%xmm7,  %xmm12
	subsd	%xmm12, %xmm3
	movsd	 42 * SIZE(BO), %xmm13
	mulsd	%xmm7,  %xmm13
	subsd	%xmm13, %xmm2
	movsd	 41 * SIZE(BO), %xmm14
	mulsd	%xmm7,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	 40 * SIZE(BO), %xmm15
	mulsd	%xmm7,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	 38 * SIZE(BO), %xmm9
	mulsd	%xmm9,  %xmm6
	movsd	 37 * SIZE(BO), %xmm10
	mulsd	%xmm6,  %xmm10
	subsd	%xmm10, %xmm5
	movsd	 36 * SIZE(BO), %xmm11
	mulsd	%xmm6,  %xmm11
	subsd	%xmm11, %xmm4
	movsd	 35 * SIZE(BO), %xmm12
	mulsd	%xmm6,  %xmm12
	subsd	%xmm12, %xmm3
	movsd	 34 * SIZE(BO), %xmm13
	mulsd	%xmm6,  %xmm13
	subsd	%xmm13, %xmm2
	movsd	 33 * SIZE(BO), %xmm14
	mulsd	%xmm6,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	 32 * SIZE(BO), %xmm15
	mulsd	%xmm6,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	 29 * SIZE(BO), %xmm10
	mulsd	%xmm10, %xmm5
	movsd	 28 * SIZE(BO), %xmm11
	mulsd	%xmm5,  %xmm11
	subsd	%xmm11, %xmm4
	movsd	 27 * SIZE(BO), %xmm12
	mulsd	%xmm5,  %xmm12
	subsd	%xmm12, %xmm3
	movsd	 26 * SIZE(BO), %xmm13
	mulsd	%xmm5,  %xmm13
	subsd	%xmm13, %xmm2
	movsd	 25 * SIZE(BO), %xmm14
	mulsd	%xmm5,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	 24 * SIZE(BO), %xmm15
	mulsd	%xmm5,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	 20 * SIZE(BO), %xmm11
	mulsd	%xmm11, %xmm4
	movsd	 19 * SIZE(BO), %xmm12
	mulsd	%xmm4,  %xmm12
	subsd	%xmm12, %xmm3
	movsd	 18 * SIZE(BO), %xmm13
	mulsd	%xmm4,  %xmm13
	subsd	%xmm13, %xmm2
	movsd	 17 * SIZE(BO), %xmm14
	mulsd	%xmm4,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	 16 * SIZE(BO), %xmm15
	mulsd	%xmm4,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	 11 * SIZE(BO), %xmm12
	mulsd	%xmm12, %xmm3
	movsd	 10 * SIZE(BO), %xmm13
	mulsd	%xmm3,  %xmm13
	subsd	%xmm13, %xmm2
	movsd	  9 * SIZE(BO), %xmm14
	mulsd	%xmm3,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	  8 * SIZE(BO), %xmm15
	mulsd	%xmm3,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	  2 * SIZE(BO), %xmm13
	mulsd	%xmm13, %xmm2
	movsd	  1 * SIZE(BO), %xmm14
	mulsd	%xmm2,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	  0 * SIZE(BO), %xmm15
	mulsd	%xmm2,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	 -7 * SIZE(BO), %xmm14
	mulsd	%xmm14, %xmm1
	movsd	 -8 * SIZE(BO), %xmm15
	mulsd	%xmm1,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	-16 * SIZE(BO), %xmm15
	mulsd	%xmm15, %xmm0
#endif
1313
#if defined(RN) || defined(RT)
	/* Re-pack the 8 scalar results into pairs (two columns per
	   register) for the stores below. */
	unpcklpd   %xmm1, %xmm0
	movaps	   %xmm2, %xmm1
	unpcklpd   %xmm3, %xmm1
	movaps	   %xmm4, %xmm2
	unpcklpd   %xmm5, %xmm2
	movaps	   %xmm6, %xmm3
	unpcklpd   %xmm7, %xmm3
#endif

#ifdef LN
	/* LN walks C right-to-left: step back before storing. */
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

	leaq	(LDC, LDC, 2), %rax	/* rax = 3 * LDC */

	/* Scatter the single solved row into the 8 C columns. */
	movsd	%xmm0,  0 * SIZE(CO1)
	movhps	%xmm0,  0 * SIZE(CO1, LDC,  1)
	movsd	%xmm1,  0 * SIZE(CO1, LDC,  2)
	movhps	%xmm1,  0 * SIZE(CO1, %rax, 1)
	movsd	%xmm2,  0 * SIZE(CO2)
	movhps	%xmm2,  0 * SIZE(CO2, LDC,  1)
	movsd	%xmm3,  0 * SIZE(CO2, LDC,  2)
	movhps	%xmm3,  0 * SIZE(CO2, %rax, 1)

	/* Write the solved values back into the packed buffer. */
#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)
	movapd	%xmm1,  -14 * SIZE(BO)
	movapd	%xmm2,  -12 * SIZE(BO)
	movapd	%xmm3,  -10 * SIZE(BO)
#else
	movapd	%xmm0,  -16 * SIZE(AO)
	movapd	%xmm1,  -14 * SIZE(AO)
	movapd	%xmm2,  -12 * SIZE(AO)
	movapd	%xmm3,  -10 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif


	/* Advance AO/BO past the untouched tail of the K loop. */
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* RT: restore AORIG for the next row block. */
       movq	K, %rax
       salq	$BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4
1380
.L29:
/* End of one 8-column panel of the main J loop: resynchronize the B
   pointer with the consumed panel, step the KK offset in the direction
   required by the transposition variant, and iterate while J > 0. */
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 8), B		# B += K * 8 elements: skip past this panel
#endif
#if defined(LT) || defined(RN)
	movq	BO, B			# forward variants: BO already points past the panel
#endif

#ifdef RN
	addq	$8, KK			# 8 more columns of the right factor consumed
#endif

#ifdef RT
	subq	$8, KK			# right-transposed walks the panels backwards
#endif

	subq	$1, J
	BRANCH
	jg	.L01
	ALIGN_4
1402
.L30:
/* N & 4 tail: process one panel of 4 columns if N has bit 2 set.
   Sets up A/C base pointers (backwards for RT), the per-panel KK
   offset, and enters the M/2 row loop at .L31. */
	testq	$4, N
	jle	.L50
	ALIGN_4

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG		# backward variants rebase A per row block
#endif

#ifdef RT
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax	# rax = K * 4 * SIZE
       subq	%rax, B			# step B back one 4-column panel
       leaq	(, LDC, 4), %rax
       subq	%rax, C			# step C back 4 columns
#endif

	movq	C, CO1			# CO1 -> columns 0..1 of this panel
	leaq	(C, LDC, 2), CO2	# CO2 -> columns 2..3
#ifndef RT
	leaq	(C, LDC, 4), C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK		# LN starts from the bottom: KK = OFFSET + M
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M,  I			# I = M / 2 row-pair counter
	sarq	$1, I
	NOBRANCH
	jle	.L40
	ALIGN_4
1445
.L31:
/* 2x4 micro-kernel: accumulate a 2-row x 4-column block of A*B into
   xmm8..xmm11, then solve the small triangular system in place and
   store.  xmm1..xmm4 carry products software-pipelined one iteration
   ahead (they are folded in at .L38). */
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG		# step A back one 2-row block
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO	# AO -> A block offset by KK rows
	leaq	(B,  %rax, 4), BO	# BO -> B panel offset by KK columns
#else
	movq	B, BO
#endif

	xorps	%xmm1, %xmm1		# clear pipelined product registers
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3
	xorps	%xmm4, %xmm4

	xorps	%xmm8,  %xmm8		# clear accumulators, prefetch the 4 C columns
	prefetcht0     2 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	prefetcht0     2 * SIZE(CO1, LDC,  1)
	xorps	%xmm10, %xmm10
	prefetcht0     2 * SIZE(CO2)
	xorps	%xmm11, %xmm11
	prefetcht0     2 * SIZE(CO2, LDC,  1)

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax		# trip count = K - KK for backward variants
#endif
	sarq	$2, %rax		# unrolled by 4 iterations of k
	NOBRANCH
	jle	.L35
	ALIGN_3

.L32:
/* One unrolled k-step: pshufd $0x4e swaps the two B lanes so each
   load of B covers both the direct and the swapped column pairing. */
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-14 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-12 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	 -8 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	 -6 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-10 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	 -4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	 -2 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	 -8 * SIZE(AO), %xmm0

	subq	$-8  * SIZE, AO		# advance 4 k-steps: A by 8, B by 16 doubles
	subq	$-16 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L32
	ALIGN_3

.L35:
/* Remainder loop: handle k mod 4 single steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L38
	ALIGN_3

.L36:
	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$4 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L36
	ALIGN_4

.L38:
/* Flush the pipelined products, point AO/BO at the packed sub-blocks
   involved in the solve, and subtract the accumulated update from the
   stored right-hand side (held in BO for L-variants, AO for R-variants). */
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax		# back up over the 2 rows of this block
#else
	subq	$4, %rax		# back up over the 4 columns of this panel
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 4), BO
#endif

	addpd	%xmm1, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm3, %xmm10
	addpd	%xmm4, %xmm11

#if defined(LN) || defined(LT)
	/* Undo the lane swap so xmm8..xmm11 are ordered by row. */
	movaps	%xmm8, %xmm0
	shufpd	$0, %xmm9, %xmm8
	shufpd	$3, %xmm0, %xmm9

	movaps	%xmm10, %xmm0
	shufpd	$0, %xmm11, %xmm10
	shufpd	$3, %xmm0, %xmm11

	movapd	-16 * SIZE(BO), %xmm0
	movapd	-14 * SIZE(BO), %xmm2
	movapd	-12 * SIZE(BO), %xmm1
	movapd	-10 * SIZE(BO), %xmm3
#else
	movaps	%xmm8, %xmm0
	shufpd	$2, %xmm9, %xmm8
	shufpd	$2, %xmm0, %xmm9

	movaps	%xmm10, %xmm0
	shufpd	$2, %xmm11, %xmm10
	shufpd	$2, %xmm0, %xmm11

	movapd	 -16 * SIZE(AO), %xmm0
	movapd	 -14 * SIZE(AO), %xmm1
	movapd	 -12 * SIZE(AO), %xmm2
	movapd	 -10 * SIZE(AO), %xmm3
#endif

	subpd	%xmm8,  %xmm0		# rhs -= accumulated A*B update
	subpd	%xmm9,  %xmm1
	subpd	%xmm10, %xmm2
	subpd	%xmm11, %xmm3

/* 2x2 triangular solve with the A factor.  Diagonal entries appear to
   be stored pre-inverted by the packing step (multiply, never divide)
   — TODO confirm against the corresponding trsm packing routine. */
#ifdef LN
	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1		# x1 *= 1/a11
	mulpd	 %xmm8, %xmm3

	movddup	-14 * SIZE(AO), %xmm12
	movapd	%xmm12, %xmm13

	mulpd	%xmm1, %xmm12
	mulpd	%xmm3, %xmm13

	subpd	%xmm12, %xmm0		# x0 -= a10 * x1
	subpd	%xmm13, %xmm2

	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm0		# x0 *= 1/a00
	mulpd	 %xmm8, %xmm2
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm2

	movddup	-15 * SIZE(AO), %xmm12
	movapd	%xmm12, %xmm13

	mulpd	%xmm0, %xmm12
	mulpd	%xmm2, %xmm13

	subpd	%xmm12, %xmm1
	subpd	%xmm13, %xmm3

	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm8, %xmm3
#endif

/* 4x4 triangular solve with the B factor (forward for RN, backward
   for RT), one column eliminated per group. */
#ifdef RN
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm0
	movddup	-15 * SIZE(BO), %xmm9
	mulpd	%xmm0,  %xmm9
	subpd	%xmm9,  %xmm1
	movddup	-14 * SIZE(BO), %xmm10
	mulpd	%xmm0,  %xmm10
	subpd	%xmm10, %xmm2
	movddup	-13 * SIZE(BO), %xmm11
	mulpd	%xmm0,  %xmm11
	subpd	%xmm11, %xmm3

	movddup	-11 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm1
	movddup	-10 * SIZE(BO), %xmm10
	mulpd	%xmm1,  %xmm10
	subpd	%xmm10, %xmm2
	movddup	 -9 * SIZE(BO), %xmm11
	mulpd	%xmm1,  %xmm11
	subpd	%xmm11, %xmm3

	movddup	 -6 * SIZE(BO), %xmm10
	mulpd	%xmm10, %xmm2
	movddup	 -5 * SIZE(BO), %xmm11
	mulpd	%xmm2,  %xmm11
	subpd	%xmm11, %xmm3

	movddup	 -1 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm3
#endif

#ifdef RT
	movddup	 -1 * SIZE(BO), %xmm12
	mulpd	%xmm12, %xmm3
	movddup	 -2 * SIZE(BO), %xmm13
	mulpd	%xmm3,  %xmm13
	subpd	%xmm13, %xmm2
	movddup	 -3 * SIZE(BO), %xmm14
	mulpd	%xmm3,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	 -4 * SIZE(BO), %xmm15
	mulpd	%xmm3,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	 -6 * SIZE(BO), %xmm13
	mulpd	%xmm13, %xmm2
	movddup	 -7 * SIZE(BO), %xmm14
	mulpd	%xmm2,  %xmm14
	subpd	%xmm14, %xmm1
	movddup	 -8 * SIZE(BO), %xmm15
	mulpd	%xmm2,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	-11 * SIZE(BO), %xmm14
	mulpd	%xmm14, %xmm1
	movddup	-12 * SIZE(BO), %xmm15
	mulpd	%xmm1,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	-16 * SIZE(BO), %xmm15
	mulpd	%xmm15, %xmm0
#endif


#ifdef LN
	subq	$2 * SIZE, CO1		# LN writes the block it just backed over
	subq	$2 * SIZE, CO2
#endif

	leaq	(LDC, LDC, 2), %rax

/* Store the solved 2x4 block into C; lane layout differs between the
   L-variants (row-major pairs) and R-variants (column-major pairs). */
#if defined(LN) || defined(LT)
	movsd	%xmm0,  0 * SIZE(CO1)
	movsd	%xmm1,  1 * SIZE(CO1)
	movhps	%xmm0,  0 * SIZE(CO1, LDC,  1)
	movhps	%xmm1,  1 * SIZE(CO1, LDC,  1)

	movsd	%xmm2,  0 * SIZE(CO2)
	movsd	%xmm3,  1 * SIZE(CO2)
	movhps	%xmm2,  0 * SIZE(CO2, LDC,  1)
	movhps	%xmm3,  1 * SIZE(CO2, LDC,  1)
#else
	movsd	%xmm0,  0 * SIZE(CO1)
	movhps	%xmm0,  1 * SIZE(CO1)
	movsd	%xmm1,  0 * SIZE(CO1, LDC,  1)
	movhps	%xmm1,  1 * SIZE(CO1, LDC,  1)

	movsd	%xmm2,  0 * SIZE(CO2)
	movhps	%xmm2,  1 * SIZE(CO2)
	movsd	%xmm3,  0 * SIZE(CO2, LDC,  1)
	movhps	%xmm3,  1 * SIZE(CO2, LDC,  1)
#endif

/* Write the solution back into the packed buffer so later blocks of
   the solve see updated values. */
#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)
	movapd	%xmm2,  -14 * SIZE(BO)
	movapd	%xmm1,  -12 * SIZE(BO)
	movapd	%xmm3,  -10 * SIZE(BO)
#else
	movapd	%xmm0,  -16 * SIZE(AO)
	movapd	%xmm1,  -14 * SIZE(AO)
	movapd	%xmm2,  -12 * SIZE(AO)
	movapd	%xmm3,  -10 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif


#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO	# skip the untouched tail of the packed block
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif

	decq	I
	BRANCH
	jg	.L31
	ALIGN_4
1831
.L40:
/* 1x4 micro-kernel for the leftover row when M is odd: same structure
   as .L31 but one row, so products accumulate in xmm8..xmm11 and are
   pairwise-reduced afterwards. */
	testq	$1, M
	BRANCH
	jle	.L49
	ALIGN_4

#ifdef LN
       movq	K, %rax
       salq	$BASE_SHIFT, %rax
       subq	%rax, AORIG		# step A back one row
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 4), BO
#else
	movq	B, BO
#endif

	movddup	-16 * SIZE(AO), %xmm0	# broadcast the single A element
	xorps	%xmm8,  %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9,  %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		# unrolled by 4 k-steps
	NOBRANCH
	jle	.L45
	ALIGN_3

.L42:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8		# columns 0..1
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9		# columns 2..3
	movaps	-12 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	 -8 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	 -6 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-13 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	 -4 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	 -2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	  0 * SIZE(BO), %xmm1

	subq	$ -4 * SIZE, AO
	subq	$-16 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L42
	ALIGN_3

.L45:
/* Remainder: k mod 4 single steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L48
	ALIGN_3

.L46:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$4 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L46
	ALIGN_4

.L48:
/* Reduce the two accumulator pairs, subtract from the stored rhs, then
   solve a 1-row system against the 4x4 B factor (scalar for R variants). */
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$4, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 4), BO
#endif

	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm0
	movapd	-14 * SIZE(BO), %xmm1
#else
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1
#endif

	subpd	%xmm8,  %xmm0
	subpd	%xmm9,  %xmm1

#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AO), %xmm8	# single (pre-inverted) diagonal element
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm1
#endif

#if defined(RN) || defined(RT)
	/* Scatter the 4 packed values into scalars x0..x3 for the solve. */
	pshufd	$0xe, %xmm1, %xmm3
	movaps	%xmm1, %xmm2
	pshufd	$0xe, %xmm0, %xmm1
#endif

#ifdef RN
	movsd	-16 * SIZE(BO), %xmm8
	mulsd	%xmm8, %xmm0
	movsd	-15 * SIZE(BO), %xmm9
	mulsd	%xmm0,  %xmm9
	subsd	%xmm9,  %xmm1
	movsd	-14 * SIZE(BO), %xmm10
	mulsd	%xmm0,  %xmm10
	subsd	%xmm10, %xmm2
	movsd	-13 * SIZE(BO), %xmm11
	mulsd	%xmm0,  %xmm11
	subsd	%xmm11, %xmm3

	movsd	-11 * SIZE(BO), %xmm9
	mulsd	%xmm9, %xmm1
	movsd	-10 * SIZE(BO), %xmm10
	mulsd	%xmm1,  %xmm10
	subsd	%xmm10, %xmm2
	movsd	 -9 * SIZE(BO), %xmm11
	mulsd	%xmm1,  %xmm11
	subsd	%xmm11, %xmm3

	movsd	 -6 * SIZE(BO), %xmm10
	mulsd	%xmm10, %xmm2
	movsd	 -5 * SIZE(BO), %xmm11
	mulsd	%xmm2,  %xmm11
	subsd	%xmm11, %xmm3

	movsd	 -1 * SIZE(BO), %xmm11
	mulsd	%xmm11, %xmm3
#endif

#ifdef RT
	movsd	 -1 * SIZE(BO), %xmm12
	mulsd	%xmm12, %xmm3
	movsd	 -2 * SIZE(BO), %xmm13
	mulsd	%xmm3,  %xmm13
	subsd	%xmm13, %xmm2
	movsd	 -3 * SIZE(BO), %xmm14
	mulsd	%xmm3,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	 -4 * SIZE(BO), %xmm15
	mulsd	%xmm3,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	 -6 * SIZE(BO), %xmm13
	mulsd	%xmm13, %xmm2
	movsd	 -7 * SIZE(BO), %xmm14
	mulsd	%xmm2,  %xmm14
	subsd	%xmm14, %xmm1
	movsd	 -8 * SIZE(BO), %xmm15
	mulsd	%xmm2,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	-11 * SIZE(BO), %xmm14
	mulsd	%xmm14, %xmm1
	movsd	-12 * SIZE(BO), %xmm15
	mulsd	%xmm1,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	-16 * SIZE(BO), %xmm15
	mulsd	%xmm15, %xmm0
#endif

#if defined(RN) || defined(RT)
	unpcklpd   %xmm1, %xmm0		# re-pack scalars into two SIMD pairs
	movaps	   %xmm2, %xmm1
	unpcklpd   %xmm3, %xmm1
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

	movsd	%xmm0,  0 * SIZE(CO1)
	movhps	%xmm0,  0 * SIZE(CO1, LDC,  1)
	movsd	%xmm1,  0 * SIZE(CO2)
	movhps	%xmm1,  0 * SIZE(CO2, LDC,  1)

#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)	# write solution back into packed buffer
	movapd	%xmm1,  -14 * SIZE(BO)
#else
	movapd	%xmm0,  -16 * SIZE(AO)
	movapd	%xmm1,  -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4
2099
.L49:
/* End of the N&4 panel: resync B with the consumed 4-column panel and
   step KK by 4 in the variant's direction. */
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 4), B
#endif
#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$4, KK
#endif

#ifdef RT
	subq	$4, KK
#endif
	ALIGN_4
2117
.L50:
/* N & 2 tail: process one panel of 2 columns if N has bit 1 set.
   Mirrors .L30 with half the column count. */
	testq	$2, N
	jle	.L70
	ALIGN_4

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax	# rax = K * 2 * SIZE
       subq	%rax, B

       leaq	(, LDC, 2), %rax
       subq	%rax, C
#endif

	movq	C, CO1			# CO1 -> column 0, CO2 -> column 1
	leaq	(C, LDC, 1), CO2
#ifndef RT
	leaq	(C, LDC, 2), C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M,  I			# I = M / 2 row-pair counter
	sarq	$1, I
	NOBRANCH
	jle	.L60
	ALIGN_4
2160
.L51:
/* 2x2 micro-kernel: accumulate into xmm8/xmm10 (direct pairing) and
   xmm9/xmm11 (lane-swapped pairing), then solve the 2x2 system. */
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2

	xorps	%xmm8,  %xmm8
	prefetcht0     2 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	prefetcht0     2 * SIZE(CO2)
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		# unrolled by 4 k-steps
	NOBRANCH
	jle	.L55
	ALIGN_3

.L52:
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2	# swap B lanes for the cross pairing
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm10
	movaps	-14 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -8 * SIZE(AO), %xmm0

	subq	$-8 * SIZE, AO
	subq	$-8 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L52

	addpd	%xmm10, %xmm8		# fold the duplicated accumulators
	addpd	%xmm11, %xmm9
	ALIGN_3

.L55:
/* Remainder: k mod 4 single steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L58
	ALIGN_3

.L56:
	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L56
	ALIGN_4

.L58:
/* Flush pipelined products, subtract from the stored rhs, solve the
   2x2 system against A (L variants) or B (R variants), store. */
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	addpd	%xmm1, %xmm8
	addpd	%xmm2, %xmm9

#if defined(LN) || defined(LT)
	movaps	%xmm8, %xmm0		# undo lane swap: order by row
	shufpd	$0, %xmm9, %xmm8
	shufpd	$3, %xmm0, %xmm9

	movapd	-16 * SIZE(BO), %xmm0
	movapd	-14 * SIZE(BO), %xmm1
#else
	movaps	%xmm8, %xmm0
	shufpd	$2, %xmm9, %xmm8
	shufpd	$2, %xmm0, %xmm9

	movapd	 -16 * SIZE(AO), %xmm0
	movapd	 -14 * SIZE(AO), %xmm1
#endif

	subpd	%xmm8,  %xmm0
	subpd	%xmm9,  %xmm1

/* Diagonal entries appear pre-inverted (multiply, not divide) —
   TODO confirm against the trsm packing routine. */
#ifdef LN
	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
	movddup	-14 * SIZE(AO), %xmm12
	mulpd	%xmm1, %xmm12
	subpd	%xmm12, %xmm0
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm0
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm0
	movddup	-15 * SIZE(AO), %xmm12
	mulpd	%xmm0, %xmm12
	subpd	%xmm12, %xmm1
	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
#endif

#ifdef RN
	movddup	-16 * SIZE(BO), %xmm10
	mulpd	%xmm10, %xmm0
	movddup	-15 * SIZE(BO), %xmm11
	mulpd	%xmm0,  %xmm11
	subpd	%xmm11, %xmm1

	movddup	-13 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm1
#endif

#ifdef RT
	movddup	-13 * SIZE(BO), %xmm14
	mulpd	%xmm14, %xmm1
	movddup	-14 * SIZE(BO), %xmm15
	mulpd	%xmm1,  %xmm15
	subpd	%xmm15, %xmm0

	movddup	-16 * SIZE(BO), %xmm15
	mulpd	%xmm15, %xmm0
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm0,  0 * SIZE(CO1)
	movsd	%xmm1,  1 * SIZE(CO1)
	movhps	%xmm0,  0 * SIZE(CO2)
	movhps	%xmm1,  1 * SIZE(CO2)
#else
	movsd	%xmm0,  0 * SIZE(CO1)
	movhps	%xmm0,  1 * SIZE(CO1)
	movsd	%xmm1,  0 * SIZE(CO2)
	movhps	%xmm1,  1 * SIZE(CO2)
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)	# write solution back into packed buffer
	movapd	%xmm1,  -14 * SIZE(BO)
#else
	movapd	%xmm0,  -16 * SIZE(AO)
	movapd	%xmm1,  -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif


#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif

	decq	I
	BRANCH
	jg	.L51
	ALIGN_4
2412
.L60:
/* 1x2 micro-kernel for the odd leftover row of the 2-column panel. */
	testq	$1, M
	BRANCH
	jle	.L69
	ALIGN_4

#ifdef LN
       movq	K, %rax
       salq	$BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	movddup	-16 * SIZE(AO), %xmm0	# broadcast the single A element
	xorps	%xmm8,  %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9,  %xmm9

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		# unrolled by 4 k-steps
	NOBRANCH
	jle	.L65
	ALIGN_3

.L62:
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	movddup	-13 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm8
	movaps	-10 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	 -8 * SIZE(BO), %xmm1

	subq	$-4 * SIZE, AO
	subq	$-8 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L62
	ALIGN_3

.L65:
/* Remainder: k mod 4 single steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L68
	ALIGN_3

.L66:
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L66
	ALIGN_4

.L68:
/* Fold accumulators, subtract from the stored rhs, and solve the
   trivial 1x1 (L) or scalar 2x2 (R) system. */
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

	addpd	%xmm9, %xmm8

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm0
#else
	movapd	-16 * SIZE(AO), %xmm0
#endif

	subpd	%xmm8,  %xmm0

#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AO), %xmm8	# single (pre-inverted) diagonal element
	mulpd	 %xmm8, %xmm0
#endif

#if defined(RN) || defined(RT)
	pshufd	$0xe, %xmm0, %xmm1	# split into scalars x0 / x1
#endif

#ifdef RN
	movsd	-16 * SIZE(BO), %xmm10
	mulsd	%xmm10, %xmm0
	movsd	-15 * SIZE(BO), %xmm11
	mulsd	%xmm0,  %xmm11
	subsd	%xmm11, %xmm1

	movsd	-13 * SIZE(BO), %xmm11
	mulsd	%xmm11, %xmm1
#endif

#ifdef RT
	movsd	-13 * SIZE(BO), %xmm14
	mulsd	%xmm14, %xmm1
	movsd	-14 * SIZE(BO), %xmm15
	mulsd	%xmm1,  %xmm15
	subsd	%xmm15, %xmm0

	movsd	-16 * SIZE(BO), %xmm15
	mulsd	%xmm15, %xmm0
#endif

#if defined(RN) || defined(RT)
	unpcklpd   %xmm1, %xmm0		# re-pack the two scalars
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

	movsd	%xmm0,  0 * SIZE(CO1)
	movhps	%xmm0,  0 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)	# write solution back into packed buffer
#else
	movapd	%xmm0,  -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4
2608
.L69:
/* End of the N&2 panel: resync B and step KK by 2. */
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 2), B
#endif
#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif
	ALIGN_4
2626
.L70:
/* N & 1 tail: process the final single column if N is odd. */
	testq	$1, N
	jle	.L999
	ALIGN_4

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       movq	K, %rax
       salq	$BASE_SHIFT, %rax	# rax = K * SIZE
       subq	%rax, B

       subq	LDC, C
#endif

	movq	C, CO1			# single output column
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M,  I			# I = M / 2 row-pair counter
	sarq	$1, I
	NOBRANCH
	jle	.L80
	ALIGN_4
2667
.L71:
/* 2x1 micro-kernel: two rows of A against the single B column, with
   the B element broadcast (movddup) and accumulated into xmm8/xmm9. */
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2

	xorps	%xmm8,  %xmm8
	prefetcht0     2 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		# unrolled by 4 k-steps
	NOBRANCH
	jle	.L75
	ALIGN_3

.L72:
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	addpd	%xmm1, %xmm8
	movddup	-16 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-14 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm9
	movddup	-15 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-12 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movddup	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-10 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm9
	movddup	-13 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	 -8 * SIZE(AO), %xmm0

	subq	$-8 * SIZE, AO
	subq	$-4 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L72

	addpd	%xmm9, %xmm8		# fold the duplicated accumulators
	ALIGN_3

.L75:
/* Remainder: k mod 4 single steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L78
	ALIGN_3

.L76:
	addpd	%xmm1, %xmm8
	movddup	-16 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L76
	ALIGN_4

.L78:
/* Flush the pipelined product, subtract from the stored rhs, solve a
   scalar 2x2 system against A (L variants) or scale by the single B
   diagonal (R variants), then store. */
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	addpd	%xmm1, %xmm8

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm0
#else
	movapd	-16 * SIZE(AO), %xmm0
#endif

	subpd	%xmm8,  %xmm0

#if defined(LN) || defined(LT)
	pshufd	$0xe, %xmm0, %xmm1	# split into scalars x0 / x1
#endif

/* Diagonal entries appear pre-inverted (multiply, not divide) —
   TODO confirm against the trsm packing routine. */
#ifdef LN
	movsd	-13 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm1
	movsd	-14 * SIZE(AO), %xmm12
	mulsd	%xmm1, %xmm12
	subsd	%xmm12, %xmm0
	movsd	-16 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm0
#endif

#ifdef LT
	movsd	-16 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm0
	movsd	-15 * SIZE(AO), %xmm12
	mulsd	%xmm0, %xmm12
	subsd	%xmm12, %xmm1
	movsd	-13 * SIZE(AO), %xmm8
	mulsd	 %xmm8, %xmm1
#endif

#if defined(LN) || defined(LT)
	unpcklpd   %xmm1, %xmm0		# re-pack the two scalars
#endif

#if defined(RN) || defined(RT)
	movddup	-16 * SIZE(BO), %xmm10
	mulpd	%xmm10, %xmm0
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
#endif

	movsd	%xmm0,  0 * SIZE(CO1)
	movhps	%xmm0,  1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm0,  -16 * SIZE(BO)	# write solution back into packed buffer
#else
	movapd	%xmm0,  -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif


#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif

	decq	I
	BRANCH
	jg	.L71
	ALIGN_4
2868
2869.L80:
	/* Remainder path: handle the final single row when M is odd;
	   otherwise fall through to the panel epilogue at .L89. */
2870	testq	$1, M
2871	BRANCH
2872	jle	.L89
2873	ALIGN_4
2874
2875#ifdef LN
	/* LN: rewind AORIG by one row of K elements. */
2876       movq	K, %rax
2877       salq	$BASE_SHIFT, %rax
2878       subq	%rax, AORIG
2879#endif
2880
2881#if defined(LN) || defined(RT)
	/* Backward variants: position AO/BO KK elements into the packed
	   A and B panels. */
2882	movq	KK, %rax
2883	leaq	(, %rax, SIZE), %rax
2884	movq	AORIG, AO
2885	leaq	(AO, %rax, 1), AO
2886	leaq	(B,  %rax, 1), BO
2887#else
2888	movq	B, BO
2889#endif
2890
	/* Preload the first two A and B elements and clear the two
	   dot-product accumulators. */
2891	movsd	-16 * SIZE(AO), %xmm0
2892	movhps	-15 * SIZE(AO), %xmm0
2893	xorps	%xmm8,  %xmm8
2894	movsd	-16 * SIZE(BO), %xmm1
2895	movhps	-15 * SIZE(BO), %xmm1
2896	xorps	%xmm9,  %xmm9
2897
2898#if defined(LT) || defined(RN)
	/* Trip count: KK for forward variants, K-KK for backward. */
2899	movq	KK, %rax
2900#else
2901	movq	K, %rax
2902	subq	KK, %rax
2903#endif
2904	sarq	$2, %rax		/* main loop consumes 4 k-steps/iter */
2905	NOBRANCH
2906	jle	.L85
2907	ALIGN_3
2908
2909.L82:
	/* 1x1 dot-product kernel, 4x unrolled: two packed (2-wide)
	   multiply-accumulates per iteration, alternating into xmm8 and
	   xmm9 to hide the addpd latency; next operands are loaded in
	   the shadow of each multiply. */
2910	mulpd	%xmm0, %xmm1
2911	movsd	-14 * SIZE(AO), %xmm0
2912	movhps	-13 * SIZE(AO), %xmm0
2913	addpd	%xmm1, %xmm8
2914	movsd	-14 * SIZE(BO), %xmm1
2915	movhps	-13 * SIZE(BO), %xmm1
2916
2917	mulpd	%xmm0, %xmm1
2918	movsd	-12 * SIZE(AO), %xmm0
2919	movhps	-11 * SIZE(AO), %xmm0
2920	addpd	%xmm1, %xmm9
2921	movsd	-12 * SIZE(BO), %xmm1
2922	movhps	-11 * SIZE(BO), %xmm1
2923
	/* Advance both streams by the 4 elements just consumed. */
2924	subq	$-4 * SIZE, AO
2925	subq	$-4 * SIZE, BO
2926
2927	subq	$1, %rax
2928	BRANCH
2929	jg	.L82
2930
	/* Fold the two partial accumulators: xmm8 += xmm9. */
2931	addpd	%xmm9, %xmm8
2932	ALIGN_3
2933
2934.L85:
	/* Scalar cleanup: recompute the trip count and process the
	   k % 4 leftover iterations one element at a time. */
2935#if defined(LT) || defined(RN)
2936	movq	KK, %rax
2937#else
2938	movq	K, %rax
2939	subq	KK, %rax
2940#endif
2941	andq	$3, %rax		# k & 3 remainder iterations
2942	BRANCH
2943	je	.L88
2944	ALIGN_3
2945
2946.L86:
	/* xmm8 += a[k] * b[k]; next a/b loaded in the multiply shadow. */
2947	mulsd	%xmm0, %xmm1
2948	movsd	-15 * SIZE(AO), %xmm0
2949	addsd	%xmm1, %xmm8
2950	movsd	-15 * SIZE(BO), %xmm1
2951
2952	addq	$1 * SIZE, AO
2953	addq	$1 * SIZE, BO
2954
2955	subq	$1, %rax
2956	BRANCH
2957	jg	.L86
2958	ALIGN_4
2959
2960.L88:
	/* Solve and store the single accumulated element (M=1, N=1). */
2961#if defined(LN) || defined(RT)
	/* Backward variants: re-point AO/BO at the solve data,
	   KK-1 elements into each packed panel. */
2962	movq	KK, %rax
2963	subq	$1, %rax
2964
2965	leaq	(, %rax, SIZE), %rax
2966
2967	movq	AORIG, AO
2968	leaq	(AO, %rax, 1), AO
2969	leaq	(B,  %rax, 1), BO
2970#endif
2971
	/* Horizontal reduce: xmm8 = low(xmm8) + high(xmm8). */
2972	haddpd	%xmm8, %xmm8
2973
	/* Load the right-hand-side value from the packed buffer
	   (B for left-side variants, A for right-side). */
2974#if defined(LN) || defined(LT)
2975	movsd	-16 * SIZE(BO), %xmm0
2976#else
2977	movsd	-16 * SIZE(AO), %xmm0
2978#endif
2979
	/* x = rhs - accumulated dot product. */
2980	subsd	%xmm8,  %xmm0
2981
2982#if defined(LN) || defined(LT)
	/* 1x1 solve: multiply by the diagonal of A.  A multiply rather
	   than a divide implies the diagonal is stored pre-inverted --
	   TODO confirm in the packing routine. */
2983	movsd	-16 * SIZE(AO), %xmm8
2984	mulsd	 %xmm8, %xmm0
2985#endif
2986
2987#if defined(RN) || defined(RT)
	/* Same for the right-side variants, using the diagonal of B. */
2988	movsd	-16 * SIZE(BO), %xmm10
2989	mulsd	%xmm10, %xmm0
2990#endif
2991
2992#ifdef LN
	/* LN walks C backwards: step CO1 down before the store. */
2993	subq	$1 * SIZE, CO1
2994#endif
2995
2996	movsd	%xmm0,  0 * SIZE(CO1)
2997
	/* Store the solved value back into the packed buffer as well. */
2998#if defined(LN) || defined(LT)
2999	movsd	%xmm0,  -16 * SIZE(BO)
3000#else
3001	movsd	%xmm0,  -16 * SIZE(AO)
3002#endif
3003
3004#ifndef LN
3005	addq	$1 * SIZE, CO1
3006#endif
3007
3008
3009#if defined(LT) || defined(RN)
	/* Skip AO/BO past the K-KK untouched elements of this panel. */
3010	movq	K,  %rax
3011	subq	KK, %rax
3012	leaq	(,%rax, SIZE), %rax
3013	leaq	(AO, %rax, 1), AO
3014	leaq	(BO, %rax, 1), BO
3015#endif
3016
3017#ifdef LN
	/* One row consumed: move the KK boundary accordingly. */
3018	subq	$1, KK
3019#endif
3020
3021#ifdef LT
3022	addq	$1, KK
3023#endif
3024
3025#ifdef RT
	/* RT: advance AORIG by K elements for the next M-block. */
3026       movq	K, %rax
3027       salq	$BASE_SHIFT, %rax
3028       addq	%rax, AORIG
3029#endif
3030	ALIGN_4
3031
3032.L89:
	/* End of this N=1 panel: advance B to the next panel and update
	   the KK offset for the right-side variants. */
3033#ifdef LN
	/* LN: B moves forward by K elements (one column of the panel). */
3034       leaq	(, K, SIZE), %rax
3035       leaq	(B, %rax, 1), B
3036#endif
3037#if defined(LT) || defined(RN)
	/* Forward variants: BO already points past the consumed panel. */
3038	movq	BO, B
3039#endif
3040
3041#ifdef RN
3042	addq	$1, KK
3043#endif
3044
3045#ifdef RT
3046	subq	$1, KK
3047#endif
3048	ALIGN_4
3049
3050
3051.L999:
	/* Function epilogue: restore all callee-saved general-purpose
	   registers (rbx, rbp, r12-r15) that the prologue -- outside
	   this view -- spilled to the bottom of the stack frame. */
3052	movq	  0(%rsp), %rbx
3053	movq	  8(%rsp), %rbp
3054	movq	 16(%rsp), %r12
3055	movq	 24(%rsp), %r13
3056	movq	 32(%rsp), %r14
3057	movq	 40(%rsp), %r15
3058
3059#ifdef WINDOWS_ABI
	/* Microsoft x64 additionally treats rdi, rsi and xmm6-xmm15 as
	   callee-saved; restore them from the frame as well. */
3060	movq	 48(%rsp), %rdi
3061	movq	 56(%rsp), %rsi
3062	movups	 64(%rsp), %xmm6
3063	movups	 80(%rsp), %xmm7
3064	movups	 96(%rsp), %xmm8
3065	movups	112(%rsp), %xmm9
3066	movups	128(%rsp), %xmm10
3067	movups	144(%rsp), %xmm11
3068	movups	160(%rsp), %xmm12
3069	movups	176(%rsp), %xmm13
3070	movups	192(%rsp), %xmm14
3071	movups	208(%rsp), %xmm15
3072#endif
3073
	/* Release the frame and return. */
3074	addq	$STACKSIZE, %rsp
3075	ret
3076
3077	EPILOGUE
3078