1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Function arguments (System V AMD64 integer-arg order rdi, rsi,
   rdx, rcx, r8, r9).  Under WINDOWS_ABI the prologue reloads these
   from ARG1..ARG3 and the stack before they are used.  */
#define M	%rdi
#define N	%rsi
#define K	%rdx
#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

/* Loop counters and working pointers.  r12-r15 and rbp are
   callee-saved; the prologue spills them before first use.  */
#define I	%r11
#define J	%r12
#define AO	%r13
#define BO	%r14
#define	CO1	%r15
#define CO2	%rbp
56
#ifndef WINDOWS_ABI

#define STACKSIZE 64

/* SysV: only the 7th/8th arguments live on the stack.  Offsets are
   relative to rsp after the prologue's "subq $STACKSIZE, %rsp"
   (+8 skips the return address).  */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

/* Win64: args 5+ sit above the return address and the caller's
   32-byte shadow space (hence the +40 base).  */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/* Local scratch slots in the page-aligned area the prologue carves
   out below the register-save frame.  BUFFER is the staging area
   into which columns of B are copied (duplicated per element) by
   the .L42/.L82 copy loops.  */
#define ALPHA	  0(%rsp)
#define OFFSET	 16(%rsp)
#define KK	 24(%rsp)
#define KKK	 32(%rsp)
#define AORIG	 40(%rsp)
#define BORIG	 48(%rsp)
#define BUFFER	128(%rsp)
83
/* CPU-specific prefetch instruction selection and lookahead distance
   (measured in SIZE-units ahead of the current A pointer).  With
   huge-page allocation the lookahead is shortened.  */
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHNTA  prefetchnta
#ifndef ALLOC_HUGETLB
#define PREFETCHSIZE (8 * 4 + 4)
#else
#define PREFETCHSIZE (8 * 2 + 4)
#endif
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHNTA  prefetchnta
#define PREFETCHSIZE (8 * 4 + 4)
#endif

/* NOTE(review): on Opteron every movsd below is assembled as movlpd
   (same load semantics for the uses here; presumably avoids a
   penalty of movsd on that core — confirm against the AMD
   optimization guide).  */
#ifdef OPTERON
#define movsd	movlpd
#endif
105
/* KERNEL1(xx): step 1 of the 8-way unrolled multiply-accumulate.
   Multiplies the A pair in xmm8 by three preloaded B pairs (xmm9,
   xmm11, xmm13) and one B pair taken straight from memory,
   accumulating into xmm0-xmm3; each consumed register is reloaded
   with data for a later step.  Also prefetches ahead in AO.
   (xx) is the per-iteration element offset into AO/BO.  */
#define KERNEL1(xx) \
	mulpd	%xmm8, %xmm9 ;\
	addpd	%xmm9, %xmm0 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm8, %xmm11 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm8, %xmm13 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm8, %xmm3 ;\
	movapd	 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
120
/* KERNEL2(xx): step 2 — same shape as KERNEL1 but with the A pair
   in xmm10, accumulating into the second accumulator set xmm4-xmm7. */
#define KERNEL2(xx) \
	mulpd	%xmm10, %xmm9 ;\
	addpd	%xmm9, %xmm4 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm10, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm10, %xmm13 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm10, %xmm7 ;\
	movapd	10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
134
/* KERNEL3(xx): step 3 — A pair in xmm12 against B pairs xmm15/xmm11/
   xmm13 (plus one memory operand), accumulating into xmm0-xmm3.  */
#define KERNEL3(xx) \
	mulpd	%xmm12, %xmm15 ;\
	addpd	%xmm15, %xmm0 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm12, %xmm11 ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm12, %xmm13 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm12, %xmm3 ;\
	movapd	12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
148
/* KERNEL4(xx): step 4 — A pair in xmm14, accumulating into the
   second accumulator set xmm4-xmm7.  */
#define KERNEL4(xx) \
	mulpd	%xmm14, %xmm15 ;\
	addpd	%xmm15, %xmm4 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm14, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm14, %xmm13 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm14, %xmm7 ;\
	movapd	22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
162
/* KERNEL5(xx): step 5 — mirrors KERNEL1 at the next group of
   offsets (A pair back in xmm8, accumulators xmm0-xmm3); issues the
   second AO prefetch of the unrolled body.  */
#define KERNEL5(xx) \
	mulpd	%xmm8, %xmm9 ;\
	addpd	%xmm9, %xmm0 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm8, %xmm11 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm8, %xmm13 ;\
	mulpd	22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm8, %xmm3 ;\
	movapd	16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
177
/* KERNEL6(xx): step 6 — mirrors KERNEL2 at the next group of
   offsets (A pair in xmm10, accumulators xmm4-xmm7).  */
#define KERNEL6(xx) \
	mulpd	%xmm10, %xmm9 ;\
	addpd	%xmm9, %xmm4 ;\
	movapd	32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm10, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm10, %xmm13 ;\
	mulpd	22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm10, %xmm7 ;\
	movapd	18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
191
/* KERNEL7(xx): step 7 — mirrors KERNEL3 at the next group of
   offsets (A pair in xmm12, accumulators xmm0-xmm3).  */
#define KERNEL7(xx) \
	mulpd	%xmm12, %xmm15 ;\
	addpd	%xmm15, %xmm0 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm12, %xmm11 ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm12, %xmm13 ;\
	mulpd	30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm12, %xmm3 ;\
	movapd	20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
205
/* KERNEL8(xx): step 8 — mirrors KERNEL4 at the next group of
   offsets (A pair in xmm14, accumulators xmm4-xmm7), completing the
   8-step software pipeline before AO/BO advance.  */
#define KERNEL8(xx) \
	mulpd	%xmm14, %xmm15 ;\
	addpd	%xmm15, %xmm4 ;\
	movapd	40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm14, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm14, %xmm13 ;\
	mulpd	30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm14, %xmm7 ;\
	movapd	22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
219
	PROLOGUE
	PROFCODE

	/* Reserve the register-save area and spill the callee-saved
	   GPRs (plus rdi/rsi and xmm6-xmm15 under the Windows ABI,
	   where they are callee-saved as well).  */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Win64 passes only four args in registers: move the register
	   args into the SysV-named registers and fetch the rest from
	   the caller's stack.  */
	movq	ARG1,      M
	movq	ARG2,      N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm4

	movaps	%xmm3, %xmm0

#else
	/* SysV: only ldc and offset arrive on the stack; m, n, k, a,
	   b, c are already in the registers the aliases name.  */
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm4

#endif

	/* Carve the local scratch/BUFFER area out below the save frame
	   and page-align rsp; rbx keeps the original stack pointer for
	   the epilogue.  */
	movq	%rsp, %rbx	# save old stack
	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movsd	%xmm4, OFFSET
	movsd	%xmm4, KK

	leaq	(, LDC, SIZE), LDC	# ldc is in elements; convert to bytes

#ifdef LN
	/* LN: advance A by m*k elements and C by m elements so the
	   solve can walk backwards from the last row block.  */
       leaq	(, M, SIZE), %rax
       addq	%rax, C
       imulq	K, %rax
       addq	%rax, A
#endif

#ifdef RT
	/* RT: advance B by n*k elements and C by n columns so the
	   solve can walk backwards from the last column block.  */
       leaq	(, N, SIZE), %rax
       imulq	K, %rax
       addq	%rax, B
       movq	N, %rax
       imulq	LDC, %rax
       addq	%rax, C
#endif

#ifdef RN
	negq	KK
#endif

#ifdef RT
       movq	N, %rax
       subq	OFFSET, %rax
       movq	%rax, KK		# RT: kk = n - offset
#endif

	/* If n is odd, fall through to the single-column panel (.L81);
	   otherwise skip ahead to the n&2 handling at .L40.  */
	testq	$1, N
	je	.L40
	ALIGN_4
302
303.L81:
304/* Copying to Sub Buffer */
305
306#ifdef LN
307	movq	OFFSET, %rax
308	addq	M, %rax
309	movq	%rax, KK
310#endif
311
312	leaq	BUFFER, BO
313
314#ifdef RT
315       movq	K, %rax
316       salq	$0 + BASE_SHIFT, %rax
317       subq	%rax, B
318#endif
319
320#if defined(LN) || defined(RT)
321	movq	KK, %rax
322	movq	B, BORIG
323	leaq	(, %rax, SIZE), %rax
324	leaq	(B,  %rax, 1), B
325	leaq	(BO, %rax, 2), BO
326#endif
327
328#ifdef LT
329	movq	OFFSET, %rax
330	movq	%rax, KK
331#endif
332
333#if defined(LT) || defined(RN)
334	movq	KK, %rax
335#else
336	movq	K, %rax
337	subq	KK, %rax
338#endif
339	sarq	$3, %rax
340	jle	.L83
341	ALIGN_4
342
343.L82:
344	PREFETCH	 56 * SIZE(B)
345
346	movsd	 0 * SIZE(B), %xmm0
347	movsd	 1 * SIZE(B), %xmm1
348	movsd	 2 * SIZE(B), %xmm2
349	movsd	 3 * SIZE(B), %xmm3
350	movsd	 4 * SIZE(B), %xmm4
351	movsd	 5 * SIZE(B), %xmm5
352	movsd	 6 * SIZE(B), %xmm6
353	movsd	 7 * SIZE(B), %xmm7
354
355	addq	$ 8 * SIZE, B
356	addq	$16 * SIZE, BO
357
358	movsd	%xmm0, -16 * SIZE(BO)
359	movsd	%xmm0, -15 * SIZE(BO)
360	movsd	%xmm1, -14 * SIZE(BO)
361	movsd	%xmm1, -13 * SIZE(BO)
362	movsd	%xmm2, -12 * SIZE(BO)
363	movsd	%xmm2, -11 * SIZE(BO)
364	movsd	%xmm3, -10 * SIZE(BO)
365	movsd	%xmm3,  -9 * SIZE(BO)
366	movsd	%xmm4,  -8 * SIZE(BO)
367	movsd	%xmm4,  -7 * SIZE(BO)
368	movsd	%xmm5,  -6 * SIZE(BO)
369	movsd	%xmm5,  -5 * SIZE(BO)
370	movsd	%xmm6,  -4 * SIZE(BO)
371	movsd	%xmm6,  -3 * SIZE(BO)
372	movsd	%xmm7,  -2 * SIZE(BO)
373	movsd	%xmm7,  -1 * SIZE(BO)
374
375	decq	%rax
376	jne	.L82
377	ALIGN_4
378
379.L83:
380#if defined(LT) || defined(RN)
381	movq	KK, %rax
382#else
383	movq	K, %rax
384	subq	KK, %rax
385#endif
386	andq	$7, %rax
387	BRANCH
388	jle	.L90
389	ALIGN_4
390
391.L84:
392	movsd	 0 * SIZE(B), %xmm0
393
394	movsd	%xmm0,  0 * SIZE(BO)
395	movsd	%xmm0,  1 * SIZE(BO)
396
397	addq	$1 * SIZE, B
398	addq	$2 * SIZE, BO
399	decq	%rax
400	jne	.L84
401	ALIGN_4
402
403.L90:
404#if defined(LT) || defined(RN)
405	movq	A, AO
406#else
407	movq	A, AORIG
408#endif
409
410#ifdef RT
411	subq	LDC, C
412#endif
413
414	movq	C, CO1			# coffset1 = c
415#ifndef RT
416	addq	LDC, C
417#endif
418
419	movq	M,  I
420	sarq	$2, I	# i = (m >> 2)
421	jle	.L100
422	ALIGN_4
423
424.L91:
425#ifdef LN
426       movq	K, %rax
427       salq	$2 + BASE_SHIFT, %rax
428       subq	%rax, AORIG
429#endif
430
431#if defined(LN) || defined(RT)
432	movq	KK, %rax
433	movq	AORIG, AO
434	leaq	(, %rax, SIZE), %rax
435	leaq	(AO, %rax, 4), AO
436#endif
437
438	leaq	BUFFER, BO
439
440#if defined(LN) || defined(RT)
441	movq	KK, %rax
442	salq	$0 + BASE_SHIFT, %rax
443	leaq	(BO, %rax, 2), BO
444#endif
445
446	movapd	 0 * SIZE(AO), %xmm8
447	pxor	%xmm0, %xmm0
448	movapd	 0 * SIZE(BO), %xmm9
449	pxor	%xmm1, %xmm1
450	movapd	 8 * SIZE(AO), %xmm10
451	pxor	%xmm2, %xmm2
452	movapd	 8 * SIZE(BO), %xmm11
453	pxor	%xmm3, %xmm3
454
455	movapd	16 * SIZE(AO), %xmm12
456	movapd	24 * SIZE(AO), %xmm14
457
458	PREFETCHW      4 * SIZE(CO1)
459
460#if defined(LT) || defined(RN)
461	movq	KK, %rax
462#else
463	movq	K, %rax
464	subq	KK, %rax
465#endif
466	sarq	$3, %rax
467	je	.L95
468	ALIGN_4
469
470.L92:
471	mulpd	%xmm9, %xmm8
472	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
473	mulpd	 2 * SIZE(AO), %xmm9
474	addpd	%xmm8, %xmm0
475	movapd	 4 * SIZE(AO), %xmm8
476	addpd	%xmm9, %xmm1
477	movapd	 2 * SIZE(BO), %xmm9
478	mulpd	%xmm9, %xmm8
479	mulpd	 6 * SIZE(AO), %xmm9
480	addpd	%xmm8, %xmm2
481	movapd	32 * SIZE(AO), %xmm8
482	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
483	addpd	%xmm9, %xmm3
484	movapd	 4 * SIZE(BO), %xmm9
485	mulpd	%xmm9, %xmm10
486	mulpd	10 * SIZE(AO), %xmm9
487	addpd	%xmm10, %xmm0
488	movapd	12 * SIZE(AO), %xmm10
489	addpd	%xmm9, %xmm1
490	movapd	 6 * SIZE(BO), %xmm9
491	mulpd	%xmm9, %xmm10
492	mulpd	14 * SIZE(AO), %xmm9
493	addpd	%xmm10, %xmm2
494	movapd	40 * SIZE(AO), %xmm10
495	PREFETCH	(PREFETCHSIZE     + 16) * SIZE(AO)
496	addpd	%xmm9, %xmm3
497	movapd	16 * SIZE(BO), %xmm9
498	mulpd	%xmm11, %xmm12
499	mulpd	18 * SIZE(AO), %xmm11
500	addpd	%xmm12, %xmm0
501	movapd	20 * SIZE(AO), %xmm12
502	addpd	%xmm11, %xmm1
503	movapd	10 * SIZE(BO), %xmm11
504	mulpd	%xmm11, %xmm12
505	mulpd	22 * SIZE(AO), %xmm11
506	addpd	%xmm12, %xmm2
507	movapd	48 * SIZE(AO), %xmm12
508	PREFETCH	(PREFETCHSIZE     + 24) * SIZE(AO)
509	addpd	%xmm11, %xmm3
510	movapd	12 * SIZE(BO), %xmm11
511	mulpd	%xmm11, %xmm14
512	mulpd	26 * SIZE(AO), %xmm11
513	addpd	%xmm14, %xmm0
514	movapd	28 * SIZE(AO), %xmm14
515	addpd	%xmm11, %xmm1
516	movapd	14 * SIZE(BO), %xmm11
517	mulpd	%xmm11, %xmm14
518	mulpd	30 * SIZE(AO), %xmm11
519	addpd	%xmm14, %xmm2
520	movapd	56 * SIZE(AO), %xmm14
521	addpd	%xmm11, %xmm3
522	movapd	24 * SIZE(BO), %xmm11
523
524	addq   $32 * SIZE, AO
525	addq   $16 * SIZE, BO
526	decq   %rax
527	jne    .L92
528	ALIGN_4
529
530.L95:
531#if defined(LT) || defined(RN)
532	movq	KK, %rax
533#else
534	movq	K, %rax
535	subq	KK, %rax
536#endif
537	andq	$7, %rax		# if (k & 1)
538	BRANCH
539	je .L99
540	ALIGN_4
541
542.L96:
543	mulpd	%xmm9, %xmm8
544	mulpd	 2 * SIZE(AO), %xmm9
545	addpd	%xmm8, %xmm0
546	movapd	 4 * SIZE(AO), %xmm8
547	addpd	%xmm9, %xmm1
548	movapd	 2 * SIZE(BO), %xmm9
549
550	addq	$4 * SIZE, AO		# aoffset  += 4
551	addq	$2 * SIZE, BO		# boffset1 += 8
552	decq	%rax
553	jg	.L96
554	ALIGN_4
555
556.L99:
557	addpd	%xmm2, %xmm0
558	addpd	%xmm3, %xmm1
559
560#if defined(LN) || defined(RT)
561	movq	KK, %rax
562#ifdef LN
563	subq	$4, %rax
564#else
565	subq	$1, %rax
566#endif
567
568	movq	AORIG, AO
569	movq	BORIG, B
570	leaq	BUFFER, BO
571
572	leaq	(, %rax, SIZE), %rax
573	leaq	(AO, %rax, 4), AO
574	leaq	(B,  %rax, 1), B
575	leaq	(BO, %rax, 2), BO
576#endif
577
578#if defined(LN) || defined(LT)
579	movapd	 0 * SIZE(B), %xmm2
580	movapd	 2 * SIZE(B), %xmm3
581
582	subpd	%xmm0,  %xmm2
583	subpd	%xmm1,  %xmm3
584#else
585	movapd	 0 * SIZE(AO), %xmm2
586	movapd	 2 * SIZE(AO), %xmm3
587
588	subpd	%xmm0, %xmm2
589	subpd	%xmm1, %xmm3
590#endif
591
592#ifdef LN
593	movapd	%xmm2, %xmm0
594        unpckhpd %xmm0, %xmm0
595
596	movapd	%xmm3, %xmm1
597        unpckhpd %xmm1, %xmm1
598
599	movsd	15 * SIZE(AO), %xmm4
600	mulsd	 %xmm4, %xmm1
601
602	movsd	14 * SIZE(AO), %xmm5
603	mulsd	 %xmm1, %xmm5
604	subsd	 %xmm5, %xmm3
605	movsd	13 * SIZE(AO), %xmm6
606	mulsd	 %xmm1, %xmm6
607	subsd	 %xmm6, %xmm0
608	movsd	12 * SIZE(AO), %xmm7
609	mulsd	 %xmm1, %xmm7
610	subsd	 %xmm7, %xmm2
611
612	movsd	10 * SIZE(AO), %xmm4
613	mulsd	 %xmm4, %xmm3
614
615	movsd	 9 * SIZE(AO), %xmm5
616	mulsd	 %xmm3, %xmm5
617	subsd	 %xmm5, %xmm0
618	movsd	 8 * SIZE(AO), %xmm6
619	mulsd	 %xmm3, %xmm6
620	subsd	 %xmm6, %xmm2
621
622	movsd	 5 * SIZE(AO), %xmm4
623	mulsd	 %xmm4, %xmm0
624
625	movsd	 4 * SIZE(AO), %xmm5
626	mulsd	 %xmm0, %xmm5
627	subsd	 %xmm5, %xmm2
628
629	movsd	 0 * SIZE(AO), %xmm4
630	mulsd	 %xmm4, %xmm2
631
632	unpcklpd %xmm0, %xmm2
633	unpcklpd %xmm1, %xmm3
634#endif
635
636#ifdef LT
637	movapd	%xmm2, %xmm0
638        unpckhpd %xmm0, %xmm0
639
640	movapd	%xmm3, %xmm1
641        unpckhpd %xmm1, %xmm1
642
643	movsd	 0 * SIZE(AO), %xmm4
644	mulsd	 %xmm4, %xmm2
645
646	movsd	 1 * SIZE(AO), %xmm5
647	mulsd	 %xmm2, %xmm5
648	subsd	 %xmm5, %xmm0
649	movsd	 2 * SIZE(AO), %xmm6
650	mulsd	 %xmm2, %xmm6
651	subsd	 %xmm6, %xmm3
652	movsd	 3 * SIZE(AO), %xmm7
653	mulsd	 %xmm2, %xmm7
654	subsd	 %xmm7, %xmm1
655
656	movsd	 5 * SIZE(AO), %xmm4
657	mulsd	 %xmm4, %xmm0
658
659	movsd	 6 * SIZE(AO), %xmm5
660	mulsd	 %xmm0, %xmm5
661	subsd	 %xmm5, %xmm3
662	movsd	 7 * SIZE(AO), %xmm6
663	mulsd	 %xmm0, %xmm6
664	subsd	 %xmm6, %xmm1
665
666	movsd	10 * SIZE(AO), %xmm4
667	mulsd	 %xmm4, %xmm3
668
669	movsd	11 * SIZE(AO), %xmm5
670	mulsd	 %xmm3, %xmm5
671	subsd	 %xmm5, %xmm1
672
673	movsd	15 * SIZE(AO), %xmm4
674	mulsd	 %xmm4, %xmm1
675
676	unpcklpd %xmm0, %xmm2
677	unpcklpd %xmm1, %xmm3
678#endif
679
680#ifdef RN
681	movlpd	 0 * SIZE(B), %xmm0
682	movhpd	 0 * SIZE(B), %xmm0
683	mulpd	 %xmm0, %xmm2
684	mulpd	 %xmm0, %xmm3
685#endif
686
687#ifdef RT
688	movlpd	 0 * SIZE(B), %xmm0
689	movhpd	 0 * SIZE(B), %xmm0
690	mulpd	 %xmm0, %xmm2
691	mulpd	 %xmm0, %xmm3
692#endif
693
694#ifdef LN
695	subq	$4 * SIZE, CO1
696#endif
697
698#if defined(LN) || defined(LT)
699	movsd	%xmm2,  0 * SIZE(CO1)
700	movhpd	%xmm2,  1 * SIZE(CO1)
701	movsd	%xmm3,  2 * SIZE(CO1)
702	movhpd	%xmm3,  3 * SIZE(CO1)
703#else
704	movsd	%xmm2,  0 * SIZE(CO1)
705	movhpd	%xmm2,  1 * SIZE(CO1)
706	movsd	%xmm3,  2 * SIZE(CO1)
707	movhpd	%xmm3,  3 * SIZE(CO1)
708#endif
709
710#if defined(LN) || defined(LT)
711	movapd	%xmm2,   0 * SIZE(B)
712	movapd	%xmm3,   2 * SIZE(B)
713
714	movlpd	%xmm2,   0 * SIZE(BO)
715	movlpd	%xmm2,   1 * SIZE(BO)
716	movhpd	%xmm2,   2 * SIZE(BO)
717	movhpd	%xmm2,   3 * SIZE(BO)
718	movlpd	%xmm3,   4 * SIZE(BO)
719	movlpd	%xmm3,   5 * SIZE(BO)
720	movhpd	%xmm3,   6 * SIZE(BO)
721	movhpd	%xmm3,   7 * SIZE(BO)
722#else
723	movapd	%xmm2,   0 * SIZE(AO)
724	movapd	%xmm3,   2 * SIZE(AO)
725#endif
726
727#ifndef LN
728	addq	$4 * SIZE, CO1
729#endif
730
731#if defined(LT) || defined(RN)
732	movq	K,  %rax
733	subq	KK, %rax
734	leaq	(,%rax, SIZE), %rax
735	leaq	(AO, %rax, 4), AO
736#ifdef LT
737	addq	$4 * SIZE, B
738#endif
739#endif
740
741#ifdef LN
742	subq	$4, KK
743	movq	BORIG, B
744#endif
745
746#ifdef LT
747	addq	$4, KK
748#endif
749
750#ifdef RT
751       movq	K, %rax
752       movq	BORIG, B
753       salq	$2 + BASE_SHIFT, %rax
754       addq	%rax, AORIG
755#endif
756
757	decq	I			# i --
758	jg	.L91
759	ALIGN_4
760
761.L100:
762	testq	$2, M
763	je	.L110
764	ALIGN_4
765
766.L101:
767#ifdef LN
768       movq	K, %rax
769       salq	$1 + BASE_SHIFT, %rax
770       subq	%rax, AORIG
771#endif
772
773#if defined(LN) || defined(RT)
774	movq	KK, %rax
775	movq	AORIG, AO
776	leaq	(, %rax, SIZE), %rax
777	leaq	(AO, %rax, 2), AO
778#endif
779
780	leaq	BUFFER, BO
781
782#if defined(LN) || defined(RT)
783	movq	KK, %rax
784	salq	$0 + BASE_SHIFT, %rax
785	leaq	(BO, %rax, 2), BO
786#endif
787
788	movapd	 0 * SIZE(AO), %xmm8
789	pxor	%xmm0, %xmm0
790	movapd	 0 * SIZE(BO), %xmm9
791	pxor	%xmm1, %xmm1
792	movapd	 8 * SIZE(AO), %xmm10
793	pxor	%xmm2, %xmm2
794	movapd	 8 * SIZE(BO), %xmm11
795	pxor	%xmm3, %xmm3
796
797#if defined(LT) || defined(RN)
798	movq	KK, %rax
799#else
800	movq	K, %rax
801	subq	KK, %rax
802#endif
803	sarq	$3, %rax
804	je	.L105
805	ALIGN_4
806
807.L102:
808	mulpd	%xmm8, %xmm9
809	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
810	movapd	 2 * SIZE(AO), %xmm8
811	mulpd	 2 * SIZE(BO), %xmm8
812	addpd	%xmm9, %xmm0
813	movapd	16 * SIZE(BO), %xmm9
814	addpd	%xmm8, %xmm1
815	movapd	 4 * SIZE(AO), %xmm8
816	mulpd	 4 * SIZE(BO), %xmm8
817	addpd	%xmm8, %xmm2
818	movapd	 6 * SIZE(AO), %xmm8
819	mulpd	 6 * SIZE(BO), %xmm8
820	addpd	%xmm8, %xmm3
821	movapd	16 * SIZE(AO), %xmm8
822	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
823	mulpd	%xmm10, %xmm11
824	movapd	10 * SIZE(AO), %xmm10
825	mulpd	10 * SIZE(BO), %xmm10
826	addpd	%xmm11, %xmm0
827	movapd	24 * SIZE(BO), %xmm11
828	addpd	%xmm10, %xmm1
829	movapd	12 * SIZE(AO), %xmm10
830	mulpd	12 * SIZE(BO), %xmm10
831	addpd	%xmm10, %xmm2
832	movapd	14 * SIZE(AO), %xmm10
833	mulpd	14 * SIZE(BO), %xmm10
834	addpd	%xmm10, %xmm3
835	movapd	24 * SIZE(AO), %xmm10
836
837	addq   $16 * SIZE, AO
838	addq   $16 * SIZE, BO
839	decq   %rax
840	jne    .L102
841	ALIGN_4
842
843.L105:
844#if defined(LT) || defined(RN)
845	movq	KK, %rax
846#else
847	movq	K, %rax
848	subq	KK, %rax
849#endif
850	andq	$7, %rax		# if (k & 1)
851	BRANCH
852	je .L109
853	ALIGN_4
854
855.L106:
856	mulpd	%xmm8, %xmm9
857	addpd	%xmm9, %xmm0
858	movapd	 2 * SIZE(AO), %xmm8
859	movapd	 2 * SIZE(BO), %xmm9
860
861	addq	$2 * SIZE, AO		# aoffset  += 4
862	addq	$2 * SIZE, BO		# boffset1 += 8
863	decq	%rax
864	jg	.L106
865	ALIGN_4
866
867.L109:
868	addpd	%xmm1, %xmm0
869	addpd	%xmm3, %xmm2
870	addpd	%xmm2, %xmm0
871
872#if defined(LN) || defined(RT)
873	movq	KK, %rax
874#ifdef LN
875	subq	$2, %rax
876#else
877	subq	$1, %rax
878#endif
879
880	movq	AORIG, AO
881	movq	BORIG, B
882	leaq	BUFFER, BO
883
884	leaq	(, %rax, SIZE), %rax
885	leaq	(AO, %rax, 2), AO
886	leaq	(B,  %rax, 1), B
887	leaq	(BO, %rax, 2), BO
888#endif
889
890#if defined(LN) || defined(LT)
891	movapd	 0 * SIZE(B), %xmm2
892	subpd	%xmm0,  %xmm2
893#else
894	movapd	 0 * SIZE(AO), %xmm2
895	subpd	%xmm0, %xmm2
896#endif
897
898#ifdef LN
899	movapd	%xmm2, %xmm0
900        unpckhpd %xmm0, %xmm0
901
902	movsd	 3 * SIZE(AO), %xmm4
903	mulsd	 %xmm4, %xmm0
904
905	movsd	 2 * SIZE(AO), %xmm5
906	mulsd	 %xmm0, %xmm5
907	subsd	 %xmm5, %xmm2
908
909	movsd	 0 * SIZE(AO), %xmm4
910	mulsd	 %xmm4, %xmm2
911
912	unpcklpd %xmm0, %xmm2
913#endif
914
915#ifdef LT
916	movapd	%xmm2, %xmm0
917        unpckhpd %xmm0, %xmm0
918
919	movsd	 0 * SIZE(AO), %xmm4
920	mulsd	 %xmm4, %xmm2
921
922	movsd	 1 * SIZE(AO), %xmm5
923	mulsd	 %xmm2, %xmm5
924	subsd	 %xmm5, %xmm0
925
926	movsd	 3 * SIZE(AO), %xmm4
927	mulsd	 %xmm4, %xmm0
928
929	unpcklpd %xmm0, %xmm2
930#endif
931
932#ifdef RN
933	movlpd	 0 * SIZE(B), %xmm0
934	movhpd	 0 * SIZE(B), %xmm0
935	mulpd	 %xmm0, %xmm2
936#endif
937
938#ifdef RT
939	movlpd	 0 * SIZE(B), %xmm0
940	movhpd	 0 * SIZE(B), %xmm0
941	mulpd	 %xmm0, %xmm2
942#endif
943
944#ifdef LN
945	subq	$2 * SIZE, CO1
946#endif
947
948#if defined(LN) || defined(LT)
949	movsd	%xmm2,  0 * SIZE(CO1)
950	movhpd	%xmm2,  1 * SIZE(CO1)
951#else
952	movsd	%xmm2,  0 * SIZE(CO1)
953	movhpd	%xmm2,  1 * SIZE(CO1)
954#endif
955
956#if defined(LN) || defined(LT)
957	movapd	%xmm2,   0 * SIZE(B)
958
959	movlpd	%xmm2,   0 * SIZE(BO)
960	movlpd	%xmm2,   1 * SIZE(BO)
961	movhpd	%xmm2,   2 * SIZE(BO)
962	movhpd	%xmm2,   3 * SIZE(BO)
963#else
964	movapd	%xmm2,   0 * SIZE(AO)
965#endif
966
967#ifndef LN
968	addq	$2 * SIZE, CO1
969#endif
970
971#if defined(LT) || defined(RN)
972	movq	K,  %rax
973	subq	KK, %rax
974	leaq	(,%rax, SIZE), %rax
975	leaq	(AO, %rax, 2), AO
976#ifdef LT
977	addq	$2 * SIZE, B
978#endif
979#endif
980
981#ifdef LN
982	subq	$2, KK
983	movq	BORIG, B
984#endif
985
986#ifdef LT
987	addq	$2, KK
988#endif
989
990#ifdef RT
991       movq	K, %rax
992       movq	BORIG, B
993       salq	$1 + BASE_SHIFT, %rax
994       addq	%rax, AORIG
995#endif
996	ALIGN_4
997
998.L110:
999	testq	$1, M
1000	je	.L119
1001	ALIGN_4
1002
1003.L111:
1004#ifdef LN
1005       movq	K, %rax
1006       salq	$0 + BASE_SHIFT, %rax
1007       subq	%rax, AORIG
1008#endif
1009
1010#if defined(LN) || defined(RT)
1011	movq	KK, %rax
1012	movq	AORIG, AO
1013	leaq	(, %rax, SIZE), %rax
1014	leaq	(AO, %rax, 1), AO
1015#endif
1016
1017	leaq	BUFFER, BO
1018
1019#if defined(LN) || defined(RT)
1020	movq	KK, %rax
1021	salq	$0 + BASE_SHIFT, %rax
1022	leaq	(BO, %rax, 2), BO
1023#endif
1024
1025	movsd	 0 * SIZE(AO), %xmm8
1026	pxor	%xmm0, %xmm0
1027	movsd	 0 * SIZE(BO), %xmm9
1028	pxor	%xmm1, %xmm1
1029	movsd	 4 * SIZE(AO), %xmm10
1030	pxor	%xmm2, %xmm2
1031	movsd	 8 * SIZE(BO), %xmm11
1032	pxor	%xmm3, %xmm3
1033
1034#if defined(LT) || defined(RN)
1035	movq	KK, %rax
1036#else
1037	movq	K, %rax
1038	subq	KK, %rax
1039#endif
1040	sarq	$3, %rax
1041	je	.L115
1042	ALIGN_4
1043
1044.L112:
1045	mulsd	%xmm8, %xmm9
1046	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
1047	movsd	 1 * SIZE(AO), %xmm8
1048	addsd	%xmm9, %xmm0
1049	movsd	16 * SIZE(BO), %xmm9
1050	mulsd	 2 * SIZE(BO), %xmm8
1051	addsd	%xmm8, %xmm1
1052	movsd	 2 * SIZE(AO), %xmm8
1053	mulsd	 4 * SIZE(BO), %xmm8
1054	addsd	%xmm8, %xmm2
1055	movsd	 3 * SIZE(AO), %xmm8
1056	mulsd	 6 * SIZE(BO), %xmm8
1057	addsd	%xmm8, %xmm3
1058	movsd	 8 * SIZE(AO), %xmm8
1059	mulsd	%xmm10, %xmm11
1060	movsd	 5 * SIZE(AO), %xmm10
1061	addsd	%xmm11, %xmm0
1062	movsd	24 * SIZE(BO), %xmm11
1063	mulsd	10 * SIZE(BO), %xmm10
1064	addsd	%xmm10, %xmm1
1065	movsd	 6 * SIZE(AO), %xmm10
1066	mulsd	12 * SIZE(BO), %xmm10
1067	addsd	%xmm10, %xmm2
1068	movsd	 7 * SIZE(AO), %xmm10
1069	mulsd	14 * SIZE(BO), %xmm10
1070	addsd	%xmm10, %xmm3
1071	movsd	12 * SIZE(AO), %xmm10
1072
1073	addq   $ 8 * SIZE, AO
1074	addq   $16 * SIZE, BO
1075	decq   %rax
1076	jne    .L112
1077	ALIGN_4
1078
1079.L115:
1080#if defined(LT) || defined(RN)
1081	movq	KK, %rax
1082#else
1083	movq	K, %rax
1084	subq	KK, %rax
1085#endif
1086	andq	$7, %rax		# if (k & 1)
1087	BRANCH
1088	je .L118
1089	ALIGN_4
1090
1091.L116:
1092	mulsd	%xmm8, %xmm9
1093	movsd	 1 * SIZE(AO), %xmm8
1094	addsd	%xmm9, %xmm0
1095	movsd	 2 * SIZE(BO), %xmm9
1096
1097	addq	$1 * SIZE, AO		# aoffset  += 4
1098	addq	$2 * SIZE, BO		# boffset1 += 8
1099	decq	%rax
1100	jg	.L116
1101	ALIGN_4
1102
1103.L118:
1104	addsd	%xmm2, %xmm0
1105	addsd	%xmm3, %xmm1
1106	addsd	%xmm1, %xmm0
1107
1108#if defined(LN) || defined(RT)
1109	movq	KK, %rax
1110#ifdef LN
1111	subq	$1, %rax
1112#else
1113	subq	$1, %rax
1114#endif
1115
1116	movq	AORIG, AO
1117	movq	BORIG, B
1118	leaq	BUFFER, BO
1119
1120	leaq	(, %rax, SIZE), %rax
1121	leaq	(AO, %rax, 1), AO
1122	leaq	(B,  %rax, 1), B
1123	leaq	(BO, %rax, 2), BO
1124#endif
1125
1126#if defined(LN) || defined(LT)
1127	movsd	 0 * SIZE(B), %xmm2
1128	subsd	%xmm0,  %xmm2
1129#else
1130	movsd	 0 * SIZE(AO), %xmm2
1131	subsd	%xmm0, %xmm2
1132#endif
1133
1134#ifdef LN
1135	movsd	 0 * SIZE(AO), %xmm4
1136	mulsd	 %xmm4, %xmm2
1137#endif
1138
1139#ifdef LT
1140	movsd	 0 * SIZE(AO), %xmm4
1141	mulsd	 %xmm4, %xmm2
1142#endif
1143
1144#ifdef RN
1145	movsd	 0 * SIZE(B), %xmm0
1146	mulsd	 %xmm0, %xmm2
1147#endif
1148
1149#ifdef RT
1150	movsd	 0 * SIZE(B), %xmm0
1151	mulsd	 %xmm0, %xmm2
1152#endif
1153
1154#ifdef LN
1155	subq	$1 * SIZE, CO1
1156#endif
1157
1158#if defined(LN) || defined(LT)
1159	movsd	%xmm2,  0 * SIZE(CO1)
1160#else
1161	movsd	%xmm2,  0 * SIZE(CO1)
1162#endif
1163
1164#if defined(LN) || defined(LT)
1165	movsd	%xmm2,   0 * SIZE(B)
1166
1167	movlpd	%xmm2,   0 * SIZE(BO)
1168	movlpd	%xmm2,   1 * SIZE(BO)
1169#else
1170	movsd	%xmm2,   0 * SIZE(AO)
1171#endif
1172
1173#ifndef LN
1174	addq	$1 * SIZE, CO1
1175#endif
1176
1177#if defined(LT) || defined(RN)
1178	movq	K,  %rax
1179	subq	KK, %rax
1180	leaq	(,%rax, SIZE), %rax
1181	leaq	(AO, %rax, 1), AO
1182#ifdef LT
1183	addq	$1 * SIZE, B
1184#endif
1185#endif
1186
1187#ifdef LN
1188	subq	$1, KK
1189	movq	BORIG, B
1190#endif
1191
1192#ifdef LT
1193	addq	$1, KK
1194#endif
1195
1196#ifdef RT
1197       movq	K, %rax
1198       movq	BORIG, B
1199       salq	$0 + BASE_SHIFT, %rax
1200       addq	%rax, AORIG
1201#endif
1202	ALIGN_4
1203
1204.L119:
1205#ifdef LN
1206       leaq	(, K, SIZE), %rax
1207       leaq	(B, %rax, 1), B
1208#endif
1209
1210#if defined(LT) || defined(RN)
1211	movq	K,  %rax
1212	subq	KK, %rax
1213	leaq	(,%rax, SIZE), %rax
1214	leaq	(B,  %rax, 1), B
1215#endif
1216
1217#ifdef RN
1218	addq	$1, KK
1219#endif
1220
1221#ifdef RT
1222	subq	$1, KK
1223#endif
1224	ALIGN_4
1225
1226
1227.L40:
1228	testq	$2, N
1229	je	.L80
1230	ALIGN_4
1231
1232.L41:
1233/* Copying to Sub Buffer */
1234
1235#ifdef LN
1236	movq	OFFSET, %rax
1237	addq	M, %rax
1238	movq	%rax, KK
1239#endif
1240
1241	leaq	BUFFER, BO
1242
1243#ifdef RT
1244       movq	K, %rax
1245       salq	$1 + BASE_SHIFT, %rax
1246       subq	%rax, B
1247#endif
1248
1249#if defined(LN) || defined(RT)
1250	movq	KK, %rax
1251	movq	B, BORIG
1252	leaq	(, %rax, SIZE), %rax
1253	leaq	(B,  %rax, 2), B
1254	leaq	(BO, %rax, 4), BO
1255#endif
1256
1257#ifdef LT
1258	movq	OFFSET, %rax
1259	movq	%rax, KK
1260#endif
1261
1262#if defined(LT) || defined(RN)
1263	movq	KK, %rax
1264#else
1265	movq	K, %rax
1266	subq	KK, %rax
1267#endif
1268	sarq	$2, %rax
1269	jle	.L43
1270	ALIGN_4
1271
1272.L42:
1273	PREFETCH	 56 * SIZE(B)
1274
1275	movsd	 0 * SIZE(B), %xmm0
1276	movsd	 1 * SIZE(B), %xmm1
1277	movsd	 2 * SIZE(B), %xmm2
1278	movsd	 3 * SIZE(B), %xmm3
1279	movsd	 4 * SIZE(B), %xmm4
1280	movsd	 5 * SIZE(B), %xmm5
1281	movsd	 6 * SIZE(B), %xmm6
1282	movsd	 7 * SIZE(B), %xmm7
1283
1284	addq	$ 8 * SIZE, B
1285	addq	$16 * SIZE, BO
1286
1287	movsd	%xmm0, -16 * SIZE(BO)
1288	movsd	%xmm0, -15 * SIZE(BO)
1289	movsd	%xmm1, -14 * SIZE(BO)
1290	movsd	%xmm1, -13 * SIZE(BO)
1291	movsd	%xmm2, -12 * SIZE(BO)
1292	movsd	%xmm2, -11 * SIZE(BO)
1293	movsd	%xmm3, -10 * SIZE(BO)
1294	movsd	%xmm3,  -9 * SIZE(BO)
1295	movsd	%xmm4,  -8 * SIZE(BO)
1296	movsd	%xmm4,  -7 * SIZE(BO)
1297	movsd	%xmm5,  -6 * SIZE(BO)
1298	movsd	%xmm5,  -5 * SIZE(BO)
1299	movsd	%xmm6,  -4 * SIZE(BO)
1300	movsd	%xmm6,  -3 * SIZE(BO)
1301	movsd	%xmm7,  -2 * SIZE(BO)
1302	movsd	%xmm7,  -1 * SIZE(BO)
1303
1304	decq	%rax
1305	jne	.L42
1306	ALIGN_4
1307
1308.L43:
1309#if defined(LT) || defined(RN)
1310	movq	KK, %rax
1311#else
1312	movq	K, %rax
1313	subq	KK, %rax
1314#endif
1315	andq	$3, %rax
1316	BRANCH
1317	jle	.L50
1318	ALIGN_4
1319
1320.L44:
1321	movsd	 0 * SIZE(B), %xmm0
1322	movsd	 1 * SIZE(B), %xmm1
1323
1324	movsd	%xmm0,  0 * SIZE(BO)
1325	movsd	%xmm0,  1 * SIZE(BO)
1326	movsd	%xmm1,  2 * SIZE(BO)
1327	movsd	%xmm1,  3 * SIZE(BO)
1328
1329	addq	$2 * SIZE, B
1330	addq	$4 * SIZE, BO
1331	decq	%rax
1332	jne	.L44
1333	ALIGN_4
1334
1335.L50:
1336#if defined(LT) || defined(RN)
1337	movq	A, AO
1338#else
1339	movq	A, AORIG
1340#endif
1341
1342#ifdef RT
1343       leaq	(, LDC, 2), %rax
1344       subq	%rax, C
1345#endif
1346
1347	movq	C, CO1			# coffset1 = c
1348	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
1349#ifndef RT
1350	leaq	(C, LDC, 2), C
1351#endif
1352
1353	movq	M,  I
1354	sarq	$2, I	# i = (m >> 2)
1355	jle	.L60
1356	ALIGN_4
1357
.L51:
# GEMM part of the 4x2 tile.  Accumulators (packed pairs of doubles):
#   xmm0/xmm1 = rows 0-1 x cols 0/1,  xmm4/xmm5 = rows 2-3 x cols 0/1.
# A is read 4 elements per k step; BO holds each B value duplicated.
#ifdef LN
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax	# rax = K * 4 * SIZE
       subq	%rax, AORIG		# step back one 4-row panel of A
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# AO += KK * 4 elements
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + BASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO	# BO += KK * 4 elements (n=2, duplicated)
#endif

	movapd	 0 * SIZE(AO), %xmm8	# software-pipelined preloads
	pxor	%xmm0, %xmm0		# clear accumulators
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5

	movapd	16 * SIZE(AO), %xmm12
	movapd	16 * SIZE(BO), %xmm13
	movapd	24 * SIZE(AO), %xmm14
	movapd	24 * SIZE(BO), %xmm15

	PREFETCHW      4 * SIZE(CO1)	# prefetch C tile for the coming stores
	PREFETCHW      4 * SIZE(CO2)

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# inner loop is unrolled 8x in k
	je	.L55
	ALIGN_4

# 8x-unrolled rank-1 update loop.  Each group of ~12 instructions handles
# two k iterations; loads for the next group are interleaved with the
# multiplies of the current one (register rotation xmm8..xmm15).
.L52:
	mulpd	%xmm8, %xmm9
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 0 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movapd	 2 * SIZE(AO), %xmm8
	mulpd	%xmm8, %xmm9
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm4
	movapd	 4 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm5
	movapd	 4 * SIZE(AO), %xmm8

	mulpd	%xmm8, %xmm9
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 4 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movapd	 6 * SIZE(AO), %xmm8
	mulpd	%xmm8, %xmm9
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm4
	movapd	32 * SIZE(BO), %xmm9	# preload for next loop iteration
	addpd	%xmm8, %xmm5
	movapd	32 * SIZE(AO), %xmm8

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm10, %xmm11
	mulpd	10 * SIZE(BO), %xmm10
	addpd	%xmm11, %xmm0
	movapd	 8 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm1
	movapd	10 * SIZE(AO), %xmm10
	mulpd	%xmm10, %xmm11
	mulpd	10 * SIZE(BO), %xmm10
	addpd	%xmm11, %xmm4
	movapd	12 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm5
	movapd	12 * SIZE(AO), %xmm10

	mulpd	%xmm10, %xmm11
	mulpd	14 * SIZE(BO), %xmm10
	addpd	%xmm11, %xmm0
	movapd	12 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm1
	movapd	14 * SIZE(AO), %xmm10
	mulpd	%xmm10, %xmm11
	mulpd	14 * SIZE(BO), %xmm10
	addpd	%xmm11, %xmm4
	movapd	40 * SIZE(BO), %xmm11	# preload for next loop iteration
	addpd	%xmm10, %xmm5
	movapd	40 * SIZE(AO), %xmm10

	PREFETCH	(PREFETCHSIZE     + 16) * SIZE(AO)
	mulpd	%xmm12, %xmm13
	mulpd	18 * SIZE(BO), %xmm12
	addpd	%xmm13, %xmm0
	movapd	16 * SIZE(BO), %xmm13
	addpd	%xmm12, %xmm1
	movapd	18 * SIZE(AO), %xmm12
	mulpd	%xmm12, %xmm13
	mulpd	18 * SIZE(BO), %xmm12
	addpd	%xmm13, %xmm4
	movapd	20 * SIZE(BO), %xmm13
	addpd	%xmm12, %xmm5
	movapd	20 * SIZE(AO), %xmm12

	mulpd	%xmm12, %xmm13
	mulpd	22 * SIZE(BO), %xmm12
	addpd	%xmm13, %xmm0
	movapd	20 * SIZE(BO), %xmm13
	addpd	%xmm12, %xmm1
	movapd	22 * SIZE(AO), %xmm12
	mulpd	%xmm12, %xmm13
	mulpd	22 * SIZE(BO), %xmm12
	addpd	%xmm13, %xmm4
	movapd	48 * SIZE(BO), %xmm13	# preload for next loop iteration
	addpd	%xmm12, %xmm5
	movapd	48 * SIZE(AO), %xmm12

	PREFETCH	(PREFETCHSIZE     + 24) * SIZE(AO)
	mulpd	%xmm14, %xmm15
	mulpd	26 * SIZE(BO), %xmm14
	addpd	%xmm15, %xmm0
	movapd	24 * SIZE(BO), %xmm15
	addpd	%xmm14, %xmm1
	movapd	26 * SIZE(AO), %xmm14
	mulpd	%xmm14, %xmm15
	mulpd	26 * SIZE(BO), %xmm14
	addpd	%xmm15, %xmm4
	movapd	28 * SIZE(BO), %xmm15
	addpd	%xmm14, %xmm5
	movapd	28 * SIZE(AO), %xmm14

	mulpd	%xmm14, %xmm15
	mulpd	30 * SIZE(BO), %xmm14
	addpd	%xmm15, %xmm0
	movapd	28 * SIZE(BO), %xmm15
	addpd	%xmm14, %xmm1
	movapd	30 * SIZE(AO), %xmm14
	mulpd	%xmm14, %xmm15
	mulpd	30 * SIZE(BO), %xmm14
	addpd	%xmm15, %xmm4
	movapd	56 * SIZE(BO), %xmm15	# preload for next loop iteration
	addpd	%xmm14, %xmm5
	movapd	56 * SIZE(AO), %xmm14

	addq   $32 * SIZE, AO		# 8 k-steps * 4 A elements
	addq   $32 * SIZE, BO		# 8 k-steps * 4 BO elements
	decq   %rax
	jne    .L52
	ALIGN_4
1521
.L55:
# Remainder: handle the k & 7 leftover iterations one at a time.
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# k & 7 leftover iterations
	BRANCH
	je .L59
	ALIGN_4

.L56:
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9		# rows 0-1 x col pair 0
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8	# rows 0-1 x col pair 1
	addpd	%xmm8, %xmm1
	movapd	 2 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9		# rows 2-3 x col pair 0
	addpd	%xmm9, %xmm4
	mulpd	 2 * SIZE(BO), %xmm8	# rows 2-3 x col pair 1
	addpd	%xmm8, %xmm5
	movapd	 4 * SIZE(AO), %xmm8

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L56
	ALIGN_4
1553
.L59:
# Rewind packed pointers to this tile's data, load the right-hand side,
# subtract the accumulated GEMM product, and run the triangular solve.
# NOTE(review): diagonal entries are multiplied rather than divided, i.e.
# they are assumed pre-inverted by the packing routine (standard for these
# TRSM kernels) — confirm against the corresponding pack code.
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax		# step back by m-block size (4 rows)
#else
	subq	$2, %rax		# step back by n-block size (2 cols)
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# 4 A elements per k
	leaq	(B,  %rax, 2), B	# 2 B elements per k
	leaq	(BO, %rax, 4), BO	# 4 buffered (duplicated) elements per k
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0		# transpose accumulators: column pairs
	unpckhpd %xmm1, %xmm8		# -> row pairs, as stored in packed B

	movapd	%xmm4, %xmm12
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm12

	movapd	 0 * SIZE(B), %xmm1	# rhs rows 0..3 (2 cols each)
	movapd	 2 * SIZE(B), %xmm5
	movapd	 4 * SIZE(B), %xmm9
	movapd	 6 * SIZE(B), %xmm13

	subpd	%xmm0,  %xmm1		# rhs -= accumulated product
	subpd	%xmm8,  %xmm5
	subpd	%xmm4,  %xmm9
	subpd	%xmm12, %xmm13
#else
	movapd	 0 * SIZE(AO), %xmm8	# rhs kept in packed-A layout
	movapd	 2 * SIZE(AO), %xmm9
	movapd	 4 * SIZE(AO), %xmm10
	movapd	 6 * SIZE(AO), %xmm11

	subpd	%xmm0, %xmm8
	subpd	%xmm4, %xmm9
	subpd	%xmm1, %xmm10
	subpd	%xmm5, %xmm11
#endif

#ifdef LN
# Backward substitution with the 4x4 lower-triangular block of A,
# processed from the last row upwards.  Each diagonal entry is broadcast
# into both lanes (movlpd+movhpd) so one mulpd solves both columns.
	movlpd	15 * SIZE(AO), %xmm0
	movhpd	15 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm13		# x3 *= 1/a33
	movlpd	14 * SIZE(AO), %xmm2
	movhpd	14 * SIZE(AO), %xmm2
	mulpd	 %xmm13, %xmm2
	subpd	 %xmm2, %xmm9		# x2 -= a32 * x3
	movlpd	13 * SIZE(AO), %xmm4
	movhpd	13 * SIZE(AO), %xmm4
	mulpd	 %xmm13, %xmm4
	subpd	 %xmm4, %xmm5		# x1 -= a31 * x3
	movlpd	12 * SIZE(AO), %xmm6
	movhpd	12 * SIZE(AO), %xmm6
	mulpd	 %xmm13, %xmm6
	subpd	 %xmm6, %xmm1		# x0 -= a30 * x3

	movlpd	10 * SIZE(AO), %xmm0
	movhpd	10 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm9		# x2 *= 1/a22
	movlpd	 9 * SIZE(AO), %xmm2
	movhpd	 9 * SIZE(AO), %xmm2
	mulpd	 %xmm9, %xmm2
	subpd	 %xmm2, %xmm5		# x1 -= a21 * x2
	movlpd	 8 * SIZE(AO), %xmm4
	movhpd	 8 * SIZE(AO), %xmm4
	mulpd	 %xmm9, %xmm4
	subpd	 %xmm4, %xmm1		# x0 -= a20 * x2

	movlpd	 5 * SIZE(AO), %xmm0
	movhpd	 5 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5		# x1 *= 1/a11
	movlpd	 4 * SIZE(AO), %xmm2
	movhpd	 4 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm1		# x0 -= a10 * x1

	movlpd	 0 * SIZE(AO), %xmm0
	movhpd	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1		# x0 *= 1/a00
#endif

#ifdef LT
# Forward substitution with the 4x4 upper-triangular block of A,
# processed from the first row downwards.
	movlpd	 0 * SIZE(AO), %xmm0
	movhpd	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1		# x0 *= 1/a00
	movlpd	 1 * SIZE(AO), %xmm2
	movhpd	 1 * SIZE(AO), %xmm2
	mulpd	 %xmm1, %xmm2
	subpd	 %xmm2, %xmm5		# x1 -= a01 * x0
	movlpd	 2 * SIZE(AO), %xmm4
	movhpd	 2 * SIZE(AO), %xmm4
	mulpd	 %xmm1, %xmm4
	subpd	 %xmm4, %xmm9		# x2 -= a02 * x0
	movlpd	 3 * SIZE(AO), %xmm6
	movhpd	 3 * SIZE(AO), %xmm6
	mulpd	 %xmm1, %xmm6
	subpd	 %xmm6, %xmm13		# x3 -= a03 * x0


	movlpd	 5 * SIZE(AO), %xmm0
	movhpd	 5 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5		# x1 *= 1/a11

	movlpd	 6 * SIZE(AO), %xmm2
	movhpd	 6 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm9		# x2 -= a12 * x1
	movlpd	 7 * SIZE(AO), %xmm4
	movhpd	 7 * SIZE(AO), %xmm4
	mulpd	 %xmm5, %xmm4
	subpd	 %xmm4, %xmm13		# x3 -= a13 * x1

	movlpd	10 * SIZE(AO), %xmm0
	movhpd	10 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm9		# x2 *= 1/a22
	movlpd	11 * SIZE(AO), %xmm2
	movhpd	11 * SIZE(AO), %xmm2
	mulpd	 %xmm9, %xmm2
	subpd	 %xmm2, %xmm13		# x3 -= a23 * x2

	movlpd	15 * SIZE(AO), %xmm0
	movhpd	15 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm13		# x3 *= 1/a33
#endif

#ifdef RN
# Solve against the 2x2 upper-triangular block of B (columns forward).
	movlpd	 0 * SIZE(B), %xmm0
	movhpd	 0 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm8		# col0 *= 1/b00
	mulpd	 %xmm0, %xmm9

	movlpd	 1 * SIZE(B), %xmm1
	movhpd	 1 * SIZE(B), %xmm1
	mulpd	 %xmm8, %xmm1
	subpd	 %xmm1, %xmm10		# col1 -= b01 * col0
	movlpd	 1 * SIZE(B), %xmm1
	movhpd	 1 * SIZE(B), %xmm1
	mulpd	 %xmm9, %xmm1
	subpd	 %xmm1, %xmm11

	movlpd	 3 * SIZE(B), %xmm0
	movhpd	 3 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm10		# col1 *= 1/b11
	mulpd	 %xmm0, %xmm11
#endif

#ifdef RT
# Solve against the 2x2 lower-triangular block of B (columns backward).
	movlpd	 3 * SIZE(B), %xmm0
	movhpd	 3 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm10		# col1 *= 1/b11
	mulpd	 %xmm0, %xmm11

	movlpd	 2 * SIZE(B), %xmm1
	movhpd	 2 * SIZE(B), %xmm1
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm8		# col0 -= b10 * col1
	movlpd	 2 * SIZE(B), %xmm1
	movhpd	 2 * SIZE(B), %xmm1
	mulpd	 %xmm11, %xmm1
	subpd	 %xmm1, %xmm9

	movlpd	 0 * SIZE(B), %xmm0
	movhpd	 0 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm8		# col0 *= 1/b00
	mulpd	 %xmm0, %xmm9
#endif
1730
#ifdef LN
	subq	$4 * SIZE, CO1		# LN walks C backwards by the tile width
	subq	$4 * SIZE, CO2
#endif

# Store the solved 4x2 tile to C.  In the LN/LT layout each xmm holds one
# row (low lane = col0 -> CO1, high lane = col1 -> CO2); in RN/RT each
# xmm holds two consecutive rows of one column.
#if defined(LN) || defined(LT)
	movsd	%xmm1,  0 * SIZE(CO1)
	movsd	%xmm5,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movsd	%xmm13, 3 * SIZE(CO1)

	movhpd	%xmm1,  0 * SIZE(CO2)
	movhpd	%xmm5,  1 * SIZE(CO2)
	movhpd	%xmm9,  2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)
#else
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	%xmm10,  0 * SIZE(CO2)
	movhpd	%xmm10,  1 * SIZE(CO2)
	movsd	%xmm11,  2 * SIZE(CO2)
	movhpd	%xmm11,  3 * SIZE(CO2)
#endif

# Write the solution back into the packed buffers so later tiles of the
# same panel see the updated values (BO keeps the duplicated layout).
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(B)
	movapd	%xmm5,   2 * SIZE(B)
	movapd	%xmm9,   4 * SIZE(B)
	movapd	%xmm13,  6 * SIZE(B)

	movlpd	%xmm1,   0 * SIZE(BO)
	movlpd	%xmm1,   1 * SIZE(BO)
	movhpd	%xmm1,   2 * SIZE(BO)
	movhpd	%xmm1,   3 * SIZE(BO)
	movlpd	%xmm5,   4 * SIZE(BO)
	movlpd	%xmm5,   5 * SIZE(BO)
	movhpd	%xmm5,   6 * SIZE(BO)
	movhpd	%xmm5,   7 * SIZE(BO)
	movlpd	%xmm9,   8 * SIZE(BO)
	movlpd	%xmm9,   9 * SIZE(BO)
	movhpd	%xmm9,  10 * SIZE(BO)
	movhpd	%xmm9,  11 * SIZE(BO)
	movlpd	%xmm13, 12 * SIZE(BO)
	movlpd	%xmm13, 13 * SIZE(BO)
	movhpd	%xmm13, 14 * SIZE(BO)
	movhpd	%xmm13, 15 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm9,   2 * SIZE(AO)
	movapd	%xmm10,  4 * SIZE(AO)
	movapd	%xmm11,  6 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1		# advance C pointers past the stored tile
	addq	$4 * SIZE, CO2
#endif

# Per-tile bookkeeping: move AO/B past the untouched trailing part of the
# panel (LT/RN) and maintain KK / AORIG for the next tile.
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
#ifdef LT
	addq	$8 * SIZE, B		# 4 rows x 2 cols just solved
#endif
#endif

#ifdef LN
	subq	$4, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$4, KK
#endif

#ifdef RT
       movq	K, %rax
       movq	BORIG, B
       salq	$2 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# next 4-row panel of A
#endif

	decq	I			# i --
	jg	.L51
	ALIGN_4
1821
.L60:
	testq	$2, M			# 2 leftover rows?
	je	.L70
	ALIGN_4

# GEMM part of the 2x2 tile.  Accumulators xmm0/xmm1 (+ xmm2/xmm3 as a
# second pair, folded together at .L69).
.L61:
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax	# rax = K * 2 * SIZE
       subq	%rax, AORIG		# step back one 2-row panel of A
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO	# AO += KK * 2 elements
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + BASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO	# BO += KK * 4 elements (n=2, duplicated)
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear accumulators
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

	movapd	16 * SIZE(BO), %xmm13
	movapd	24 * SIZE(BO), %xmm15

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# inner loop is unrolled 8x in k
	je	.L65
	ALIGN_4

# 8x-unrolled loop: 2 A elements and 4 BO elements per k step.
.L62:
	mulpd	%xmm8, %xmm9
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 4 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movapd	 2 * SIZE(AO), %xmm8

	mulpd	%xmm8, %xmm9
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm2
	movapd	32 * SIZE(BO), %xmm9	# preload for next iteration
	addpd	%xmm8, %xmm3
	movapd	 4 * SIZE(AO), %xmm8

	mulpd	%xmm8, %xmm11
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm11, %xmm0
	movapd	12 * SIZE(BO), %xmm11
	addpd	%xmm8, %xmm1
	movapd	 6 * SIZE(AO), %xmm8

	mulpd	%xmm8, %xmm11
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm11, %xmm2
	movapd	40 * SIZE(BO), %xmm11	# preload for next iteration
	addpd	%xmm8, %xmm3
	movapd	16 * SIZE(AO), %xmm8

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm10, %xmm13
	mulpd	18 * SIZE(BO), %xmm10
	addpd	%xmm13, %xmm0
	movapd	20 * SIZE(BO), %xmm13
	addpd	%xmm10, %xmm1
	movapd	10 * SIZE(AO), %xmm10

	mulpd	%xmm10, %xmm13
	mulpd	22 * SIZE(BO), %xmm10
	addpd	%xmm13, %xmm2
	movapd	48 * SIZE(BO), %xmm13	# preload for next iteration
	addpd	%xmm10, %xmm3
	movapd	12 * SIZE(AO), %xmm10

	mulpd	%xmm10, %xmm15
	mulpd	26 * SIZE(BO), %xmm10
	addpd	%xmm15, %xmm0
	movapd	28 * SIZE(BO), %xmm15
	addpd	%xmm10, %xmm1
	movapd	14 * SIZE(AO), %xmm10

	mulpd	%xmm10, %xmm15
	mulpd	30 * SIZE(BO), %xmm10
	addpd	%xmm15, %xmm2
	movapd	56 * SIZE(BO), %xmm15	# preload for next iteration
	addpd	%xmm10, %xmm3
	movapd	24 * SIZE(AO), %xmm10

	addq   $16 * SIZE, AO		# 8 k-steps * 2 A elements
	addq   $32 * SIZE, BO		# 8 k-steps * 4 BO elements
	decq   %rax
	jne    .L62
	ALIGN_4

.L65:
# Remainder: handle the k & 7 leftover iterations one at a time.
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# k & 7 leftover iterations
	BRANCH
	je .L69
	ALIGN_4

.L66:
	mulpd	%xmm8, %xmm9
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 4 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movapd	 2 * SIZE(AO), %xmm8

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L66
	ALIGN_4
1961
.L69:
	addpd	%xmm2, %xmm0		# fold the second accumulator pair
	addpd	%xmm3, %xmm1

# Rewind packed pointers, subtract product, triangular solve (2x2 tile).
# Diagonal entries are multiplied, i.e. assumed pre-inverted (see .L59).
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax		# m-block and n-block are both 2 here,
#else
	subq	$2, %rax		# so both branches step back by 2
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0		# transpose to row-pair layout
	unpckhpd %xmm1, %xmm8

	movapd	 0 * SIZE(B), %xmm1	# rhs rows 0-1
	movapd	 2 * SIZE(B), %xmm5

	subpd	%xmm0,  %xmm1
	subpd	%xmm8,  %xmm5
#else
	movapd	 0 * SIZE(AO), %xmm8
	movapd	 2 * SIZE(AO), %xmm10

	subpd	%xmm0, %xmm8
	subpd	%xmm1, %xmm10
#endif


#ifdef LN
# Backward substitution with the 2x2 lower-triangular block of A.
	movlpd	 3 * SIZE(AO), %xmm0
	movhpd	 3 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5		# x1 *= 1/a11

	movlpd	 2 * SIZE(AO), %xmm2
	movhpd	 2 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm1		# x0 -= a10 * x1

	movlpd	 0 * SIZE(AO), %xmm0
	movhpd	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1		# x0 *= 1/a00
#endif

#ifdef LT
# Forward substitution with the 2x2 upper-triangular block of A.
	movlpd	 0 * SIZE(AO), %xmm0
	movhpd	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1		# x0 *= 1/a00

	movlpd	 1 * SIZE(AO), %xmm2
	movhpd	 1 * SIZE(AO), %xmm2
	mulpd	 %xmm1, %xmm2
	subpd	 %xmm2, %xmm5		# x1 -= a01 * x0

	movlpd	 3 * SIZE(AO), %xmm0
	movhpd	 3 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5		# x1 *= 1/a11
#endif

#ifdef RN
# Solve against the 2x2 upper-triangular block of B (columns forward).
	movlpd	 0 * SIZE(B), %xmm0
	movhpd	 0 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm8		# col0 *= 1/b00

	movlpd	 1 * SIZE(B), %xmm1
	movhpd	 1 * SIZE(B), %xmm1
	mulpd	 %xmm8, %xmm1
	subpd	 %xmm1, %xmm10		# col1 -= b01 * col0

	movlpd	 3 * SIZE(B), %xmm0
	movhpd	 3 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm10		# col1 *= 1/b11
#endif

#ifdef RT
# Solve against the 2x2 lower-triangular block of B (columns backward).
	movlpd	 3 * SIZE(B), %xmm0
	movhpd	 3 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm10		# col1 *= 1/b11

	movlpd	 2 * SIZE(B), %xmm1
	movhpd	 2 * SIZE(B), %xmm1
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm8		# col0 -= b10 * col1

	movlpd	 0 * SIZE(B), %xmm0
	movhpd	 0 * SIZE(B), %xmm0
	mulpd	 %xmm0, %xmm8		# col0 *= 1/b00
#endif

#ifdef LN
	subq	$2 * SIZE, CO1		# LN walks C backwards
	subq	$2 * SIZE, CO2
#endif

# Store the solved 2x2 tile to C.
#if defined(LN) || defined(LT)
	movsd	%xmm1,  0 * SIZE(CO1)
	movsd	%xmm5,  1 * SIZE(CO1)

	movhpd	%xmm1,  0 * SIZE(CO2)
	movhpd	%xmm5,  1 * SIZE(CO2)
#else
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)

	movsd	%xmm10,  0 * SIZE(CO2)
	movhpd	%xmm10,  1 * SIZE(CO2)
#endif

# Write the solution back into the packed buffers (BO duplicated layout).
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(B)
	movapd	%xmm5,   2 * SIZE(B)

	movlpd	%xmm1,   0 * SIZE(BO)
	movlpd	%xmm1,   1 * SIZE(BO)
	movhpd	%xmm1,   2 * SIZE(BO)
	movhpd	%xmm1,   3 * SIZE(BO)
	movlpd	%xmm5,   4 * SIZE(BO)
	movlpd	%xmm5,   5 * SIZE(BO)
	movhpd	%xmm5,   6 * SIZE(BO)
	movhpd	%xmm5,   7 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm10,  2 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1		# advance C pointers past the tile
	addq	$2 * SIZE, CO2
#endif

# Per-tile bookkeeping (same pattern as the 4x2 tile).
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
#ifdef LT
	addq	$4 * SIZE, B		# 2 rows x 2 cols just solved
#endif
#endif

#ifdef LN
	subq	$2, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG		# next 2-row panel of A
#endif
	ALIGN_4
2130
.L70:
	testq	$1, M			# 1 leftover row?
	je	.L79
	ALIGN_4

# GEMM part of the 1x2 tile, done with scalar (sd) arithmetic.
# Accumulators xmm0/xmm1 (+ xmm2/xmm3, folded at .L78): one value per
# column of C.
.L71:
#ifdef LN
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax	# rax = K * 1 * SIZE
       subq	%rax, AORIG		# step back one 1-row panel of A
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO	# AO += KK elements
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + BASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO	# BO += KK * 4 elements (n=2, duplicated)
#endif

	movsd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear accumulators
	movsd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movsd	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movsd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

	movsd	16 * SIZE(BO), %xmm13
	movsd	24 * SIZE(BO), %xmm15

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# inner loop is unrolled 8x in k
	je	.L75
	ALIGN_4

# 8x-unrolled loop: 1 A element and 4 BO elements per k step.
.L72:
	mulsd	%xmm8, %xmm9
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulsd	 2 * SIZE(BO), %xmm8
	addsd	%xmm9, %xmm0
	movsd	 4 * SIZE(BO), %xmm9
	addsd	%xmm8, %xmm1
	movsd	 1 * SIZE(AO), %xmm8

	mulsd	%xmm8, %xmm9
	mulsd	 6 * SIZE(BO), %xmm8
	addsd	%xmm9, %xmm2
	movsd	32 * SIZE(BO), %xmm9	# preload for next iteration
	addsd	%xmm8, %xmm3
	movsd	 2 * SIZE(AO), %xmm8

	mulsd	%xmm8, %xmm11
	mulsd	10 * SIZE(BO), %xmm8
	addsd	%xmm11, %xmm0
	movsd	12 * SIZE(BO), %xmm11
	addsd	%xmm8, %xmm1
	movsd	 3 * SIZE(AO), %xmm8

	mulsd	%xmm8, %xmm11
	mulsd	14 * SIZE(BO), %xmm8
	addsd	%xmm11, %xmm2
	movsd	40 * SIZE(BO), %xmm11	# preload for next iteration
	addsd	%xmm8, %xmm3
	movsd	 8 * SIZE(AO), %xmm8

	mulsd	%xmm10, %xmm13
	mulsd	18 * SIZE(BO), %xmm10
	addsd	%xmm13, %xmm0
	movsd	20 * SIZE(BO), %xmm13
	addsd	%xmm10, %xmm1
	movsd	 5 * SIZE(AO), %xmm10

	mulsd	%xmm10, %xmm13
	mulsd	22 * SIZE(BO), %xmm10
	addsd	%xmm13, %xmm2
	movsd	48 * SIZE(BO), %xmm13	# preload for next iteration
	addsd	%xmm10, %xmm3
	movsd	 6 * SIZE(AO), %xmm10

	mulsd	%xmm10, %xmm15
	mulsd	26 * SIZE(BO), %xmm10
	addsd	%xmm15, %xmm0
	movsd	28 * SIZE(BO), %xmm15
	addsd	%xmm10, %xmm1
	movsd	 7 * SIZE(AO), %xmm10

	mulsd	%xmm10, %xmm15
	mulsd	30 * SIZE(BO), %xmm10
	addsd	%xmm15, %xmm2
	movsd	56 * SIZE(BO), %xmm15	# preload for next iteration
	addsd	%xmm10, %xmm3
	movsd	12 * SIZE(AO), %xmm10

	addq   $ 8 * SIZE, AO		# 8 k-steps * 1 A element
	addq   $32 * SIZE, BO		# 8 k-steps * 4 BO elements
	decq   %rax
	jne    .L72
	ALIGN_4

.L75:
# Remainder: handle the k & 7 leftover iterations one at a time.
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# k & 7 leftover iterations
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulsd	%xmm8, %xmm9
	mulsd	 2 * SIZE(BO), %xmm8
	addsd	%xmm9, %xmm0
	addsd	%xmm8, %xmm1
	movsd	 1 * SIZE(AO), %xmm8
	movsd	 4 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L76
	ALIGN_4
2269
.L78:
	addsd	%xmm2, %xmm0		# fold the second accumulator pair
	addsd	%xmm3, %xmm1

# Rewind packed pointers, subtract product, solve the 1x2 tile.
# A's diagonal reduces to a single scalar; B's 2x2 triangle as before.
# Diagonal entries are multiplied, i.e. assumed pre-inverted (see .L59).
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax		# step back by m-block size (1)
#else
	subq	$2, %rax		# step back by n-block size (2)
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
	movsd	 0 * SIZE(B), %xmm4	# rhs: one row, two columns
	movsd	 1 * SIZE(B), %xmm5
#else
	movsd	 0 * SIZE(AO), %xmm4
	movsd	 1 * SIZE(AO), %xmm5
#endif

	subsd	%xmm0,  %xmm4		# rhs -= accumulated product
	subsd	%xmm1,  %xmm5

#ifdef LN
	movsd	 0 * SIZE(AO), %xmm0	# scale both columns by 1/a00

	mulsd	 %xmm0, %xmm4
	mulsd	 %xmm0, %xmm5
#endif

#ifdef LT
	movsd	 0 * SIZE(AO), %xmm0	# same as LN: 1x1 triangle

	mulsd	 %xmm0, %xmm4
	mulsd	 %xmm0, %xmm5
#endif

#ifdef RN
# Forward solve against B's 2x2 upper triangle.
	mulsd	 0 * SIZE(B), %xmm4	# col0 *= 1/b00
	movsd	 1 * SIZE(B), %xmm1
	mulsd	 %xmm4, %xmm1
	subsd	 %xmm1, %xmm5		# col1 -= b01 * col0

	mulsd	 3 * SIZE(B), %xmm5	# col1 *= 1/b11
#endif

#ifdef RT
# Backward solve against B's 2x2 lower triangle.
	mulsd	 3 * SIZE(B), %xmm5	# col1 *= 1/b11

	movlpd	 2 * SIZE(B), %xmm1
	mulsd	 %xmm5, %xmm1
	subsd	 %xmm1, %xmm4		# col0 -= b10 * col1

	mulsd	 0 * SIZE(B), %xmm4	# col0 *= 1/b00
#endif

#ifdef LN
	subq	$1 * SIZE, CO1		# LN walks C backwards
	subq	$1 * SIZE, CO2
#endif

	movsd	%xmm4,  0 * SIZE(CO1)	# store the solved 1x2 tile
	movsd	%xmm5,  0 * SIZE(CO2)

# Write the solution back into the packed buffers (BO duplicated layout).
#if defined(LN) || defined(LT)
	movsd	%xmm4,   0 * SIZE(B)
	movsd	%xmm5,   1 * SIZE(B)

	movsd	%xmm4,   0 * SIZE(BO)
	movsd	%xmm4,   1 * SIZE(BO)
	movsd	%xmm5,   2 * SIZE(BO)
	movsd	%xmm5,   3 * SIZE(BO)
#else
	movsd	%xmm4,   0 * SIZE(AO)
	movsd	%xmm5,   1 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1		# advance C pointers past the tile
	addq	$1 * SIZE, CO2
#endif

# Per-tile bookkeeping (same pattern as the wider tiles).
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
#ifdef LT
	addq	$2 * SIZE, B		# 1 row x 2 cols just solved
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG		# next 1-row panel of A
#endif
	ALIGN_4
2388
.L79:
# End of the n=2 column block: advance B past the panel and update KK.
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 2), B		# B += K * 2 elements
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B,  %rax, 2), B	# skip the unsolved tail of the panel
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif
	ALIGN_4
2410
.L80:
	movq	N,  J
	sarq	$2, J		# j = (n >> 2): number of full 4-column blocks
	jle	.L999

.L01:
/* Copying to Sub Buffer */
# Head of the n=4 column-block loop: reset KK per variant, position B,
# and pack the n=4 panel of B into BUFFER with each value duplicated.

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK	# LN starts with kk = offset + m
#endif

	leaq	BUFFER, BO

#ifdef RT
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       subq	%rax, B			# RT: step B back one 4-column panel
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	B, BORIG
	leaq	(, %rax, SIZE), %rax
	leaq	(B,  %rax, 4), B	# skip KK * 4 source elements
	leaq	(BO, %rax, 8), BO	# skip KK * 8 buffered (duplicated) elements
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK	# LT starts with kk = offset
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		# k >> 2 four-row groups to copy
	jle	.L03
	
	addq	%rax, %rax		# doubled: each .L02 pass copies 8 elements
	ALIGN_4
2457
# Pack 8 consecutive elements of B per pass (rax was pre-doubled, so this
# covers k>>2 groups of 4 rows x 2 passes), duplicating each scalar into
# an adjacent pair in BO for packed loads in the kernel.
.L02:
	PREFETCHNTA	 40 * SIZE(B)

	movsd	 0 * SIZE(B), %xmm0
	movsd	 1 * SIZE(B), %xmm1
	movsd	 2 * SIZE(B), %xmm2
	movsd	 3 * SIZE(B), %xmm3
	movsd	 4 * SIZE(B), %xmm4
	movsd	 5 * SIZE(B), %xmm5
	movsd	 6 * SIZE(B), %xmm6
	movsd	 7 * SIZE(B), %xmm7

	addq	$16 * SIZE, BO		# advance first, then store at negative
	addq	$ 8 * SIZE, B		# offsets from the new pointers

	movsd	%xmm0, -16 * SIZE(BO)
	movsd	%xmm0, -15 * SIZE(BO)
	movsd	%xmm1, -14 * SIZE(BO)
	movsd	%xmm1, -13 * SIZE(BO)
	movsd	%xmm2, -12 * SIZE(BO)
	movsd	%xmm2, -11 * SIZE(BO)
	movsd	%xmm3, -10 * SIZE(BO)
	movsd	%xmm3,  -9 * SIZE(BO)
	movsd	%xmm4,  -8 * SIZE(BO)
	movsd	%xmm4,  -7 * SIZE(BO)
	movsd	%xmm5,  -6 * SIZE(BO)
	movsd	%xmm5,  -5 * SIZE(BO)
	movsd	%xmm6,  -4 * SIZE(BO)
	movsd	%xmm6,  -3 * SIZE(BO)
	movsd	%xmm7,  -2 * SIZE(BO)
	movsd	%xmm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L02
	ALIGN_4
2493
.L03:
# Remainder: copy the k & 3 leftover rows (4 elements each) of the n=4
# panel, still duplicating every scalar.
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# k & 3 leftover rows
	BRANCH
	jle	.L10
	ALIGN_4

.L04:
	movsd	 0 * SIZE(B), %xmm0
	movsd	 1 * SIZE(B), %xmm1
	movsd	 2 * SIZE(B), %xmm2
	movsd	 3 * SIZE(B), %xmm3

	movsd	%xmm0,  0 * SIZE(BO)
	movsd	%xmm0,  1 * SIZE(BO)
	movsd	%xmm1,  2 * SIZE(BO)
	movsd	%xmm1,  3 * SIZE(BO)
	movsd	%xmm2,  4 * SIZE(BO)
	movsd	%xmm2,  5 * SIZE(BO)
	movsd	%xmm3,  6 * SIZE(BO)
	movsd	%xmm3,  7 * SIZE(BO)

	addq	$4 * SIZE, B		# consumed 4 elements of B
	addq	$8 * SIZE, BO		# produced 8 elements in BO
	decq	%rax
	jne	.L04
	ALIGN_4
2526
.L10:
# Set up A and C pointers for the n=4 column block.  CO1/CO2 cover the
# first two columns; columns 2-3 are addressed as CO1/CO2 + 2*LDC.
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       leaq	(, LDC, 4), %rax	# RT walks C backwards: step back 4 columns
       subq	%rax, C
#endif

	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
#ifndef RT
	leaq	(C, LDC, 4), C		# advance C past the 4 columns just assigned
#endif

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L20			# no full 4-row tiles
	ALIGN_4
2549
.L11:
# GEMM part of the 4x4 tile.  Accumulators xmm0-xmm7 (packed pairs):
# xmm0-xmm3 = rows 0-1 x cols 0-3, xmm4-xmm7 = rows 2-3 x cols 0-3.
#ifdef LN
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax	# rax = K * 4 * SIZE
       subq	%rax, AORIG		# step back one 4-row panel of A
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# AO += KK * 4 elements
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$2 + BASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO	# BO += KK * 8 elements (n=4, duplicated)
#endif

	movapd	 0 * SIZE(BO), %xmm9	# software-pipelined preloads
	movapd	 2 * SIZE(BO), %xmm11
	movapd	 4 * SIZE(BO), %xmm13
	movapd	 8 * SIZE(BO), %xmm15

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear all eight accumulators
	movapd	 2 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movapd	 4 * SIZE(AO), %xmm12
	pxor	%xmm2, %xmm2
	movapd	 6 * SIZE(AO), %xmm14
	pxor	%xmm3, %xmm3

	PREFETCHW      4 * SIZE(CO1)	# prefetch all four C columns for writing
	pxor	%xmm4, %xmm4
	PREFETCHW      4 * SIZE(CO2)
	pxor	%xmm5, %xmm5
	PREFETCHW      4 * SIZE(CO1, LDC, 2)
	pxor	%xmm6, %xmm6
	PREFETCHW      4 * SIZE(CO2, LDC, 2)
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-8, %rax		# full 8-step groups only
	salq	$4, %rax		# rax = (k & ~7) * 16: loop counter in
	je	.L15			# the units consumed by .L1X / .L12
# Fully unrolled main loop built from the KERNEL1..KERNEL8 macros (defined
# earlier in this file, outside this view); each KERNELn(offset) appears to
# perform one k step of the 4x4 rank-1 update at the given element offset —
# confirm against the macro definitions.  Up to 64 k steps are emitted
# straight-line, with early exits every 16 steps.
.L1X:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)
	cmpq	$64 *  2, %rax		# <= 16 steps remaining? stop unrolling
	jle	.L12
	KERNEL1(16 *  2)
	KERNEL2(16 *  2)
	KERNEL3(16 *  2)
	KERNEL4(16 *  2)
	KERNEL5(16 *  2)
	KERNEL6(16 *  2)
	KERNEL7(16 *  2)
	KERNEL8(16 *  2)
	KERNEL1(16 *  3)
	KERNEL2(16 *  3)
	KERNEL3(16 *  3)
	KERNEL4(16 *  3)
	KERNEL5(16 *  3)
	KERNEL6(16 *  3)
	KERNEL7(16 *  3)
	KERNEL8(16 *  3)
	cmpq	$64 *  4, %rax
	jle	.L12
	KERNEL1(16 *  4)
	KERNEL2(16 *  4)
	KERNEL3(16 *  4)
	KERNEL4(16 *  4)
	KERNEL5(16 *  4)
	KERNEL6(16 *  4)
	KERNEL7(16 *  4)
	KERNEL8(16 *  4)
	KERNEL1(16 *  5)
	KERNEL2(16 *  5)
	KERNEL3(16 *  5)
	KERNEL4(16 *  5)
	KERNEL5(16 *  5)
	KERNEL6(16 *  5)
	KERNEL7(16 *  5)
	KERNEL8(16 *  5)
	cmpq	$64 *  6, %rax
	jle	.L12
	KERNEL1(16 *  6)
	KERNEL2(16 *  6)
	KERNEL3(16 *  6)
	KERNEL4(16 *  6)
	KERNEL5(16 *  6)
	KERNEL6(16 *  6)
	KERNEL7(16 *  6)
	KERNEL8(16 *  6)
	KERNEL1(16 *  7)
	KERNEL2(16 *  7)
	KERNEL3(16 *  7)
	KERNEL4(16 *  7)
	KERNEL5(16 *  7)
	KERNEL6(16 *  7)
	KERNEL7(16 *  7)
	KERNEL8(16 *  7)

	addq	$16 * 8  * SIZE, AO	# 64 k-steps consumed without pointer
	addq	$32 * 8  * SIZE, BO	# updates inside the macros
	subq	$64 * 8, %rax
	jg	.L1X

.L12:
	leaq	(AO, %rax, 2), AO	# * 16: advance past the partial group
	leaq	(BO, %rax, 4), BO	# * 64
	ALIGN_4
2685
.L15:
# Remainder: handle the k & 7 leftover iterations one at a time.
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# k & 7 leftover iterations
	BRANCH
	je .L19
	ALIGN_4

.L16:
	mulpd	%xmm8, %xmm9		# rows 0-1 x cols 0..3
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm2
	movapd	 0 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm3
	movapd	 4 * SIZE(AO), %xmm8
	mulpd	%xmm10, %xmm9		# rows 2-3 x cols 0..3
	addpd	%xmm9, %xmm4
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm5
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	mulpd	 6 * SIZE(BO), %xmm10
	addpd	%xmm9, %xmm6
	movapd	 8 * SIZE(BO), %xmm9
	addpd	%xmm10, %xmm7
	movapd	 6 * SIZE(AO), %xmm10

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L16
	ALIGN_4
2729
.L19:
# Rewind packed pointers to this 4x4 tile's data, transpose the
# accumulators where needed, and load/subtract the right-hand side.
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax		# m-block and n-block are both 4 here,
#else
	subq	$4, %rax		# so both branches step back by 4
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# 4 A elements per k
	leaq	(B,  %rax, 4), B	# 4 B elements per k
	leaq	(BO, %rax, 8), BO	# 8 buffered (duplicated) elements per k
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0		# transpose accumulators from column
	unpckhpd %xmm1, %xmm8		# pairs to row pairs for the solve

	movapd	%xmm2, %xmm10
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm10

	movapd	%xmm4, %xmm12
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm12

	movapd	%xmm6, %xmm14
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm14

	movapd	 0 * SIZE(B), %xmm1	# rhs rows 0..3 (4 cols each, 2 regs/row)
	movapd	 2 * SIZE(B), %xmm3
	movapd	 4 * SIZE(B), %xmm5
	movapd	 6 * SIZE(B), %xmm7
	movapd	 8 * SIZE(B), %xmm9
	movapd	10 * SIZE(B), %xmm11
	movapd	12 * SIZE(B), %xmm13
	movapd	14 * SIZE(B), %xmm15

	subpd	%xmm0,  %xmm1		# rhs -= accumulated product
	subpd	%xmm2,  %xmm3
	subpd	%xmm8,  %xmm5
	subpd	%xmm10, %xmm7
	subpd	%xmm4,  %xmm9
	subpd	%xmm6,  %xmm11
	subpd	%xmm12, %xmm13
	subpd	%xmm14, %xmm15
#else
	movapd	 0 * SIZE(AO), %xmm8	# rhs kept in packed-A layout
	movapd	 2 * SIZE(AO), %xmm9
	movapd	 4 * SIZE(AO), %xmm10
	movapd	 6 * SIZE(AO), %xmm11

	movapd	 8 * SIZE(AO), %xmm12
	movapd	10 * SIZE(AO), %xmm13
	movapd	12 * SIZE(AO), %xmm14
	movapd	14 * SIZE(AO), %xmm15

	subpd	%xmm0, %xmm8
	subpd	%xmm4, %xmm9
	subpd	%xmm1, %xmm10
	subpd	%xmm5, %xmm11
	subpd	%xmm2, %xmm12
	subpd	%xmm6, %xmm13
	subpd	%xmm3, %xmm14
	subpd	%xmm7, %xmm15
#endif
2803
#ifdef LN
# Backward substitution with the 4x4 lower-triangular block of A for the
# n=4 tile: each row of the solution now spans two registers (cols 0-1
# and cols 2-3), so every update is applied to a register pair.
# Diagonal entries are multiplied, i.e. assumed pre-inverted (see .L59).
	movlpd	15 * SIZE(AO), %xmm0
	movhpd	15 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm13		# row3 *= 1/a33
	mulpd	 %xmm0, %xmm15

	movlpd	14 * SIZE(AO), %xmm2
	movhpd	14 * SIZE(AO), %xmm2
	mulpd	 %xmm13, %xmm2
	subpd	 %xmm2, %xmm9		# row2 -= a32 * row3 (cols 0-1)
	movlpd	14 * SIZE(AO), %xmm2
	movhpd	14 * SIZE(AO), %xmm2
	mulpd	 %xmm15, %xmm2
	subpd	 %xmm2, %xmm11		# row2 -= a32 * row3 (cols 2-3)

	movlpd	13 * SIZE(AO), %xmm4
	movhpd	13 * SIZE(AO), %xmm4
	mulpd	 %xmm13, %xmm4
	subpd	 %xmm4, %xmm5		# row1 -= a31 * row3
	movlpd	13 * SIZE(AO), %xmm4
	movhpd	13 * SIZE(AO), %xmm4
	mulpd	 %xmm15, %xmm4
	subpd	 %xmm4, %xmm7

	movlpd	12 * SIZE(AO), %xmm6
	movhpd	12 * SIZE(AO), %xmm6
	mulpd	 %xmm13, %xmm6
	subpd	 %xmm6, %xmm1		# row0 -= a30 * row3
	movlpd	12 * SIZE(AO), %xmm6
	movhpd	12 * SIZE(AO), %xmm6
	mulpd	 %xmm15, %xmm6
	subpd	 %xmm6, %xmm3

	movlpd	10 * SIZE(AO), %xmm0
	movhpd	10 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm9		# row2 *= 1/a22
	mulpd	 %xmm0, %xmm11

	movlpd	 9 * SIZE(AO), %xmm2
	movhpd	 9 * SIZE(AO), %xmm2
	mulpd	 %xmm9, %xmm2
	subpd	 %xmm2, %xmm5		# row1 -= a21 * row2
	movlpd	 9 * SIZE(AO), %xmm2
	movhpd	 9 * SIZE(AO), %xmm2
	mulpd	 %xmm11, %xmm2
	subpd	 %xmm2, %xmm7

	movlpd	 8 * SIZE(AO), %xmm4
	movhpd	 8 * SIZE(AO), %xmm4
	mulpd	 %xmm9, %xmm4
	subpd	 %xmm4, %xmm1		# row0 -= a20 * row2
	movlpd	 8 * SIZE(AO), %xmm4
	movhpd	 8 * SIZE(AO), %xmm4
	mulpd	 %xmm11, %xmm4
	subpd	 %xmm4, %xmm3

	movlpd	 5 * SIZE(AO), %xmm0
	movhpd	 5 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5		# row1 *= 1/a11
	mulpd	 %xmm0, %xmm7

	movlpd	 4 * SIZE(AO), %xmm2
	movhpd	 4 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm1		# row0 -= a10 * row1
	movlpd	 4 * SIZE(AO), %xmm2
	movhpd	 4 * SIZE(AO), %xmm2
	mulpd	 %xmm7, %xmm2
	subpd	 %xmm2, %xmm3

	movlpd	 0 * SIZE(AO), %xmm0
	movhpd	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1		# row0 *= 1/a00
	mulpd	 %xmm0, %xmm3
#endif
2879
2880#ifdef LT
2881	movlpd	 0 * SIZE(AO), %xmm0
2882	movhpd	 0 * SIZE(AO), %xmm0
2883	mulpd	 %xmm0, %xmm1
2884	mulpd	 %xmm0, %xmm3
2885
2886	movlpd	 1 * SIZE(AO), %xmm2
2887	movhpd	 1 * SIZE(AO), %xmm2
2888	mulpd	 %xmm1, %xmm2
2889	subpd	 %xmm2, %xmm5
2890
2891	movlpd	 1 * SIZE(AO), %xmm2
2892	movhpd	 1 * SIZE(AO), %xmm2
2893	mulpd	 %xmm3, %xmm2
2894	subpd	 %xmm2, %xmm7
2895
2896	movlpd	 2 * SIZE(AO), %xmm4
2897	movhpd	 2 * SIZE(AO), %xmm4
2898	mulpd	 %xmm1, %xmm4
2899	subpd	 %xmm4, %xmm9
2900	movlpd	 2 * SIZE(AO), %xmm4
2901	movhpd	 2 * SIZE(AO), %xmm4
2902	mulpd	 %xmm3, %xmm4
2903	subpd	 %xmm4, %xmm11
2904
2905	movlpd	 3 * SIZE(AO), %xmm6
2906	movhpd	 3 * SIZE(AO), %xmm6
2907	mulpd	 %xmm1, %xmm6
2908	subpd	 %xmm6, %xmm13
2909	movlpd	 3 * SIZE(AO), %xmm6
2910	movhpd	 3 * SIZE(AO), %xmm6
2911	mulpd	 %xmm3, %xmm6
2912	subpd	 %xmm6, %xmm15
2913
2914	movlpd	 5 * SIZE(AO), %xmm0
2915	movhpd	 5 * SIZE(AO), %xmm0
2916	mulpd	 %xmm0, %xmm5
2917	mulpd	 %xmm0, %xmm7
2918
2919	movlpd	 6 * SIZE(AO), %xmm2
2920	movhpd	 6 * SIZE(AO), %xmm2
2921	mulpd	 %xmm5, %xmm2
2922	subpd	 %xmm2, %xmm9
2923	movlpd	 6 * SIZE(AO), %xmm2
2924	movhpd	 6 * SIZE(AO), %xmm2
2925	mulpd	 %xmm7, %xmm2
2926	subpd	 %xmm2, %xmm11
2927
2928	movlpd	 7 * SIZE(AO), %xmm4
2929	movhpd	 7 * SIZE(AO), %xmm4
2930	mulpd	 %xmm5, %xmm4
2931	subpd	 %xmm4, %xmm13
2932	movlpd	 7 * SIZE(AO), %xmm4
2933	movhpd	 7 * SIZE(AO), %xmm4
2934	mulpd	 %xmm7, %xmm4
2935	subpd	 %xmm4, %xmm15
2936
2937	movlpd	10 * SIZE(AO), %xmm0
2938	movhpd	10 * SIZE(AO), %xmm0
2939	mulpd	 %xmm0, %xmm9
2940	mulpd	 %xmm0, %xmm11
2941
2942	movlpd	11 * SIZE(AO), %xmm2
2943	movhpd	11 * SIZE(AO), %xmm2
2944	mulpd	 %xmm9, %xmm2
2945	subpd	 %xmm2, %xmm13
2946	movlpd	11 * SIZE(AO), %xmm2
2947	movhpd	11 * SIZE(AO), %xmm2
2948	mulpd	 %xmm11, %xmm2
2949	subpd	 %xmm2, %xmm15
2950
2951	movlpd	15 * SIZE(AO), %xmm0
2952	movhpd	15 * SIZE(AO), %xmm0
2953	mulpd	 %xmm0, %xmm13
2954	mulpd	 %xmm0, %xmm15
2955#endif
2956
2957
2958#ifdef RN
2959	movlpd	 0 * SIZE(B), %xmm0
2960	movhpd	 0 * SIZE(B), %xmm0
2961	mulpd	 %xmm0, %xmm8
2962	mulpd	 %xmm0, %xmm9
2963
2964	movlpd	 1 * SIZE(B), %xmm1
2965	movhpd	 1 * SIZE(B), %xmm1
2966	mulpd	 %xmm8, %xmm1
2967	subpd	 %xmm1, %xmm10
2968	movlpd	 1 * SIZE(B), %xmm1
2969	movhpd	 1 * SIZE(B), %xmm1
2970	mulpd	 %xmm9, %xmm1
2971	subpd	 %xmm1, %xmm11
2972
2973	movlpd	 2 * SIZE(B), %xmm2
2974	movhpd	 2 * SIZE(B), %xmm2
2975	mulpd	 %xmm8, %xmm2
2976	subpd	 %xmm2, %xmm12
2977	movlpd	 2 * SIZE(B), %xmm2
2978	movhpd	 2 * SIZE(B), %xmm2
2979	mulpd	 %xmm9, %xmm2
2980	subpd	 %xmm2, %xmm13
2981
2982	movlpd	 3 * SIZE(B), %xmm3
2983	movhpd	 3 * SIZE(B), %xmm3
2984	mulpd	 %xmm8, %xmm3
2985	subpd	 %xmm3, %xmm14
2986	movlpd	 3 * SIZE(B), %xmm3
2987	movhpd	 3 * SIZE(B), %xmm3
2988	mulpd	 %xmm9, %xmm3
2989	subpd	 %xmm3, %xmm15
2990
2991	movlpd	 5 * SIZE(B), %xmm0
2992	movhpd	 5 * SIZE(B), %xmm0
2993	mulpd	 %xmm0, %xmm10
2994	mulpd	 %xmm0, %xmm11
2995
2996	movlpd	 6 * SIZE(B), %xmm1
2997	movhpd	 6 * SIZE(B), %xmm1
2998	mulpd	 %xmm10, %xmm1
2999	subpd	 %xmm1, %xmm12
3000	movlpd	 6 * SIZE(B), %xmm1
3001	movhpd	 6 * SIZE(B), %xmm1
3002	mulpd	 %xmm11, %xmm1
3003	subpd	 %xmm1, %xmm13
3004
3005	movlpd	 7 * SIZE(B), %xmm2
3006	movhpd	 7 * SIZE(B), %xmm2
3007	mulpd	 %xmm10, %xmm2
3008	subpd	 %xmm2, %xmm14
3009	movlpd	 7 * SIZE(B), %xmm2
3010	movhpd	 7 * SIZE(B), %xmm2
3011	mulpd	 %xmm11, %xmm2
3012	subpd	 %xmm2, %xmm15
3013
3014	movlpd	10 * SIZE(B), %xmm0
3015	movhpd	10 * SIZE(B), %xmm0
3016	mulpd	 %xmm0, %xmm12
3017	mulpd	 %xmm0, %xmm13
3018
3019	movlpd	11 * SIZE(B), %xmm1
3020	movhpd	11 * SIZE(B), %xmm1
3021	mulpd	 %xmm12, %xmm1
3022	subpd	 %xmm1, %xmm14
3023	movlpd	11 * SIZE(B), %xmm1
3024	movhpd	11 * SIZE(B), %xmm1
3025	mulpd	 %xmm13, %xmm1
3026	subpd	 %xmm1, %xmm15
3027
3028	movlpd	15 * SIZE(B), %xmm0
3029	movhpd	15 * SIZE(B), %xmm0
3030	mulpd	 %xmm0, %xmm14
3031	mulpd	 %xmm0, %xmm15
3032#endif
3033
3034#ifdef RT
3035	movlpd	15 * SIZE(B), %xmm0
3036	movhpd	15 * SIZE(B), %xmm0
3037	mulpd	 %xmm0, %xmm14
3038	mulpd	 %xmm0, %xmm15
3039
3040	movlpd	14 * SIZE(B), %xmm1
3041	movhpd	14 * SIZE(B), %xmm1
3042	mulpd	 %xmm14, %xmm1
3043	subpd	 %xmm1, %xmm12
3044	movlpd	14 * SIZE(B), %xmm1
3045	movhpd	14 * SIZE(B), %xmm1
3046	mulpd	 %xmm15, %xmm1
3047	subpd	 %xmm1, %xmm13
3048
3049	movlpd	13 * SIZE(B), %xmm2
3050	movhpd	13 * SIZE(B), %xmm2
3051	mulpd	 %xmm14, %xmm2
3052	subpd	 %xmm2, %xmm10
3053	movlpd	13 * SIZE(B), %xmm2
3054	movhpd	13 * SIZE(B), %xmm2
3055	mulpd	 %xmm15, %xmm2
3056	subpd	 %xmm2, %xmm11
3057
3058	movlpd	12 * SIZE(B), %xmm3
3059	movhpd	12 * SIZE(B), %xmm3
3060	mulpd	 %xmm14, %xmm3
3061	subpd	 %xmm3, %xmm8
3062	movlpd	12 * SIZE(B), %xmm3
3063	movhpd	12 * SIZE(B), %xmm3
3064	mulpd	 %xmm15, %xmm3
3065	subpd	 %xmm3, %xmm9
3066
3067	movlpd	10 * SIZE(B), %xmm0
3068	movhpd	10 * SIZE(B), %xmm0
3069	mulpd	 %xmm0, %xmm12
3070	mulpd	 %xmm0, %xmm13
3071
3072	movlpd	 9 * SIZE(B), %xmm1
3073	movhpd	 9 * SIZE(B), %xmm1
3074	mulpd	 %xmm12, %xmm1
3075	subpd	 %xmm1, %xmm10
3076	movlpd	 9 * SIZE(B), %xmm1
3077	movhpd	 9 * SIZE(B), %xmm1
3078	mulpd	 %xmm13, %xmm1
3079	subpd	 %xmm1, %xmm11
3080
3081	movlpd	 8 * SIZE(B), %xmm2
3082	movhpd	 8 * SIZE(B), %xmm2
3083	mulpd	 %xmm12, %xmm2
3084	subpd	 %xmm2, %xmm8
3085	movlpd	 8 * SIZE(B), %xmm2
3086	movhpd	 8 * SIZE(B), %xmm2
3087	mulpd	 %xmm13, %xmm2
3088	subpd	 %xmm2, %xmm9
3089
3090	movlpd	 5 * SIZE(B), %xmm0
3091	movhpd	 5 * SIZE(B), %xmm0
3092	mulpd	 %xmm0, %xmm10
3093	mulpd	 %xmm0, %xmm11
3094
3095	movlpd	 4 * SIZE(B), %xmm1
3096	movhpd	 4 * SIZE(B), %xmm1
3097	mulpd	 %xmm10, %xmm1
3098	subpd	 %xmm1, %xmm8
3099	movlpd	 4 * SIZE(B), %xmm1
3100	movhpd	 4 * SIZE(B), %xmm1
3101	mulpd	 %xmm11, %xmm1
3102	subpd	 %xmm1, %xmm9
3103
3104	movlpd	 0 * SIZE(B), %xmm0
3105	movhpd	 0 * SIZE(B), %xmm0
3106	mulpd	 %xmm0, %xmm8
3107	mulpd	 %xmm0, %xmm9
3108#endif
3109
3110#ifdef LN
3111	subq	$4 * SIZE, CO1
3112	subq	$4 * SIZE, CO2
3113#endif
3114
3115#if defined(LN) || defined(LT)
3116	movsd	%xmm1,  0 * SIZE(CO1)
3117	movsd	%xmm5,  1 * SIZE(CO1)
3118	movsd	%xmm9,  2 * SIZE(CO1)
3119	movsd	%xmm13, 3 * SIZE(CO1)
3120
3121	movhpd	%xmm1,  0 * SIZE(CO2)
3122	movhpd	%xmm5,  1 * SIZE(CO2)
3123	movhpd	%xmm9,  2 * SIZE(CO2)
3124	movhpd	%xmm13, 3 * SIZE(CO2)
3125
3126	movsd	%xmm3,  0 * SIZE(CO1, LDC, 2)
3127	movsd	%xmm7,  1 * SIZE(CO1, LDC, 2)
3128	movsd	%xmm11, 2 * SIZE(CO1, LDC, 2)
3129	movsd	%xmm15, 3 * SIZE(CO1, LDC, 2)
3130
3131	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
3132	movhpd	%xmm7,  1 * SIZE(CO2, LDC, 2)
3133	movhpd	%xmm11, 2 * SIZE(CO2, LDC, 2)
3134	movhpd	%xmm15, 3 * SIZE(CO2, LDC, 2)
3135#else
3136	movsd	%xmm8,  0 * SIZE(CO1)
3137	movhpd	%xmm8,  1 * SIZE(CO1)
3138	movsd	%xmm9,  2 * SIZE(CO1)
3139	movhpd	%xmm9,  3 * SIZE(CO1)
3140
3141	movsd	%xmm10,  0 * SIZE(CO2)
3142	movhpd	%xmm10,  1 * SIZE(CO2)
3143	movsd	%xmm11,  2 * SIZE(CO2)
3144	movhpd	%xmm11,  3 * SIZE(CO2)
3145
3146	movsd	%xmm12,  0 * SIZE(CO1, LDC, 2)
3147	movhpd	%xmm12,  1 * SIZE(CO1, LDC, 2)
3148	movsd	%xmm13,  2 * SIZE(CO1, LDC, 2)
3149	movhpd	%xmm13,  3 * SIZE(CO1, LDC, 2)
3150
3151	movsd	%xmm14,  0 * SIZE(CO2, LDC, 2)
3152	movhpd	%xmm14,  1 * SIZE(CO2, LDC, 2)
3153	movsd	%xmm15,  2 * SIZE(CO2, LDC, 2)
3154	movhpd	%xmm15,  3 * SIZE(CO2, LDC, 2)
3155#endif
3156
3157#if defined(LN) || defined(LT)
3158	movapd	%xmm1,   0 * SIZE(B)
3159	movapd	%xmm3,   2 * SIZE(B)
3160	movapd	%xmm5,   4 * SIZE(B)
3161	movapd	%xmm7,   6 * SIZE(B)
3162	movapd	%xmm9,   8 * SIZE(B)
3163	movapd	%xmm11, 10 * SIZE(B)
3164	movapd	%xmm13, 12 * SIZE(B)
3165	movapd	%xmm15, 14 * SIZE(B)
3166
3167	movlpd	%xmm1,   0 * SIZE(BO)
3168	movlpd	%xmm1,   1 * SIZE(BO)
3169	movhpd	%xmm1,   2 * SIZE(BO)
3170	movhpd	%xmm1,   3 * SIZE(BO)
3171	movlpd	%xmm3,   4 * SIZE(BO)
3172	movlpd	%xmm3,   5 * SIZE(BO)
3173	movhpd	%xmm3,   6 * SIZE(BO)
3174	movhpd	%xmm3,   7 * SIZE(BO)
3175	movlpd	%xmm5,   8 * SIZE(BO)
3176	movlpd	%xmm5,   9 * SIZE(BO)
3177	movhpd	%xmm5,  10 * SIZE(BO)
3178	movhpd	%xmm5,  11 * SIZE(BO)
3179	movlpd	%xmm7,  12 * SIZE(BO)
3180	movlpd	%xmm7,  13 * SIZE(BO)
3181	movhpd	%xmm7,  14 * SIZE(BO)
3182	movhpd	%xmm7,  15 * SIZE(BO)
3183	movlpd	%xmm9,  16 * SIZE(BO)
3184	movlpd	%xmm9,  17 * SIZE(BO)
3185	movhpd	%xmm9,  18 * SIZE(BO)
3186	movhpd	%xmm9,  19 * SIZE(BO)
3187	movlpd	%xmm11, 20 * SIZE(BO)
3188	movlpd	%xmm11, 21 * SIZE(BO)
3189	movhpd	%xmm11, 22 * SIZE(BO)
3190	movhpd	%xmm11, 23 * SIZE(BO)
3191	movlpd	%xmm13, 24 * SIZE(BO)
3192	movlpd	%xmm13, 25 * SIZE(BO)
3193	movhpd	%xmm13, 26 * SIZE(BO)
3194	movhpd	%xmm13, 27 * SIZE(BO)
3195	movlpd	%xmm15, 28 * SIZE(BO)
3196	movlpd	%xmm15, 29 * SIZE(BO)
3197	movhpd	%xmm15, 30 * SIZE(BO)
3198	movhpd	%xmm15, 31 * SIZE(BO)
3199#else
3200	movapd	%xmm8,   0 * SIZE(AO)
3201	movapd	%xmm9,   2 * SIZE(AO)
3202	movapd	%xmm10,  4 * SIZE(AO)
3203	movapd	%xmm11,  6 * SIZE(AO)
3204	movapd	%xmm12,  8 * SIZE(AO)
3205	movapd	%xmm13, 10 * SIZE(AO)
3206	movapd	%xmm14, 12 * SIZE(AO)
3207	movapd	%xmm15, 14 * SIZE(AO)
3208#endif
3209
3210#ifndef LN
3211	addq	$4 * SIZE, CO1
3212	addq	$4 * SIZE, CO2
3213#endif
3214
3215#if defined(LT) || defined(RN)
3216	movq	K,  %rax
3217	subq	KK, %rax
3218	leaq	(,%rax, SIZE), %rax
3219	leaq	(AO, %rax, 4), AO
3220#ifdef LT
3221	addq	$16 * SIZE, B
3222#endif
3223#endif
3224
3225#ifdef LN
3226	subq	$4, KK
3227	movq	BORIG, B
3228#endif
3229
3230#ifdef LT
3231	addq	$4, KK
3232#endif
3233
3234#ifdef RT
3235	movq	K, %rax
3236	movq	BORIG, B
3237	salq	$2 + BASE_SHIFT, %rax
3238	addq	%rax, AORIG
3239#endif
3240
3241	decq	I			# i --
3242	jg	.L11
3243	ALIGN_4
3244
/* .L20: dispatch for the leftover rows of this 4-column panel (M mod 4):
 * nothing left -> .L39 (panel done); a 2-row tile is handled at .L21 if
 * M&2, then a possible 1-row tile at .L30/.L31. */
3245.L20:
3246	testq	$3, M
3247	je	.L39
3248
3249	testq	$2, M
3250	je	.L30
3251	ALIGN_4
3252
/* .L21: M=2 tile against the current 4-column panel.  Same structure as
 * the 4x4 tile: position AO/BO for this tile, accumulate the GEMM product
 * (8 k-steps unrolled in .L22, remainder in .L26), rewind to the solve
 * position, run the case-specific 2x2 (LN/LT) or 4x4 (RN/RT) triangular
 * solve, store the 2x4 result to C and back to the packed buffers. */
3253.L21:
/* LN: step AORIG back by K rows of 2 doubles */
3254#ifdef LN
3255       movq	K, %rax
3256       salq	$1 + BASE_SHIFT, %rax
3257       subq	%rax, AORIG
3258#endif
3259
3260#if defined(LN) || defined(RT)
3261	movq	KK, %rax
3262	movq	AORIG, AO
3263	leaq	(, %rax, SIZE), %rax
3264	leaq	(AO, %rax, 2), AO
3265#endif
3266
/* BO points into the duplicated-B buffer */
3267	leaq	BUFFER, BO
3268
3269#if defined(LN) || defined(RT)
3270	movq	KK, %rax
3271	salq	$2 + BASE_SHIFT, %rax
3272	leaq	(BO, %rax, 2), BO
3273#endif
3274
/* clear the four column accumulators and preload A/B panel data */
3275	movapd	 0 * SIZE(AO), %xmm8
3276	pxor	%xmm0, %xmm0
3277	movapd	 0 * SIZE(BO), %xmm9
3278	pxor	%xmm1, %xmm1
3279	movapd	 8 * SIZE(AO), %xmm10
3280	pxor	%xmm2, %xmm2
3281	movapd	 8 * SIZE(BO), %xmm11
3282	pxor	%xmm3, %xmm3
3283
3284	movapd	16 * SIZE(BO), %xmm13
3285	movapd	24 * SIZE(BO), %xmm15
3286
/* trip count: KK for LT/RN, K-KK for LN/RT; main loop unrolled by 8 */
3287#if defined(LT) || defined(RN)
3288	movq	KK, %rax
3289#else
3290	movq	K, %rax
3291	subq	KK, %rax
3292#endif
3293	sarq	$3, %rax
3294	je	.L25
3295	ALIGN_4
3296
/* .L22: main accumulation loop — 8 k-iterations per pass; each iteration
 * multiplies one 2-wide A value against 4 duplicated B columns. */
3297.L22:
3298	mulpd	%xmm8, %xmm9
3299	addpd	%xmm9, %xmm0
3300	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
3301	movapd	 2 * SIZE(BO), %xmm9
3302	mulpd	%xmm8, %xmm9
3303	addpd	%xmm9, %xmm1
3304	movapd	 4 * SIZE(BO), %xmm9
3305	mulpd	%xmm8, %xmm9
3306	mulpd	 6 * SIZE(BO), %xmm8
3307	addpd	%xmm9, %xmm2
3308	movapd	32 * SIZE(BO), %xmm9
3309	addpd	%xmm8, %xmm3
3310	movapd	 2 * SIZE(AO), %xmm8
3311
3312	mulpd	%xmm8, %xmm11
3313	addpd	%xmm11, %xmm0
3314	movapd	10 * SIZE(BO), %xmm11
3315	mulpd	%xmm8, %xmm11
3316	addpd	%xmm11, %xmm1
3317	movapd	12 * SIZE(BO), %xmm11
3318	mulpd	%xmm8, %xmm11
3319	mulpd	14 * SIZE(BO), %xmm8
3320	addpd	%xmm11, %xmm2
3321	movapd	40 * SIZE(BO), %xmm11
3322	addpd	%xmm8, %xmm3
3323	movapd	 4 * SIZE(AO), %xmm8
3324
3325	mulpd	%xmm8, %xmm13
3326	addpd	%xmm13, %xmm0
3327	movapd	18 * SIZE(BO), %xmm13
3328	mulpd	%xmm8, %xmm13
3329	addpd	%xmm13, %xmm1
3330	movapd	20 * SIZE(BO), %xmm13
3331	mulpd	%xmm8, %xmm13
3332	mulpd	22 * SIZE(BO), %xmm8
3333	addpd	%xmm13, %xmm2
3334	movapd	48 * SIZE(BO), %xmm13
3335	addpd	%xmm8, %xmm3
3336	movapd	 6 * SIZE(AO), %xmm8
3337
3338	mulpd	%xmm8, %xmm15
3339	addpd	%xmm15, %xmm0
3340	movapd	26 * SIZE(BO), %xmm15
3341	mulpd	%xmm8, %xmm15
3342	addpd	%xmm15, %xmm1
3343	movapd	28 * SIZE(BO), %xmm15
3344	mulpd	%xmm8, %xmm15
3345	mulpd	30 * SIZE(BO), %xmm8
3346	addpd	%xmm15, %xmm2
3347	movapd	56 * SIZE(BO), %xmm15
3348	addpd	%xmm8, %xmm3
3349	movapd	16 * SIZE(AO), %xmm8
3350
3351	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
3352	mulpd	%xmm10, %xmm9
3353	addpd	%xmm9, %xmm0
3354	movapd	34 * SIZE(BO), %xmm9
3355	mulpd	%xmm10, %xmm9
3356	addpd	%xmm9, %xmm1
3357	movapd	36 * SIZE(BO), %xmm9
3358	mulpd	%xmm10, %xmm9
3359	mulpd	38 * SIZE(BO), %xmm10
3360	addpd	%xmm9, %xmm2
3361	movapd	64 * SIZE(BO), %xmm9
3362	addpd	%xmm10, %xmm3
3363	movapd	10 * SIZE(AO), %xmm10
3364
3365	mulpd	%xmm10, %xmm11
3366	addpd	%xmm11, %xmm0
3367	movapd	42 * SIZE(BO), %xmm11
3368	mulpd	%xmm10, %xmm11
3369	addpd	%xmm11, %xmm1
3370	movapd	44 * SIZE(BO), %xmm11
3371	mulpd	%xmm10, %xmm11
3372	mulpd	46 * SIZE(BO), %xmm10
3373	addpd	%xmm11, %xmm2
3374	movapd	72 * SIZE(BO), %xmm11
3375	addpd	%xmm10, %xmm3
3376	movapd	12 * SIZE(AO), %xmm10
3377
3378	mulpd	%xmm10, %xmm13
3379	addpd	%xmm13, %xmm0
3380	movapd	50 * SIZE(BO), %xmm13
3381	mulpd	%xmm10, %xmm13
3382	addpd	%xmm13, %xmm1
3383	movapd	52 * SIZE(BO), %xmm13
3384	mulpd	%xmm10, %xmm13
3385	mulpd	54 * SIZE(BO), %xmm10
3386	addpd	%xmm13, %xmm2
3387	movapd	80 * SIZE(BO), %xmm13
3388	addpd	%xmm10, %xmm3
3389	movapd	14 * SIZE(AO), %xmm10
3390
3391	mulpd	%xmm10, %xmm15
3392	addpd	%xmm15, %xmm0
3393	movapd	58 * SIZE(BO), %xmm15
3394	mulpd	%xmm10, %xmm15
3395	addpd	%xmm15, %xmm1
3396	movapd	60 * SIZE(BO), %xmm15
3397	mulpd	%xmm10, %xmm15
3398	mulpd	62 * SIZE(BO), %xmm10
3399	addpd	%xmm15, %xmm2
3400	movapd	88 * SIZE(BO), %xmm15
3401	addpd	%xmm10, %xmm3
3402	movapd	24 * SIZE(AO), %xmm10
3403
3404	addq   $16 * SIZE, AO
3405	addq   $64 * SIZE, BO
3406	decq   %rax
3407	jne    .L22
3408	ALIGN_4
3409
/* .L25/.L26: remainder loop for k mod 8, one k-step per iteration */
3410.L25:
3411#if defined(LT) || defined(RN)
3412	movq	KK, %rax
3413#else
3414	movq	K, %rax
3415	subq	KK, %rax
3416#endif
3417	andq	$7, %rax		# if (k & 1)
3418	BRANCH
3419	je .L29
3420	ALIGN_4
3421
3422.L26:
3423	mulpd	%xmm8, %xmm9
3424	addpd	%xmm9, %xmm0
3425	movapd	 2 * SIZE(BO), %xmm9
3426	mulpd	%xmm8, %xmm9
3427	addpd	%xmm9, %xmm1
3428	movapd	 4 * SIZE(BO), %xmm9
3429	mulpd	%xmm8, %xmm9
3430	mulpd	 6 * SIZE(BO), %xmm8
3431	addpd	%xmm9, %xmm2
3432	movapd	 8 * SIZE(BO), %xmm9
3433	addpd	%xmm8, %xmm3
3434	movapd	 2 * SIZE(AO), %xmm8
3435
3436	addq	$2 * SIZE, AO		# aoffset  += 4
3437	addq	$8 * SIZE, BO		# boffset1 += 8
3438	addq is not here
3438	decq	%rax
3439	jg	.L26
3440	ALIGN_4
3441
/* .L29: rewind AO / B / BO to this tile's solve position (LN subtracts 2
 * rows, RN/RT subtract the full 4-column block from KK) */
3442.L29:
3443#if defined(LN) || defined(RT)
3444	movq	KK, %rax
3445#ifdef LN
3446	subq	$2, %rax
3447#else
3448	subq	$4, %rax
3449#endif
3450
3451	movq	AORIG, AO
3452	movq	BORIG, B
3453	leaq	BUFFER, BO
3454
3455	leaq	(, %rax, SIZE), %rax
3456	leaq	(AO, %rax, 2), AO
3457	leaq	(B,  %rax, 4), B
3458	leaq	(BO, %rax, 8), BO
3459#endif
3460
/* LN/LT: transpose the accumulators into row pairs (unpck) and subtract
 * from the packed right-hand side held in B; RN/RT subtract from AO. */
3461#if defined(LN) || defined(LT)
3462	movapd	%xmm0, %xmm8
3463	unpcklpd %xmm1, %xmm0
3464	unpckhpd %xmm1, %xmm8
3465
3466	movapd	%xmm2, %xmm10
3467	unpcklpd %xmm3, %xmm2
3468	unpckhpd %xmm3, %xmm10
3469
3470	movapd	 0 * SIZE(B), %xmm1
3471	movapd	 2 * SIZE(B), %xmm3
3472	movapd	 4 * SIZE(B), %xmm5
3473	movapd	 6 * SIZE(B), %xmm7
3474
3475	subpd	%xmm0,  %xmm1
3476	subpd	%xmm2,  %xmm3
3477	subpd	%xmm8,  %xmm5
3478	subpd	%xmm10, %xmm7
3479#else
3480	movapd	 0 * SIZE(AO), %xmm8
3481	movapd	 2 * SIZE(AO), %xmm10
3482	movapd	 4 * SIZE(AO), %xmm12
3483	movapd	 6 * SIZE(AO), %xmm14
3484
3485	subpd	%xmm0, %xmm8
3486	subpd	%xmm1, %xmm10
3487	subpd	%xmm2, %xmm12
3488	subpd	%xmm3, %xmm14
3489#endif
3490
/* LN: backward substitution over A's 2x2 diagonal block (entries 3,2,0);
 * diagonal entries presumably pre-inverted by the packing code — confirm. */
3491#ifdef LN
3492	movlpd	 3 * SIZE(AO), %xmm0
3493	movhpd	 3 * SIZE(AO), %xmm0
3494	mulpd	 %xmm0, %xmm5
3495	mulpd	 %xmm0, %xmm7
3496
3497	movlpd	 2 * SIZE(AO), %xmm2
3498	movhpd	 2 * SIZE(AO), %xmm2
3499	mulpd	 %xmm5, %xmm2
3500	subpd	 %xmm2, %xmm1
3501	movlpd	 2 * SIZE(AO), %xmm2
3502	movhpd	 2 * SIZE(AO), %xmm2
3503	mulpd	 %xmm7, %xmm2
3504	subpd	 %xmm2, %xmm3
3505
3506	movlpd	 0 * SIZE(AO), %xmm0
3507	movhpd	 0 * SIZE(AO), %xmm0
3508	mulpd	 %xmm0, %xmm1
3509	mulpd	 %xmm0, %xmm3
3510#endif
3511
/* LT: forward substitution over the 2x2 block (entries 0,1,3) */
3512#ifdef LT
3513	movlpd	 0 * SIZE(AO), %xmm0
3514	movhpd	 0 * SIZE(AO), %xmm0
3515	mulpd	 %xmm0, %xmm1
3516	mulpd	 %xmm0, %xmm3
3517
3518	movlpd	 1 * SIZE(AO), %xmm2
3519	movhpd	 1 * SIZE(AO), %xmm2
3520	mulpd	 %xmm1, %xmm2
3521	subpd	 %xmm2, %xmm5
3522	movlpd	 1 * SIZE(AO), %xmm2
3523	movhpd	 1 * SIZE(AO), %xmm2
3524	mulpd	 %xmm3, %xmm2
3525	subpd	 %xmm2, %xmm7
3526
3527	movlpd	 3 * SIZE(AO), %xmm0
3528	movhpd	 3 * SIZE(AO), %xmm0
3529	mulpd	 %xmm0, %xmm5
3530	mulpd	 %xmm0, %xmm7
3531#endif
3532
/* RN: forward substitution with B's 4x4 diagonal block (entries 0..15) */
3533#ifdef RN
3534	movlpd	 0 * SIZE(B), %xmm0
3535	movhpd	 0 * SIZE(B), %xmm0
3536	mulpd	 %xmm0, %xmm8
3537
3538	movlpd	 1 * SIZE(B), %xmm1
3539	movhpd	 1 * SIZE(B), %xmm1
3540	mulpd	 %xmm8, %xmm1
3541	subpd	 %xmm1, %xmm10
3542	movlpd	 2 * SIZE(B), %xmm2
3543	movhpd	 2 * SIZE(B), %xmm2
3544	mulpd	 %xmm8, %xmm2
3545	subpd	 %xmm2, %xmm12
3546	movlpd	 3 * SIZE(B), %xmm3
3547	movhpd	 3 * SIZE(B), %xmm3
3548	mulpd	 %xmm8, %xmm3
3549	subpd	 %xmm3, %xmm14
3550
3551	movlpd	 5 * SIZE(B), %xmm0
3552	movhpd	 5 * SIZE(B), %xmm0
3553	mulpd	 %xmm0, %xmm10
3554	movlpd	 6 * SIZE(B), %xmm1
3555	movhpd	 6 * SIZE(B), %xmm1
3556	mulpd	 %xmm10, %xmm1
3557	subpd	 %xmm1, %xmm12
3558	movlpd	 7 * SIZE(B), %xmm2
3559	movhpd	 7 * SIZE(B), %xmm2
3560	mulpd	 %xmm10, %xmm2
3561	subpd	 %xmm2, %xmm14
3562
3563	movlpd	10 * SIZE(B), %xmm0
3564	movhpd	10 * SIZE(B), %xmm0
3565	mulpd	 %xmm0, %xmm12
3566
3567	movlpd	11 * SIZE(B), %xmm1
3568	movhpd	11 * SIZE(B), %xmm1
3569	mulpd	 %xmm12, %xmm1
3570	subpd	 %xmm1, %xmm14
3571
3572	movlpd	15 * SIZE(B), %xmm0
3573	movhpd	15 * SIZE(B), %xmm0
3574	mulpd	 %xmm0, %xmm14
3575#endif
3576
/* RT: backward substitution with B's 4x4 diagonal block (entries 15..0) */
3577#ifdef RT
3578	movlpd	15 * SIZE(B), %xmm0
3579	movhpd	15 * SIZE(B), %xmm0
3580	mulpd	 %xmm0, %xmm14
3581
3582	movlpd	14 * SIZE(B), %xmm1
3583	movhpd	14 * SIZE(B), %xmm1
3584	mulpd	 %xmm14, %xmm1
3585	subpd	 %xmm1, %xmm12
3586	movlpd	13 * SIZE(B), %xmm2
3587	movhpd	13 * SIZE(B), %xmm2
3588	mulpd	 %xmm14, %xmm2
3589	subpd	 %xmm2, %xmm10
3590	movlpd	12 * SIZE(B), %xmm3
3591	movhpd	12 * SIZE(B), %xmm3
3592	mulpd	 %xmm14, %xmm3
3593	subpd	 %xmm3, %xmm8
3594
3595	movlpd	10 * SIZE(B), %xmm0
3596	movhpd	10 * SIZE(B), %xmm0
3597	mulpd	 %xmm0, %xmm12
3598	movlpd	 9 * SIZE(B), %xmm1
3599	movhpd	 9 * SIZE(B), %xmm1
3600	mulpd	 %xmm12, %xmm1
3601	subpd	 %xmm1, %xmm10
3602	movlpd	 8 * SIZE(B), %xmm2
3603	movhpd	 8 * SIZE(B), %xmm2
3604	mulpd	 %xmm12, %xmm2
3605	subpd	 %xmm2, %xmm8
3606
3607	movlpd	 5 * SIZE(B), %xmm0
3608	movhpd	 5 * SIZE(B), %xmm0
3609	mulpd	 %xmm0, %xmm10
3610	movlpd	 4 * SIZE(B), %xmm1
3611	movhpd	 4 * SIZE(B), %xmm1
3612	mulpd	 %xmm10, %xmm1
3613	subpd	 %xmm1, %xmm8
3614
3615	movlpd	 0 * SIZE(B), %xmm0
3616	movhpd	 0 * SIZE(B), %xmm0
3617	mulpd	 %xmm0, %xmm8
3618#endif
3619
/* store the solved 2x4 tile to C (LN steps the pointers back first) */
3620#ifdef LN
3621	subq	$2 * SIZE, CO1
3622	subq	$2 * SIZE, CO2
3623#endif
3624
3625#if defined(LN) || defined(LT)
3626	movsd	%xmm1,  0 * SIZE(CO1)
3627	movsd	%xmm5,  1 * SIZE(CO1)
3628
3629	movhpd	%xmm1,  0 * SIZE(CO2)
3630	movhpd	%xmm5,  1 * SIZE(CO2)
3631
3632	movsd	%xmm3,  0 * SIZE(CO1, LDC, 2)
3633	movsd	%xmm7,  1 * SIZE(CO1, LDC, 2)
3634
3635	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
3636	movhpd	%xmm7,  1 * SIZE(CO2, LDC, 2)
3637#else
3638	movsd	%xmm8,  0 * SIZE(CO1)
3639	movhpd	%xmm8,  1 * SIZE(CO1)
3640
3641	movsd	%xmm10,  0 * SIZE(CO2)
3642	movhpd	%xmm10,  1 * SIZE(CO2)
3643
3644	movsd	%xmm12,  0 * SIZE(CO1, LDC, 2)
3645	movhpd	%xmm12,  1 * SIZE(CO1, LDC, 2)
3646
3647	movsd	%xmm14,  0 * SIZE(CO2, LDC, 2)
3648	movhpd	%xmm14,  1 * SIZE(CO2, LDC, 2)
3649#endif
3650
/* write the solved tile back: packed B plus the duplicated-scalar BO copy
 * (LN/LT), or packed A (RN/RT) */
3651#if defined(LN) || defined(LT)
3652	movapd	%xmm1,   0 * SIZE(B)
3653	movapd	%xmm3,   2 * SIZE(B)
3654	movapd	%xmm5,   4 * SIZE(B)
3655	movapd	%xmm7,   6 * SIZE(B)
3656
3657	movlpd	%xmm1,   0 * SIZE(BO)
3658	movlpd	%xmm1,   1 * SIZE(BO)
3659	movhpd	%xmm1,   2 * SIZE(BO)
3660	movhpd	%xmm1,   3 * SIZE(BO)
3661	movlpd	%xmm3,   4 * SIZE(BO)
3662	movlpd	%xmm3,   5 * SIZE(BO)
3663	movhpd	%xmm3,   6 * SIZE(BO)
3664	movhpd	%xmm3,   7 * SIZE(BO)
3665	movlpd	%xmm5,   8 * SIZE(BO)
3666	movlpd	%xmm5,   9 * SIZE(BO)
3667	movhpd	%xmm5,  10 * SIZE(BO)
3668	movhpd	%xmm5,  11 * SIZE(BO)
3669	movlpd	%xmm7,  12 * SIZE(BO)
3670	movlpd	%xmm7,  13 * SIZE(BO)
3671	movhpd	%xmm7,  14 * SIZE(BO)
3672	movhpd	%xmm7,  15 * SIZE(BO)
3673#else
3674	movapd	%xmm8,   0 * SIZE(AO)
3675	movapd	%xmm10,  2 * SIZE(AO)
3676	movapd	%xmm12,  4 * SIZE(AO)
3677	movapd	%xmm14,  6 * SIZE(AO)
3678#endif
3679
/* pointer / KK bookkeeping, mirroring the 4x4 tile's epilogue */
3680#ifndef LN
3681	addq	$2 * SIZE, CO1
3682	addq	$2 * SIZE, CO2
3683#endif
3684
3685#if defined(LT) || defined(RN)
3686	movq	K,  %rax
3687	subq	KK, %rax
3688	leaq	(,%rax, SIZE), %rax
3689	leaq	(AO, %rax, 2), AO
3690#ifdef LT
3691	addq	$8 * SIZE, B
3692#endif
3693#endif
3694
3695#ifdef LN
3696	subq	$2, KK
3697	movq	BORIG, B
3698#endif
3699
3700#ifdef LT
3701	addq	$2, KK
3702#endif
3703
3704#ifdef RT
3705       movq	K, %rax
3706       movq	BORIG, B
3707       salq	$1 + BASE_SHIFT, %rax
3708       addq	%rax, AORIG
3709#endif
3710	ALIGN_4
3711
/* .L30/.L31: final M=1 row of this 4-column panel, computed with scalar
 * SSE (movsd/mulsd/addsd).  Same phases as the wider tiles: accumulate
 * (.L32 unrolled by 8, .L36 remainder), subtract from the packed rhs,
 * scalar triangular solve, store 4 values to C and the packed buffers. */
3712.L30:
3713	testq	$1, M
3714	je	.L39
3715	ALIGN_4
3716
3717.L31:
/* LN: step AORIG back by K scalars */
3718#ifdef LN
3719       movq	K, %rax
3720       salq	$0 + BASE_SHIFT, %rax
3721       subq	%rax, AORIG
3722#endif
3723
3724#if defined(LN) || defined(RT)
3725	movq	KK, %rax
3726	movq	AORIG, AO
3727	leaq	(, %rax, SIZE), %rax
3728	leaq	(AO, %rax, 1), AO
3729#endif
3730
3731	leaq	BUFFER, BO
3732
3733#if defined(LN) || defined(RT)
3734	movq	KK, %rax
3735	salq	$2 + BASE_SHIFT, %rax
3736	leaq	(BO, %rax, 2), BO
3737#endif
3738
/* clear the four scalar accumulators and preload A/B values */
3739	movsd	 0 * SIZE(AO), %xmm8
3740	pxor	%xmm0, %xmm0
3741	movsd	 0 * SIZE(BO), %xmm9
3742	pxor	%xmm1, %xmm1
3743	movsd	 8 * SIZE(AO), %xmm10
3744	pxor	%xmm2, %xmm2
3745	movsd	 8 * SIZE(BO), %xmm11
3746	pxor	%xmm3, %xmm3
3747
3748	movsd	16 * SIZE(BO), %xmm13
3749	movsd	24 * SIZE(BO), %xmm15
3750
3751#if defined(LT) || defined(RN)
3752	movq	KK, %rax
3753#else
3754	movq	K, %rax
3755	subq	KK, %rax
3756#endif
3757	sarq	$3, %rax
3758	je	.L35
3759	ALIGN_4
3760
/* .L32: main scalar accumulation loop, 8 k-iterations per pass */
3761.L32:
3762	mulsd	%xmm8, %xmm9
3763	addsd	%xmm9, %xmm0
3764	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
3765	movsd	 2 * SIZE(BO), %xmm9
3766	mulsd	%xmm8, %xmm9
3767	addsd	%xmm9, %xmm1
3768	movsd	 4 * SIZE(BO), %xmm9
3769	mulsd	%xmm8, %xmm9
3770	mulsd	 6 * SIZE(BO), %xmm8
3771	addsd	%xmm9, %xmm2
3772	movsd	32 * SIZE(BO), %xmm9
3773	addsd	%xmm8, %xmm3
3774	movsd	 1 * SIZE(AO), %xmm8
3775
3776	mulsd	%xmm8, %xmm11
3777	addsd	%xmm11, %xmm0
3778	movsd	10 * SIZE(BO), %xmm11
3779	mulsd	%xmm8, %xmm11
3780	addsd	%xmm11, %xmm1
3781	movsd	12 * SIZE(BO), %xmm11
3782	mulsd	%xmm8, %xmm11
3783	mulsd	14 * SIZE(BO), %xmm8
3784	addsd	%xmm11, %xmm2
3785	movsd	40 * SIZE(BO), %xmm11
3786	addsd	%xmm8, %xmm3
3787	movsd	 2 * SIZE(AO), %xmm8
3788
3789	mulsd	%xmm8, %xmm13
3790	addsd	%xmm13, %xmm0
3791	movsd	18 * SIZE(BO), %xmm13
3792	mulsd	%xmm8, %xmm13
3793	addsd	%xmm13, %xmm1
3794	movsd	20 * SIZE(BO), %xmm13
3795	mulsd	%xmm8, %xmm13
3796	mulsd	22 * SIZE(BO), %xmm8
3797	addsd	%xmm13, %xmm2
3798	movsd	48 * SIZE(BO), %xmm13
3799	addsd	%xmm8, %xmm3
3800	movsd	 3 * SIZE(AO), %xmm8
3801
3802	mulsd	%xmm8, %xmm15
3803	addsd	%xmm15, %xmm0
3804	movsd	26 * SIZE(BO), %xmm15
3805	mulsd	%xmm8, %xmm15
3806	addsd	%xmm15, %xmm1
3807	movsd	28 * SIZE(BO), %xmm15
3808	mulsd	%xmm8, %xmm15
3809	mulsd	30 * SIZE(BO), %xmm8
3810	addsd	%xmm15, %xmm2
3811	movsd	56 * SIZE(BO), %xmm15
3812	addsd	%xmm8, %xmm3
3813	movsd	 4 * SIZE(AO), %xmm8
3814
3815	mulsd	%xmm8, %xmm9
3816	addsd	%xmm9, %xmm0
3817	movsd	34 * SIZE(BO), %xmm9
3818	mulsd	%xmm8, %xmm9
3819	addsd	%xmm9, %xmm1
3820	movsd	36 * SIZE(BO), %xmm9
3821	mulsd	%xmm8, %xmm9
3822	mulsd	38 * SIZE(BO), %xmm8
3823	addsd	%xmm9, %xmm2
3824	movsd	64 * SIZE(BO), %xmm9
3825	addsd	%xmm8, %xmm3
3826	movsd	 5 * SIZE(AO), %xmm8
3827
3828	mulsd	%xmm8, %xmm11
3829	addsd	%xmm11, %xmm0
3830	movsd	42 * SIZE(BO), %xmm11
3831	mulsd	%xmm8, %xmm11
3832	addsd	%xmm11, %xmm1
3833	movsd	44 * SIZE(BO), %xmm11
3834	mulsd	%xmm8, %xmm11
3835	mulsd	46 * SIZE(BO), %xmm8
3836	addsd	%xmm11, %xmm2
3837	movsd	72 * SIZE(BO), %xmm11
3838	addsd	%xmm8, %xmm3
3839	movsd	 6 * SIZE(AO), %xmm8
3840
3841	mulsd	%xmm8, %xmm13
3842	addsd	%xmm13, %xmm0
3843	movsd	50 * SIZE(BO), %xmm13
3844	mulsd	%xmm8, %xmm13
3845	addsd	%xmm13, %xmm1
3846	movsd	52 * SIZE(BO), %xmm13
3847	mulsd	%xmm8, %xmm13
3848	mulsd	54 * SIZE(BO), %xmm8
3849	addsd	%xmm13, %xmm2
3850	movsd	80 * SIZE(BO), %xmm13
3851	addsd	%xmm8, %xmm3
3852	movsd	 7 * SIZE(AO), %xmm8
3853
3854	mulsd	%xmm8, %xmm15
3855	addsd	%xmm15, %xmm0
3856	movsd	58 * SIZE(BO), %xmm15
3857	mulsd	%xmm8, %xmm15
3858	addsd	%xmm15, %xmm1
3859	movsd	60 * SIZE(BO), %xmm15
3860	mulsd	%xmm8, %xmm15
3861	mulsd	62 * SIZE(BO), %xmm8
3862	addsd	%xmm15, %xmm2
3863	movsd	88 * SIZE(BO), %xmm15
3864	addsd	%xmm8, %xmm3
3865	movsd	 8 * SIZE(AO), %xmm8
3866
3867	addq   $ 8 * SIZE, AO
3868	addq   $64 * SIZE, BO
3869	decq   %rax
3870	jne    .L32
3871	ALIGN_4
3872
/* .L35/.L36: remainder loop for k mod 8 */
3873.L35:
3874#if defined(LT) || defined(RN)
3875	movq	KK, %rax
3876#else
3877	movq	K, %rax
3878	subq	KK, %rax
3879#endif
3880	andq	$7, %rax		# if (k & 1)
3881	BRANCH
3882	je .L38
3883	ALIGN_4
3884
3885.L36:
3886	mulsd	%xmm8, %xmm9
3887	addsd	%xmm9, %xmm0
3888	movsd	 2 * SIZE(BO), %xmm9
3889	mulsd	%xmm8, %xmm9
3890	addsd	%xmm9, %xmm1
3891	movsd	 4 * SIZE(BO), %xmm9
3892	mulsd	%xmm8, %xmm9
3893	mulsd	 6 * SIZE(BO), %xmm8
3894	addsd	%xmm9, %xmm2
3895	movsd	 8 * SIZE(BO), %xmm9
3896	addsd	%xmm8, %xmm3
3897	movsd	 1 * SIZE(AO), %xmm8
3898
3899	addq	$1 * SIZE, AO		# aoffset  += 4
3900	addq	$8 * SIZE, BO		# boffset1 += 8
3901	decq	%rax
3902	jg	.L36
3903	ALIGN_4
3904
/* .L38: rewind to the solve position, load the packed rhs scalars, and
 * subtract the accumulated products */
3905.L38:
3906#if defined(LN) || defined(RT)
3907	movq	KK, %rax
3908#ifdef LN
3909	subq	$1, %rax
3910#else
3911	subq	$4, %rax
3912#endif
3913
3914	movq	AORIG, AO
3915	movq	BORIG, B
3916	leaq	BUFFER, BO
3917
3918	leaq	(, %rax, SIZE), %rax
3919	leaq	(AO, %rax, 1), AO
3920	leaq	(B,  %rax, 4), B
3921	leaq	(BO, %rax, 8), BO
3922#endif
3923
3924#if defined(LN) || defined(LT)
3925	movsd	 0 * SIZE(B), %xmm4
3926	movsd	 1 * SIZE(B), %xmm5
3927	movsd	 2 * SIZE(B), %xmm6
3928	movsd	 3 * SIZE(B), %xmm7
3929#else
3930	movsd	 0 * SIZE(AO), %xmm4
3931	movsd	 1 * SIZE(AO), %xmm5
3932	movsd	 2 * SIZE(AO), %xmm6
3933	movsd	 3 * SIZE(AO), %xmm7
3934#endif
3935
3936	subsd	%xmm0,  %xmm4
3937	subsd	%xmm1,  %xmm5
3938	subsd	%xmm2,  %xmm6
3939	subsd	%xmm3,  %xmm7
3940
/* LN/LT: a 1x1 diagonal block — just scale all four values by A[0]
 * (presumably the pre-inverted diagonal entry — confirm in packing code) */
3941#ifdef LN
3942	movsd	 0 * SIZE(AO), %xmm0
3943
3944	mulsd	 %xmm0, %xmm4
3945	mulsd	 %xmm0, %xmm5
3946	mulsd	 %xmm0, %xmm6
3947	mulsd	 %xmm0, %xmm7
3948#endif
3949
3950#ifdef LT
3951	movsd	 0 * SIZE(AO), %xmm0
3952
3953	mulsd	 %xmm0, %xmm4
3954	mulsd	 %xmm0, %xmm5
3955	mulsd	 %xmm0, %xmm6
3956	mulsd	 %xmm0, %xmm7
3957#endif
3958
/* RN: scalar forward substitution with B's 4x4 diagonal block */
3959#ifdef RN
3960	mulsd	 0 * SIZE(B), %xmm4
3961	movlpd	 1 * SIZE(B), %xmm1
3962	mulsd	 %xmm4, %xmm1
3963	subsd	 %xmm1, %xmm5
3964	movlpd	 2 * SIZE(B), %xmm2
3965	mulsd	 %xmm4, %xmm2
3966	subsd	 %xmm2, %xmm6
3967	movlpd	 3 * SIZE(B), %xmm3
3968	mulsd	 %xmm4, %xmm3
3969	subsd	 %xmm3, %xmm7
3970
3971	mulsd	 5 * SIZE(B), %xmm5
3972	movlpd	 6 * SIZE(B), %xmm1
3973	mulsd	 %xmm5, %xmm1
3974	subsd	 %xmm1, %xmm6
3975	movlpd	 7 * SIZE(B), %xmm2
3976	mulsd	 %xmm5, %xmm2
3977	subsd	 %xmm2, %xmm7
3978
3979	mulsd	10 * SIZE(B), %xmm6
3980	movlpd	11 * SIZE(B), %xmm1
3981	mulsd	 %xmm6, %xmm1
3982	subsd	 %xmm1, %xmm7
3983
3984	mulsd	15 * SIZE(B), %xmm7
3985#endif
3986
/* RT: scalar backward substitution with B's 4x4 diagonal block */
3987#ifdef RT
3988	mulsd	 15 * SIZE(B), %xmm7
3989
3990	movlpd	14 * SIZE(B), %xmm1
3991	mulsd	 %xmm7, %xmm1
3992	subsd	 %xmm1, %xmm6
3993	movlpd	13 * SIZE(B), %xmm2
3994	mulsd	 %xmm7, %xmm2
3995	subsd	 %xmm2, %xmm5
3996	movlpd	12 * SIZE(B), %xmm3
3997	mulsd	 %xmm7, %xmm3
3998	subsd	 %xmm3, %xmm4
3999
4000	mulsd	 10 * SIZE(B), %xmm6
4001
4002	movlpd	 9 * SIZE(B), %xmm1
4003	mulsd	 %xmm6, %xmm1
4004	subsd	 %xmm1, %xmm5
4005	movlpd	 8 * SIZE(B), %xmm2
4006	mulsd	 %xmm6, %xmm2
4007	subsd	 %xmm2, %xmm4
4008
4009	mulsd	 5 * SIZE(B), %xmm5
4010
4011	movlpd	 4 * SIZE(B), %xmm1
4012	mulsd	 %xmm5, %xmm1
4013	subsd	 %xmm1, %xmm4
4014
4015	mulsd	 0 * SIZE(B), %xmm4
4016#endif
4017
/* store one solved value per C column, then write back to the packed
 * buffers (B plus duplicated BO for LN/LT, packed A for RN/RT) */
4018#ifdef LN
4019	subq	$1 * SIZE, CO1
4020	subq	$1 * SIZE, CO2
4021#endif
4022
4023	movsd	%xmm4,  0 * SIZE(CO1)
4024	movsd	%xmm5,  0 * SIZE(CO2)
4025	movsd	%xmm6,  0 * SIZE(CO1, LDC, 2)
4026	movsd	%xmm7,  0 * SIZE(CO2, LDC, 2)
4027
4028#if defined(LN) || defined(LT)
4029	movsd	%xmm4,   0 * SIZE(B)
4030	movsd	%xmm5,   1 * SIZE(B)
4031	movsd	%xmm6,   2 * SIZE(B)
4032	movsd	%xmm7,   3 * SIZE(B)
4033
4034	movsd	%xmm4,   0 * SIZE(BO)
4035	movsd	%xmm4,   1 * SIZE(BO)
4036	movsd	%xmm5,   2 * SIZE(BO)
4037	movsd	%xmm5,   3 * SIZE(BO)
4038	movsd	%xmm6,   4 * SIZE(BO)
4039	movsd	%xmm6,   5 * SIZE(BO)
4040	movsd	%xmm7,   6 * SIZE(BO)
4041	movsd	%xmm7,   7 * SIZE(BO)
4042#else
4043	movsd	%xmm4,   0 * SIZE(AO)
4044	movsd	%xmm5,   1 * SIZE(AO)
4045	movsd	%xmm6,   2 * SIZE(AO)
4046	movsd	%xmm7,   3 * SIZE(AO)
4047#endif
4048
/* pointer / KK bookkeeping, mirroring the wider tiles */
4049#ifndef LN
4050	addq	$1 * SIZE, CO1
4051	addq	$1 * SIZE, CO2
4052#endif
4053
4054#if defined(LT) || defined(RN)
4055	movq	K,  %rax
4056	subq	KK, %rax
4057	leaq	(,%rax, SIZE), %rax
4058	leaq	(AO, %rax, 1), AO
4059#ifdef LT
4060	addq	$4 * SIZE, B
4061#endif
4062#endif
4063
4064#ifdef LN
4065	subq	$1, KK
4066	movq	BORIG, B
4067#endif
4068
4069#ifdef LT
4070	addq	$1, KK
4071#endif
4072
4073#ifdef RT
4074       movq	K, %rax
4075       movq	BORIG, B
4076       salq	$0 + BASE_SHIFT, %rax
4077       addq	%rax, AORIG
4078#endif
4079	ALIGN_4
4080
/* .L39: end of this 4-column panel.  Advance B past the panel's K*4
 * doubles (case-dependent), update the partition counter KK, and loop
 * back to .L01 for the next panel (j--). */
4081.L39:
4082#ifdef LN
4083       leaq	(, K, SIZE), %rax
4084       leaq	(B, %rax, 4), B
4085#endif
4086
4087#if defined(LT) || defined(RN)
4088	movq	K,  %rax
4089	subq	KK, %rax
4090	leaq	(,%rax, SIZE), %rax
4091	leaq	(B,  %rax, 4), B
4092#endif
4093
/* RN sweeps columns forward, RT backward */
4094#ifdef RN
4095	addq	$4, KK
4096#endif
4097
4098#ifdef RT
4099	subq	$4, KK
4100#endif
4101
4102	decq	J			# j --
4103	jg	.L01
4104	ALIGN_4
4105
/* .L999: function epilogue.  %rbx was set up (above this excerpt,
 * presumably in the prologue — confirm) to point at the register save
 * area; restore the callee-saved GPRs, plus rdi/rsi and xmm6-xmm15 under
 * the Windows x64 ABI (which treats them as callee-saved), then release
 * the stack frame and return. */
4106.L999:
4107	movq	%rbx, %rsp
4108
4109	movq	  0(%rsp), %rbx
4110	movq	  8(%rsp), %rbp
4111	movq	 16(%rsp), %r12
4112	movq	 24(%rsp), %r13
4113	movq	 32(%rsp), %r14
4114	movq	 40(%rsp), %r15
4115
4116#ifdef WINDOWS_ABI
4117	movq	 48(%rsp), %rdi
4118	movq	 56(%rsp), %rsi
4119	movups	 64(%rsp), %xmm6
4120	movups	 80(%rsp), %xmm7
4121	movups	 96(%rsp), %xmm8
4122	movups	112(%rsp), %xmm9
4123	movups	128(%rsp), %xmm10
4124	movups	144(%rsp), %xmm11
4125	movups	160(%rsp), %xmm12
4126	movups	176(%rsp), %xmm13
4127	movups	192(%rsp), %xmm14
4128	movups	208(%rsp), %xmm15
4129#endif
4130
4131	addq	$STACKSIZE, %rsp
4132	ret
4133
4134	EPILOGUE
4135