1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Symbolic register names.  On System V AMD64 the six integer arguments
   arrive in rdi, rsi, rdx, rcx, r8, r9, which is why M..C map directly to
   those registers (Windows remaps them in the prologue via OLD_A etc.). */

#define M	%rdi		/* arg1: number of rows of A / C                  */
#define N	%rsi		/* arg2: number of columns of B / C               */
#define K	%rdx		/* arg3: shared inner dimension                   */

#define A	%rcx		/* arg4: base pointer of packed matrix A          */
#define B	%r8		/* arg5: base pointer of packed matrix B          */
#define C	%r9		/* arg6: base pointer of output matrix C          */
#define LDC	%r10		/* leading dimension of C (scaled to bytes after
				   the `leaq (, LDC, SIZE), LDC` in the prologue) */

#define I	%r11		/* loop counter over M (i = m >> 2 blocks)        */
#define AO	%r13		/* current position inside the A panel            */
#define BO	%r14		/* current position inside the B panel            */
#define	CO1	%r15		/* pointer to first C column of the current tile  */
#define CO2	%rbx		/* pointer to second C column (CO1 + LDC)         */
#define KK	%rbp		/* running triangular offset (LN/LT/RN/RT logic)  */
#define BB	%r12		/* read-ahead pointer used to prefetch B          */
58
#ifndef WINDOWS_ABI

/* System V AMD64: six args in registers; the 7th/8th (ldc, offset) live on
   the caller's stack.  All %rsp-relative offsets below assume the prologue
   has already done `subq $STACKSIZE, %rsp`. */

#define STACKSIZE 128

/* Incoming stack arguments (just above the return address). */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

/* Local spill slots; 0..40(%rsp) hold the six saved callee-saved GPRs. */
#define OFFSET	 48(%rsp)
#define J	 56(%rsp)
#define KKK	 64(%rsp)
#define AORIG	 72(%rsp)

#else

/* Microsoft x64: only four register args (rcx, rdx, r8, r9); args 5+ are on
   the stack above the 32-byte shadow space (8 ret + 32 shadow = 40). */

#define STACKSIZE 256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

/* Locals sit above the GPR saves (0..40) plus the xmm6-xmm15 save area
   (64..208) required because xmm6-15 are callee-saved on Windows. */
#define OFFSET	 224(%rsp)
#define J	 232(%rsp)
#define KKK	 240(%rsp)
#define AORIG	 248(%rsp)

#endif

/* Software-prefetch tuning: instruction and look-ahead distances (in
   elements) used by the KERNELn macros below. */
#define PREFETCH     prefetcht1
#define PREFETCHSIZE (16 * 12 + 3)
#define PREFETCH_R    (4 *  4 + 0)
91
/*
 * KERNEL1..KERNEL16 form one fully unrolled chunk (8 k-steps) of the inner
 * product loop for a 4x4 double-precision tile.  Each macro does four
 * multiply-accumulate steps: a packed pair of A values (movapd) times four
 * broadcast B values (movddup), folded into accumulators xmm0-xmm3 or
 * xmm4-xmm7.  A data is staged through xmm8/xmm10/xmm12/xmm14 and B
 * broadcasts through xmm9/xmm11/xmm13/xmm15 so loads overlap the FMA-less
 * mul/add chains.  `address` is passed as 16 * chunk-index and scaled by
 * 2 * SIZE, matching the 32-element advance per chunk in the .L1X loop.
 */

/* KERNEL1: xmm8 * b[0..3] -> xmm0..xmm3; prefetches ahead in A, loads the
   next A pair, and re-broadcasts b[0] for KERNEL2. */
#define KERNEL1(address) \
	mulpd	%xmm8, %xmm9 ;\
	PREFETCH  (PREFETCHSIZE +  0) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm9, %xmm0;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* KERNEL2: next A pair * the same b[0..3] -> xmm4..xmm7; ends by loading
   b[4] for KERNEL3. */
#define KERNEL2(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* KERNEL3: A pair * b[4..7] -> xmm0..xmm3; re-broadcasts b[4] for KERNEL4. */
#define KERNEL3(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm0;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* KERNEL4: A pair * b[4..7] -> xmm4..xmm7; the trailing loads at 32*SIZE
   pre-stage xmm8/xmm9 for the NEXT unrolled chunk. */
#define KERNEL4(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	32 * SIZE + (address) * 2 * SIZE(BO), %xmm9
152
/* KERNEL5-KERNEL8: same pattern as KERNEL1-4, but on the second register
   pair (A in xmm10, B broadcasts in xmm11) covering B offsets 8..15, so the
   two chains can be interleaved without false dependencies. */

/* KERNEL5: xmm10 * b[8..11] -> xmm0..xmm3; re-broadcasts b[8] for KERNEL6. */
#define KERNEL5(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* KERNEL6: next A pair * b[8..11] -> xmm4..xmm7; ends loading b[12]. */
#define KERNEL6(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* KERNEL7: A pair * b[12..15] -> xmm0..xmm3; re-broadcasts b[12]. */
#define KERNEL7(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* KERNEL8: A pair * b[12..15] -> xmm4..xmm7; trailing 40*SIZE loads
   pre-stage xmm10/xmm11 for the next chunk (32 ahead + base 8). */
#define KERNEL8(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	40 * SIZE + (address) * 2 * SIZE(BO), %xmm11
212
/* KERNEL9-KERNEL12: third register pair (A in xmm12, broadcasts in xmm13)
   covering B offsets 16..23.  KERNEL9 also issues the second A prefetch of
   the chunk, 16 elements past KERNEL1's. */

/* KERNEL9: xmm12 * b[16..19] -> xmm0..xmm3; prefetches A and re-broadcasts
   b[16] for KERNEL10. */
#define KERNEL9(address) \
	mulpd	%xmm12, %xmm13;\
	PREFETCH  (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm13, %xmm0;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	16 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* KERNEL10: next A pair * b[16..19] -> xmm4..xmm7; ends loading b[20]. */
#define KERNEL10(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* KERNEL11: A pair * b[20..23] -> xmm0..xmm3; re-broadcasts b[20]. */
#define KERNEL11(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm0;\
	movddup	21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* KERNEL12: A pair * b[20..23] -> xmm4..xmm7; trailing 48*SIZE loads
   pre-stage xmm12/xmm13 for the next chunk (32 ahead + base 16). */
#define KERNEL12(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	48 * SIZE + (address) * 2 * SIZE(BO), %xmm13
273
/* KERNEL13-KERNEL16: fourth register pair (A in xmm14, broadcasts in xmm15)
   covering B offsets 24..31, completing the 8 k-steps of one chunk. */

/* KERNEL13: xmm14 * b[24..27] -> xmm0..xmm3; re-broadcasts b[24]. */
#define KERNEL13(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	24 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* KERNEL14: next A pair * b[24..27] -> xmm4..xmm7; ends loading b[28]. */
#define KERNEL14(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* KERNEL15: A pair * b[28..31] -> xmm0..xmm3; re-broadcasts b[28]. */
#define KERNEL15(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* KERNEL16: A pair * b[28..31] -> xmm4..xmm7; trailing 56*SIZE loads
   pre-stage xmm14/xmm15 for the next chunk (32 ahead + base 24). */
#define KERNEL16(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
333
334	PROLOGUE
335	PROFCODE
336
337	subq	$STACKSIZE, %rsp
338	movq	%rbx,  0(%rsp)
339	movq	%rbp,  8(%rsp)
340	movq	%r12, 16(%rsp)
341	movq	%r13, 24(%rsp)
342	movq	%r14, 32(%rsp)
343	movq	%r15, 40(%rsp)
344
345#ifdef WINDOWS_ABI
346	movq	%rdi,    48(%rsp)
347	movq	%rsi,    56(%rsp)
348	movups	%xmm6,   64(%rsp)
349	movups	%xmm7,   80(%rsp)
350	movups	%xmm8,   96(%rsp)
351	movups	%xmm9,  112(%rsp)
352	movups	%xmm10, 128(%rsp)
353	movups	%xmm11, 144(%rsp)
354	movups	%xmm12, 160(%rsp)
355	movups	%xmm13, 176(%rsp)
356	movups	%xmm14, 192(%rsp)
357	movups	%xmm15, 208(%rsp)
358
359	movq	ARG1,      M
360	movq	ARG2,      N
361	movq	ARG3,      K
362	movq	OLD_A,     A
363	movq	OLD_B,     B
364	movq	OLD_C,     C
365#endif
366
367	movq	OLD_LDC,   LDC
368	movq	OLD_OFFSET, KK
369
370	movq	KK, OFFSET
371
372	leaq	(, LDC, SIZE), LDC
373
374#ifdef LN
375       leaq	(, M, SIZE), %rax
376       addq	%rax, C
377       imulq	K, %rax
378       addq	%rax, A
379#endif
380
381#ifdef RT
382       leaq	(, N, SIZE), %rax
383       imulq	K, %rax
384       addq	%rax, B
385       movq	N, %rax
386       imulq	LDC, %rax
387       addq	%rax, C
388#endif
389
390#ifdef RN
391	negq	KK
392#endif
393
394#ifdef RT
395       movq	N, %rax
396       subq	OFFSET, %rax
397       movq	%rax, KK
398#endif
399
400	movq	N,  J
401	sarq	$2, J		# j = (n >> 2)
402	jle	.L40
403	ALIGN_4
404
405.L10:
406#if defined(LT) || defined(RN)
407	movq	A, AO
408#else
409	movq	A, AORIG
410#endif
411
412#ifdef RT
413       movq	K, %rax
414       salq	$2 + BASE_SHIFT, %rax
415       subq	%rax, B
416
417       leaq	(, LDC, 4), %rax
418       subq	%rax, C
419#endif
420
421	movq	C, CO1
422	leaq	(C, LDC, 1), CO2
423#ifndef RT
424	leaq	(C, LDC, 4), C
425#endif
426
427#ifdef LN
428	movq	OFFSET, %rax
429	addq	M, %rax
430	movq	%rax, KK
431#endif
432
433	movq	K, %rax
434	salq	$BASE_SHIFT + 2, %rax
435	leaq	(B, %rax), BB
436
437#ifdef LT
438	movq	OFFSET, %rax
439	movq	%rax, KK
440#endif
441
442	movq	M,  I
443	sarq	$2, I	# i = (m >> 2)
444	jle	.L20
445	ALIGN_4
446
447.L11:
448#ifdef LN
449       movq	K, %rax
450       salq	$2 + BASE_SHIFT, %rax
451       subq	%rax, AORIG
452#endif
453
454#if defined(LN) || defined(RT)
455	movq	KK, %rax
456	leaq	(, %rax, SIZE), %rax
457	movq	AORIG, AO
458	leaq	(AO, %rax, 4), AO
459	leaq	(B,  %rax, 4), BO
460#else
461	movq	B, BO
462#endif
463
464	prefetcht0	  0 * SIZE(BB)
465	subq	   $-8 * SIZE, BB
466
467	movapd	 0 * SIZE(AO), %xmm8
468	pxor	%xmm0, %xmm0
469	movddup	 0 * SIZE(BO), %xmm9
470	pxor	%xmm1, %xmm1
471	movapd	 8 * SIZE(AO), %xmm10
472	pxor	%xmm2, %xmm2
473	movddup	 8 * SIZE(BO), %xmm11
474	pxor	%xmm3, %xmm3
475
476	movapd	16 * SIZE(AO), %xmm12
477	movddup 16 * SIZE(BO), %xmm13
478	movapd	24 * SIZE(AO), %xmm14
479	movddup	24 * SIZE(BO), %xmm15
480
481	prefetchnta     4 * SIZE(CO1)
482	pxor	%xmm4, %xmm4
483	prefetchnta     4 * SIZE(CO2)
484	pxor	%xmm5, %xmm5
485	prefetchnta     4 * SIZE(CO1, LDC, 2)
486	pxor	%xmm6, %xmm6
487	prefetchnta     4 * SIZE(CO2, LDC, 2)
488	pxor	%xmm7, %xmm7
489
490#if defined(LT) || defined(RN)
491	movq	KK, %rax
492#else
493	movq	K, %rax
494	subq	KK, %rax
495#endif
496
497#if 1
498	andq	$-8, %rax
499	salq	$4, %rax
500	je	.L15
501.L1X:
502	KERNEL1 (16  *  0)
503	KERNEL2 (16  *  0)
504	KERNEL3 (16  *  0)
505	KERNEL4 (16  *  0)
506	KERNEL5 (16  *  0)
507	KERNEL6 (16  *  0)
508	KERNEL7 (16  *  0)
509	KERNEL8 (16  *  0)
510	KERNEL9 (16  *  0)
511	KERNEL10(16  *  0)
512	KERNEL11(16  *  0)
513	KERNEL12(16  *  0)
514	KERNEL13(16  *  0)
515	KERNEL14(16  *  0)
516	KERNEL15(16  *  0)
517	KERNEL16(16  *  0)
518	cmpq	$128 *  1, %rax
519	NOBRANCH
520	jle	.L12
521	KERNEL1 (16  *  1)
522	KERNEL2 (16  *  1)
523	KERNEL3 (16  *  1)
524	KERNEL4 (16  *  1)
525	KERNEL5 (16  *  1)
526	KERNEL6 (16  *  1)
527	KERNEL7 (16  *  1)
528	KERNEL8 (16  *  1)
529	KERNEL9 (16  *  1)
530	KERNEL10(16  *  1)
531	KERNEL11(16  *  1)
532	KERNEL12(16  *  1)
533	KERNEL13(16  *  1)
534	KERNEL14(16  *  1)
535	KERNEL15(16  *  1)
536	KERNEL16(16  *  1)
537	cmpq	$128 *  2, %rax
538	NOBRANCH
539	jle	.L12
540	KERNEL1 (16  *  2)
541	KERNEL2 (16  *  2)
542	KERNEL3 (16  *  2)
543	KERNEL4 (16  *  2)
544	KERNEL5 (16  *  2)
545	KERNEL6 (16  *  2)
546	KERNEL7 (16  *  2)
547	KERNEL8 (16  *  2)
548	KERNEL9 (16  *  2)
549	KERNEL10(16  *  2)
550	KERNEL11(16  *  2)
551	KERNEL12(16  *  2)
552	KERNEL13(16  *  2)
553	KERNEL14(16  *  2)
554	KERNEL15(16  *  2)
555	KERNEL16(16  *  2)
556	cmpq	$128 *  3, %rax
557	NOBRANCH
558	jle	.L12
559	KERNEL1 (16  *  3)
560	KERNEL2 (16  *  3)
561	KERNEL3 (16  *  3)
562	KERNEL4 (16  *  3)
563	KERNEL5 (16  *  3)
564	KERNEL6 (16  *  3)
565	KERNEL7 (16  *  3)
566	KERNEL8 (16  *  3)
567	KERNEL9 (16  *  3)
568	KERNEL10(16  *  3)
569	KERNEL11(16  *  3)
570	KERNEL12(16  *  3)
571	KERNEL13(16  *  3)
572	KERNEL14(16  *  3)
573	KERNEL15(16  *  3)
574	KERNEL16(16  *  3)
575	cmpq	$128 *  4, %rax
576	NOBRANCH
577	jle	.L12
578	KERNEL1 (16  *  4)
579	KERNEL2 (16  *  4)
580	KERNEL3 (16  *  4)
581	KERNEL4 (16  *  4)
582	KERNEL5 (16  *  4)
583	KERNEL6 (16  *  4)
584	KERNEL7 (16  *  4)
585	KERNEL8 (16  *  4)
586	KERNEL9 (16  *  4)
587	KERNEL10(16  *  4)
588	KERNEL11(16  *  4)
589	KERNEL12(16  *  4)
590	KERNEL13(16  *  4)
591	KERNEL14(16  *  4)
592	KERNEL15(16  *  4)
593	KERNEL16(16  *  4)
594	cmpq	$128 *  5, %rax
595	NOBRANCH
596	jle	.L12
597	KERNEL1 (16  *  5)
598	KERNEL2 (16  *  5)
599	KERNEL3 (16  *  5)
600	KERNEL4 (16  *  5)
601	KERNEL5 (16  *  5)
602	KERNEL6 (16  *  5)
603	KERNEL7 (16  *  5)
604	KERNEL8 (16  *  5)
605	KERNEL9 (16  *  5)
606	KERNEL10(16  *  5)
607	KERNEL11(16  *  5)
608	KERNEL12(16  *  5)
609	KERNEL13(16  *  5)
610	KERNEL14(16  *  5)
611	KERNEL15(16  *  5)
612	KERNEL16(16  *  5)
613	cmpq	$128 *  6, %rax
614	NOBRANCH
615	jle	.L12
616	KERNEL1 (16  *  6)
617	KERNEL2 (16  *  6)
618	KERNEL3 (16  *  6)
619	KERNEL4 (16  *  6)
620	KERNEL5 (16  *  6)
621	KERNEL6 (16  *  6)
622	KERNEL7 (16  *  6)
623	KERNEL8 (16  *  6)
624	KERNEL9 (16  *  6)
625	KERNEL10(16  *  6)
626	KERNEL11(16  *  6)
627	KERNEL12(16  *  6)
628	KERNEL13(16  *  6)
629	KERNEL14(16  *  6)
630	KERNEL15(16  *  6)
631	KERNEL16(16  *  6)
632	cmpq	$128 *  7, %rax
633	NOBRANCH
634	jle	.L12
635	KERNEL1 (16  *  7)
636	KERNEL2 (16  *  7)
637	KERNEL3 (16  *  7)
638	KERNEL4 (16  *  7)
639	KERNEL5 (16  *  7)
640	KERNEL6 (16  *  7)
641	KERNEL7 (16  *  7)
642	KERNEL8 (16  *  7)
643	KERNEL9 (16  *  7)
644	KERNEL10(16  *  7)
645	KERNEL11(16  *  7)
646	KERNEL12(16  *  7)
647	KERNEL13(16  *  7)
648	KERNEL14(16  *  7)
649	KERNEL15(16  *  7)
650	KERNEL16(16  *  7)
651
652	addq	$32 * 8  * SIZE, AO
653	addq	$32 * 8  * SIZE, BO
654	subq	$128 * 8, %rax
655	jg	.L1X
656
657.L12:
658	leaq	(AO, %rax, 2), AO	# * 16
659	leaq	(BO, %rax, 2), BO	# * 64
660#else
661	sarq	$3, %rax
662	je	.L15
663	ALIGN_4
664
665.L12:
666	mulpd	%xmm8, %xmm9
667	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
668	addpd	%xmm9, %xmm0
669	movddup	 1 * SIZE(BO), %xmm9
670	mulpd	%xmm8, %xmm9
671	addpd	%xmm9, %xmm1
672	movddup	 2 * SIZE(BO), %xmm9
673	mulpd	%xmm8, %xmm9
674	addpd	%xmm9, %xmm2
675	movddup	 3 * SIZE(BO), %xmm9
676	mulpd	%xmm8, %xmm9
677	movapd	 2 * SIZE(AO), %xmm8
678	addpd	%xmm9, %xmm3
679	movddup	 0 * SIZE(BO), %xmm9
680	mulpd	%xmm8, %xmm9
681	addpd	%xmm9, %xmm4
682	movddup	 1 * SIZE(BO), %xmm9
683	mulpd	%xmm8, %xmm9
684	addpd	%xmm9, %xmm5
685	movddup	 2 * SIZE(BO), %xmm9
686	mulpd	%xmm8, %xmm9
687	addpd	%xmm9, %xmm6
688	movddup	 3 * SIZE(BO), %xmm9
689	mulpd	%xmm8, %xmm9
690	movapd	 4 * SIZE(AO), %xmm8
691	addpd	%xmm9, %xmm7
692	movddup	 4 * SIZE(BO), %xmm9
693	mulpd	%xmm8, %xmm9
694	addpd	%xmm9, %xmm0
695	movddup	 5 * SIZE(BO), %xmm9
696	mulpd	%xmm8, %xmm9
697	addpd	%xmm9, %xmm1
698	movddup	 6 * SIZE(BO), %xmm9
699	mulpd	%xmm8, %xmm9
700	addpd	%xmm9, %xmm2
701	movddup	 7 * SIZE(BO), %xmm9
702	mulpd	%xmm8, %xmm9
703	movapd	 6 * SIZE(AO), %xmm8
704	addpd	%xmm9, %xmm3
705	movddup	 4 * SIZE(BO), %xmm9
706	mulpd	%xmm8, %xmm9
707	addpd	%xmm9, %xmm4
708	movddup	 5 * SIZE(BO), %xmm9
709	mulpd	%xmm8, %xmm9
710	addpd	%xmm9, %xmm5
711	movddup	 6 * SIZE(BO), %xmm9
712	mulpd	%xmm8, %xmm9
713	addpd	%xmm9, %xmm6
714	movddup	 7 * SIZE(BO), %xmm9
715	mulpd	%xmm8, %xmm9
716	movapd	32 * SIZE(AO), %xmm8
717	addpd	%xmm9, %xmm7
718
719	movddup	32 * SIZE(BO), %xmm9
720	mulpd	%xmm10, %xmm11
721	addpd	%xmm11, %xmm0
722	movddup	 9 * SIZE(BO), %xmm11
723	mulpd	%xmm10, %xmm11
724	addpd	%xmm11, %xmm1
725	movddup	10 * SIZE(BO), %xmm11
726	mulpd	%xmm10, %xmm11
727	addpd	%xmm11, %xmm2
728	movddup	11 * SIZE(BO), %xmm11
729	mulpd	%xmm10, %xmm11
730	movapd	10 * SIZE(AO), %xmm10
731	addpd	%xmm11, %xmm3
732
733	movddup	 8 * SIZE(BO), %xmm11
734	mulpd	%xmm10, %xmm11
735	addpd	%xmm11, %xmm4
736	movddup	 9 * SIZE(BO), %xmm11
737	mulpd	%xmm10, %xmm11
738	addpd	%xmm11, %xmm5
739	movddup	10 * SIZE(BO), %xmm11
740	mulpd	%xmm10, %xmm11
741	addpd	%xmm11, %xmm6
742	movddup	11 * SIZE(BO), %xmm11
743	mulpd	%xmm10, %xmm11
744	movapd	12 * SIZE(AO), %xmm10
745	addpd	%xmm11, %xmm7
746	movddup	12 * SIZE(BO), %xmm11
747	mulpd	%xmm10, %xmm11
748	addpd	%xmm11, %xmm0
749	movddup	13 * SIZE(BO), %xmm11
750	mulpd	%xmm10, %xmm11
751	addpd	%xmm11, %xmm1
752	movddup	14 * SIZE(BO), %xmm11
753	mulpd	%xmm10, %xmm11
754	addpd	%xmm11, %xmm2
755	movddup	15 * SIZE(BO), %xmm11
756	mulpd	%xmm10, %xmm11
757	movapd	14 * SIZE(AO), %xmm10
758	addpd	%xmm11, %xmm3
759
760	movddup	12 * SIZE(BO), %xmm11
761	mulpd	%xmm10, %xmm11
762	addpd	%xmm11, %xmm4
763	movddup	13 * SIZE(BO), %xmm11
764	mulpd	%xmm10, %xmm11
765	addpd	%xmm11, %xmm5
766	movddup	14 * SIZE(BO), %xmm11
767	mulpd	%xmm10, %xmm11
768	addpd	%xmm11, %xmm6
769	movddup	15 * SIZE(BO), %xmm11
770	mulpd	%xmm10, %xmm11
771	movapd	40 * SIZE(AO), %xmm10
772	addpd	%xmm11, %xmm7
773	movddup	40 * SIZE(BO), %xmm11
774
775	mulpd	%xmm12, %xmm13
776	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
777	addpd	%xmm13, %xmm0
778	movddup	17 * SIZE(BO), %xmm13
779	mulpd	%xmm12, %xmm13
780	addpd	%xmm13, %xmm1
781	movddup	18 * SIZE(BO), %xmm13
782	mulpd	%xmm12, %xmm13
783	addpd	%xmm13, %xmm2
784	movddup	19 * SIZE(BO), %xmm13
785	mulpd	%xmm12, %xmm13
786	movapd	18 * SIZE(AO), %xmm12
787	addpd	%xmm13, %xmm3
788
789	movddup	16 * SIZE(BO), %xmm13
790	mulpd	%xmm12, %xmm13
791	addpd	%xmm13, %xmm4
792	movddup	17 * SIZE(BO), %xmm13
793	mulpd	%xmm12, %xmm13
794	addpd	%xmm13, %xmm5
795	movddup	18 * SIZE(BO), %xmm13
796	mulpd	%xmm12, %xmm13
797	addpd	%xmm13, %xmm6
798	movddup	19 * SIZE(BO), %xmm13
799	mulpd	%xmm12, %xmm13
800	movapd	20 * SIZE(AO), %xmm12
801	addpd	%xmm13, %xmm7
802
803	movddup	20 * SIZE(BO), %xmm13
804	mulpd	%xmm12, %xmm13
805	addpd	%xmm13, %xmm0
806	movddup	21 * SIZE(BO), %xmm13
807	mulpd	%xmm12, %xmm13
808	addpd	%xmm13, %xmm1
809	movddup	22 * SIZE(BO), %xmm13
810	mulpd	%xmm12, %xmm13
811	addpd	%xmm13, %xmm2
812	movddup	23 * SIZE(BO), %xmm13
813	mulpd	%xmm12, %xmm13
814	movapd	22 * SIZE(AO), %xmm12
815	addpd	%xmm13, %xmm3
816
817	movddup	20 * SIZE(BO), %xmm13
818	mulpd	%xmm12, %xmm13
819	addpd	%xmm13, %xmm4
820	movddup 21 * SIZE(BO), %xmm13
821	mulpd	%xmm12, %xmm13
822	addpd	%xmm13, %xmm5
823	movddup	22 * SIZE(BO), %xmm13
824	mulpd	%xmm12, %xmm13
825	addpd	%xmm13, %xmm6
826	movddup	23 * SIZE(BO), %xmm13
827	mulpd	%xmm12, %xmm13
828	movapd	48 * SIZE(AO), %xmm12
829	addpd	%xmm13, %xmm7
830	movddup	48 * SIZE(BO), %xmm13
831
832	mulpd	%xmm14, %xmm15
833	addpd	%xmm15, %xmm0
834	movddup	25 * SIZE(BO), %xmm15
835	mulpd	%xmm14, %xmm15
836	addpd	%xmm15, %xmm1
837	movddup	26 * SIZE(BO), %xmm15
838	mulpd	%xmm14, %xmm15
839	addpd	%xmm15, %xmm2
840	movddup	27 * SIZE(BO), %xmm15
841	mulpd	%xmm14, %xmm15
842	movapd	26 * SIZE(AO), %xmm14
843	addpd	%xmm15, %xmm3
844
845	movddup	24 * SIZE(BO), %xmm15
846	mulpd	%xmm14, %xmm15
847	addpd	%xmm15, %xmm4
848	movddup	25 * SIZE(BO), %xmm15
849	mulpd	%xmm14, %xmm15
850	addpd	%xmm15, %xmm5
851	movddup	26 * SIZE(BO), %xmm15
852	mulpd	%xmm14, %xmm15
853	addpd	%xmm15, %xmm6
854	movddup	27 * SIZE(BO), %xmm15
855	mulpd	%xmm14, %xmm15
856	movapd	28 * SIZE(AO), %xmm14
857	addpd	%xmm15, %xmm7
858
859	movddup	28 * SIZE(BO), %xmm15
860	mulpd	%xmm14, %xmm15
861	addpd	%xmm15, %xmm0
862	movddup	29 * SIZE(BO), %xmm15
863	mulpd	%xmm14, %xmm15
864	addpd	%xmm15, %xmm1
865	movddup	30 * SIZE(BO), %xmm15
866	mulpd	%xmm14, %xmm15
867	addpd	%xmm15, %xmm2
868	movddup	31 * SIZE(BO), %xmm15
869	mulpd	%xmm14, %xmm15
870	movapd	30 * SIZE(AO), %xmm14
871	addpd	%xmm15, %xmm3
872
873	movddup	28 * SIZE(BO), %xmm15
874	mulpd	%xmm14, %xmm15
875	addpd	%xmm15, %xmm4
876	movddup	29 * SIZE(BO), %xmm15
877	mulpd	%xmm14, %xmm15
878	addpd	%xmm15, %xmm5
879	movddup	30 * SIZE(BO), %xmm15
880	mulpd	%xmm14, %xmm15
881	addpd	%xmm15, %xmm6
882	movddup	31 * SIZE(BO), %xmm15
883	mulpd	%xmm14, %xmm15
884	movapd	56 * SIZE(AO), %xmm14
885	addpd	%xmm15, %xmm7
886	movddup	56 * SIZE(BO), %xmm15
887
888	addq   $32 * SIZE, BO
889	addq   $32 * SIZE, AO
890	decq   %rax
891	BRANCH
892	jne    .L12
893#endif
894	ALIGN_4
895
896.L15:
897#if defined(LT) || defined(RN)
898	movq	KK, %rax
899#else
900	movq	K, %rax
901	subq	KK, %rax
902#endif
903	andq	$7, %rax		# if (k & 1)
904	BRANCH
905	je .L19
906	ALIGN_4
907
908.L16:
909	mulpd	%xmm8, %xmm9
910	movapd	 2 * SIZE(AO), %xmm10
911	addpd	%xmm9, %xmm0
912	movddup	 1 * SIZE(BO), %xmm9
913	mulpd	%xmm8, %xmm9
914	movddup	 0 * SIZE(BO), %xmm11
915	addpd	%xmm9, %xmm1
916	movddup	 2 * SIZE(BO), %xmm9
917	mulpd	%xmm8, %xmm9
918	addpd	%xmm9, %xmm2
919	movddup	 3 * SIZE(BO), %xmm9
920	mulpd	%xmm8, %xmm9
921	movapd	 4 * SIZE(AO), %xmm8
922	addpd	%xmm9, %xmm3
923	movddup	 4 * SIZE(BO), %xmm9
924	mulpd	%xmm10, %xmm11
925	addpd	%xmm11, %xmm4
926	movddup	 1 * SIZE(BO), %xmm11
927	mulpd	%xmm10, %xmm11
928	addpd	%xmm11, %xmm5
929	movddup	 2 * SIZE(BO), %xmm11
930	mulpd	%xmm10, %xmm11
931	addpd	%xmm11, %xmm6
932	movddup	 3 * SIZE(BO), %xmm11
933	mulpd	%xmm10, %xmm11
934	addpd	%xmm11, %xmm7
935
936	addq	$4 * SIZE, AO		# aoffset  += 4
937	addq	$4 * SIZE, BO		# boffset1 += 8
938	decq	%rax
939	jg	.L16
940	ALIGN_4
941
942.L19:
943#if defined(LN) || defined(RT)
944	movq	KK, %rax
945	subq	$4, %rax
946
947	leaq	(, %rax, SIZE), %rax
948
949	movq	AORIG, AO
950	leaq	(AO, %rax, 4), AO
951	leaq	(B,  %rax, 4), BO
952#endif
953
954#if defined(LN) || defined(LT)
955	movapd	%xmm0, %xmm8
956	unpcklpd %xmm1, %xmm0
957	unpckhpd %xmm1, %xmm8
958
959	movapd	%xmm2, %xmm10
960	unpcklpd %xmm3, %xmm2
961	unpckhpd %xmm3, %xmm10
962
963	movapd	%xmm4, %xmm12
964	unpcklpd %xmm5, %xmm4
965	unpckhpd %xmm5, %xmm12
966
967	movapd	%xmm6, %xmm14
968	unpcklpd %xmm7, %xmm6
969	unpckhpd %xmm7, %xmm14
970
971	movapd	 0 * SIZE(BO), %xmm1
972	movapd	 2 * SIZE(BO), %xmm3
973	movapd	 4 * SIZE(BO), %xmm5
974	movapd	 6 * SIZE(BO), %xmm7
975	movapd	 8 * SIZE(BO), %xmm9
976	movapd	10 * SIZE(BO), %xmm11
977	movapd	12 * SIZE(BO), %xmm13
978	movapd	14 * SIZE(BO), %xmm15
979
980	subpd	%xmm0,  %xmm1
981	subpd	%xmm2,  %xmm3
982	subpd	%xmm8,  %xmm5
983	subpd	%xmm10, %xmm7
984	subpd	%xmm4,  %xmm9
985	subpd	%xmm6,  %xmm11
986	subpd	%xmm12, %xmm13
987	subpd	%xmm14, %xmm15
988#else
989
990	movapd	 0 * SIZE(AO), %xmm8
991	movapd	 2 * SIZE(AO), %xmm9
992	movapd	 4 * SIZE(AO), %xmm10
993	movapd	 6 * SIZE(AO), %xmm11
994
995	movapd	 8 * SIZE(AO), %xmm12
996	movapd	10 * SIZE(AO), %xmm13
997	movapd	12 * SIZE(AO), %xmm14
998	movapd	14 * SIZE(AO), %xmm15
999
1000	subpd	%xmm0, %xmm8
1001	subpd	%xmm4, %xmm9
1002	subpd	%xmm1, %xmm10
1003	subpd	%xmm5, %xmm11
1004	subpd	%xmm2, %xmm12
1005	subpd	%xmm6, %xmm13
1006	subpd	%xmm3, %xmm14
1007	subpd	%xmm7, %xmm15
1008#endif
1009
1010
1011#ifdef LN
1012	movddup	15 * SIZE(AO), %xmm0
1013	mulpd	 %xmm0, %xmm13
1014	mulpd	 %xmm0, %xmm15
1015
1016	movddup	14 * SIZE(AO), %xmm2
1017	mulpd	 %xmm13, %xmm2
1018	subpd	 %xmm2, %xmm9
1019	movddup	14 * SIZE(AO), %xmm2
1020	mulpd	 %xmm15, %xmm2
1021	subpd	 %xmm2, %xmm11
1022
1023	movddup	13 * SIZE(AO), %xmm4
1024	mulpd	 %xmm13, %xmm4
1025	subpd	 %xmm4, %xmm5
1026	movddup	13 * SIZE(AO), %xmm4
1027	mulpd	 %xmm15, %xmm4
1028	subpd	 %xmm4, %xmm7
1029
1030	movddup	12 * SIZE(AO), %xmm6
1031	mulpd	 %xmm13, %xmm6
1032	subpd	 %xmm6, %xmm1
1033	movddup	12 * SIZE(AO), %xmm6
1034	mulpd	 %xmm15, %xmm6
1035	subpd	 %xmm6, %xmm3
1036
1037	movddup	10 * SIZE(AO), %xmm0
1038	mulpd	 %xmm0, %xmm9
1039	mulpd	 %xmm0, %xmm11
1040
1041	movddup	 9 * SIZE(AO), %xmm2
1042	mulpd	 %xmm9, %xmm2
1043	subpd	 %xmm2, %xmm5
1044	movddup	 9 * SIZE(AO), %xmm2
1045	mulpd	 %xmm11, %xmm2
1046	subpd	 %xmm2, %xmm7
1047
1048	movddup	 8 * SIZE(AO), %xmm4
1049	mulpd	 %xmm9, %xmm4
1050	subpd	 %xmm4, %xmm1
1051	movddup	 8 * SIZE(AO), %xmm4
1052	mulpd	 %xmm11, %xmm4
1053	subpd	 %xmm4, %xmm3
1054
1055	movddup	 5 * SIZE(AO), %xmm0
1056	mulpd	 %xmm0, %xmm5
1057	mulpd	 %xmm0, %xmm7
1058
1059	movddup	 4 * SIZE(AO), %xmm2
1060	mulpd	 %xmm5, %xmm2
1061	subpd	 %xmm2, %xmm1
1062	movddup	 4 * SIZE(AO), %xmm2
1063	mulpd	 %xmm7, %xmm2
1064	subpd	 %xmm2, %xmm3
1065
1066	movddup	 0 * SIZE(AO), %xmm0
1067	mulpd	 %xmm0, %xmm1
1068	mulpd	 %xmm0, %xmm3
1069#endif
1070
1071#ifdef LT
1072	movddup	 0 * SIZE(AO), %xmm0
1073	mulpd	 %xmm0, %xmm1
1074	mulpd	 %xmm0, %xmm3
1075
1076	movddup	 1 * SIZE(AO), %xmm2
1077	mulpd	 %xmm1, %xmm2
1078	subpd	 %xmm2, %xmm5
1079	movddup	 1 * SIZE(AO), %xmm2
1080	mulpd	 %xmm3, %xmm2
1081	subpd	 %xmm2, %xmm7
1082
1083	movddup	 2 * SIZE(AO), %xmm4
1084	mulpd	 %xmm1, %xmm4
1085	subpd	 %xmm4, %xmm9
1086	movddup	 2 * SIZE(AO), %xmm4
1087	mulpd	 %xmm3, %xmm4
1088	subpd	 %xmm4, %xmm11
1089
1090	movddup	 3 * SIZE(AO), %xmm6
1091	mulpd	 %xmm1, %xmm6
1092	subpd	 %xmm6, %xmm13
1093	movddup	 3 * SIZE(AO), %xmm6
1094	mulpd	 %xmm3, %xmm6
1095	subpd	 %xmm6, %xmm15
1096
1097	movddup	 5 * SIZE(AO), %xmm0
1098	mulpd	 %xmm0, %xmm5
1099	mulpd	 %xmm0, %xmm7
1100
1101	movddup	 6 * SIZE(AO), %xmm2
1102	mulpd	 %xmm5, %xmm2
1103	subpd	 %xmm2, %xmm9
1104	movddup	 6 * SIZE(AO), %xmm2
1105	mulpd	 %xmm7, %xmm2
1106	subpd	 %xmm2, %xmm11
1107
1108	movddup	 7 * SIZE(AO), %xmm4
1109	mulpd	 %xmm5, %xmm4
1110	subpd	 %xmm4, %xmm13
1111	movddup	 7 * SIZE(AO), %xmm4
1112	mulpd	 %xmm7, %xmm4
1113	subpd	 %xmm4, %xmm15
1114
1115	movddup	10 * SIZE(AO), %xmm0
1116	mulpd	 %xmm0, %xmm9
1117	mulpd	 %xmm0, %xmm11
1118
1119	movddup	11 * SIZE(AO), %xmm2
1120	mulpd	 %xmm9, %xmm2
1121	subpd	 %xmm2, %xmm13
1122	movddup	11 * SIZE(AO), %xmm2
1123	mulpd	 %xmm11, %xmm2
1124	subpd	 %xmm2, %xmm15
1125
1126	movddup	15 * SIZE(AO), %xmm0
1127	mulpd	 %xmm0, %xmm13
1128	mulpd	 %xmm0, %xmm15
1129#endif
1130
1131
#ifdef RN
/* RN: forward substitution with the 4x4 triangular factor packed at BO.
   Diagonal entries sit at offsets 0, 5, 10, 15 and are applied with mulpd
   only (no divisions anywhere), so they are presumably stored pre-inverted
   by the packing routine -- TODO confirm.
   Register roles: column j of the 4x4 result tile is the register pair
   xmm(8+2j)/xmm(9+2j); each xmm register holds two rows of C. */
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8		# col0 *= d00
	mulpd	 %xmm0, %xmm9

	movddup	 1 * SIZE(BO), %xmm1
	mulpd	 %xmm8, %xmm1		# col1 -= b01 * col0
	subpd	 %xmm1, %xmm10
	movddup	 1 * SIZE(BO), %xmm1
	mulpd	 %xmm9, %xmm1
	subpd	 %xmm1, %xmm11

	movddup	 2 * SIZE(BO), %xmm2
	mulpd	 %xmm8, %xmm2		# col2 -= b02 * col0
	subpd	 %xmm2, %xmm12
	movddup	 2 * SIZE(BO), %xmm2
	mulpd	 %xmm9, %xmm2
	subpd	 %xmm2, %xmm13

	movddup	 3 * SIZE(BO), %xmm3
	mulpd	 %xmm8, %xmm3		# col3 -= b03 * col0
	subpd	 %xmm3, %xmm14
	movddup	 3 * SIZE(BO), %xmm3
	mulpd	 %xmm9, %xmm3
	subpd	 %xmm3, %xmm15

	movddup	 5 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10		# col1 *= d11
	mulpd	 %xmm0, %xmm11

	movddup	 6 * SIZE(BO), %xmm1
	mulpd	 %xmm10, %xmm1		# col2 -= b12 * col1
	subpd	 %xmm1, %xmm12
	movddup	 6 * SIZE(BO), %xmm1
	mulpd	 %xmm11, %xmm1
	subpd	 %xmm1, %xmm13

	movddup	 7 * SIZE(BO), %xmm2
	mulpd	 %xmm10, %xmm2		# col3 -= b13 * col1
	subpd	 %xmm2, %xmm14
	movddup	 7 * SIZE(BO), %xmm2
	mulpd	 %xmm11, %xmm2
	subpd	 %xmm2, %xmm15

	movddup	10 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm12		# col2 *= d22
	mulpd	 %xmm0, %xmm13

	movddup	11 * SIZE(BO), %xmm1
	mulpd	 %xmm12, %xmm1		# col3 -= b23 * col2
	subpd	 %xmm1, %xmm14
	movddup	11 * SIZE(BO), %xmm1
	mulpd	 %xmm13, %xmm1
	subpd	 %xmm1, %xmm15

	movddup	15 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm14		# col3 *= d33
	mulpd	 %xmm0, %xmm15
#endif
1191
#ifdef RT
/* RT: backward substitution -- mirror image of the RN path above, walking
   the packed 4x4 factor from the last diagonal (offset 15) back to the
   first (offset 0).  Diagonals again applied with mulpd only, so they are
   presumably pre-inverted -- TODO confirm.  Same register layout:
   col j of the tile = xmm(8+2j)/xmm(9+2j). */
	movddup	15 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm14		# col3 *= d33
	mulpd	 %xmm0, %xmm15

	movddup	14 * SIZE(BO), %xmm1
	mulpd	 %xmm14, %xmm1		# col2 -= b32 * col3
	subpd	 %xmm1, %xmm12
	movddup	14 * SIZE(BO), %xmm1
	mulpd	 %xmm15, %xmm1
	subpd	 %xmm1, %xmm13

	movddup	13 * SIZE(BO), %xmm2
	mulpd	 %xmm14, %xmm2		# col1 -= b31 * col3
	subpd	 %xmm2, %xmm10
	movddup	13 * SIZE(BO), %xmm2
	mulpd	 %xmm15, %xmm2
	subpd	 %xmm2, %xmm11

	movddup	12 * SIZE(BO), %xmm3
	mulpd	 %xmm14, %xmm3		# col0 -= b30 * col3
	subpd	 %xmm3, %xmm8
	movddup	12 * SIZE(BO), %xmm3
	mulpd	 %xmm15, %xmm3
	subpd	 %xmm3, %xmm9

	movddup	10 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm12		# col2 *= d22
	mulpd	 %xmm0, %xmm13

	movddup	 9 * SIZE(BO), %xmm1
	mulpd	 %xmm12, %xmm1		# col1 -= b21 * col2
	subpd	 %xmm1, %xmm10
	movddup	 9 * SIZE(BO), %xmm1
	mulpd	 %xmm13, %xmm1
	subpd	 %xmm1, %xmm11

	movddup	 8 * SIZE(BO), %xmm2
	mulpd	 %xmm12, %xmm2		# col0 -= b20 * col2
	subpd	 %xmm2, %xmm8
	movddup	 8 * SIZE(BO), %xmm2
	mulpd	 %xmm13, %xmm2
	subpd	 %xmm2, %xmm9

	movddup	 5 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10		# col1 *= d11
	mulpd	 %xmm0, %xmm11

	movddup	 4 * SIZE(BO), %xmm1
	mulpd	 %xmm10, %xmm1		# col0 -= b10 * col1
	subpd	 %xmm1, %xmm8
	movddup	 4 * SIZE(BO), %xmm1
	mulpd	 %xmm11, %xmm1
	subpd	 %xmm1, %xmm9

	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8		# col0 *= d00
	mulpd	 %xmm0, %xmm9
#endif
1251
/* Write the solved 4x4 tile to C, store it back into the packed buffer
   (B for LN/LT, A for RN/RT) for later updates, then advance the C/AO/BO
   pointers and the KK offset for the next tile in the M loop. */
#ifdef LN
	subq	$4 * SIZE, CO1		# LN fills C right-to-left: step back first
	subq	$4 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	/* Tile lives in xmm1/5/9/13 (cols CO1, CO2) and xmm3/7/11/15
	   (cols CO1+2*LDC, CO2+2*LDC): low half -> first column,
	   high half -> second column. */
	movsd	%xmm1,  0 * SIZE(CO1)
	movsd	%xmm5,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movsd	%xmm13, 3 * SIZE(CO1)

	movhpd	%xmm1,  0 * SIZE(CO2)
	movhpd	%xmm5,  1 * SIZE(CO2)
	movhpd	%xmm9,  2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)

	movsd	%xmm3,  0 * SIZE(CO1, LDC, 2)
	movsd	%xmm7,  1 * SIZE(CO1, LDC, 2)
	movsd	%xmm11, 2 * SIZE(CO1, LDC, 2)
	movsd	%xmm15, 3 * SIZE(CO1, LDC, 2)

	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm7,  1 * SIZE(CO2, LDC, 2)
	movhpd	%xmm11, 2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15, 3 * SIZE(CO2, LDC, 2)
#else
	/* RN/RT layout: xmm8/9 = col CO1, xmm10/11 = col CO2,
	   xmm12/13 and xmm14/15 = the two LDC*2 columns. */
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	%xmm10,  0 * SIZE(CO2)
	movhpd	%xmm10,  1 * SIZE(CO2)
	movsd	%xmm11,  2 * SIZE(CO2)
	movhpd	%xmm11,  3 * SIZE(CO2)

	movsd	%xmm12,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm12,  1 * SIZE(CO1, LDC, 2)
	movsd	%xmm13,  2 * SIZE(CO1, LDC, 2)
	movhpd	%xmm13,  3 * SIZE(CO1, LDC, 2)

	movsd	%xmm14,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm14,  1 * SIZE(CO2, LDC, 2)
	movsd	%xmm15,  2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15,  3 * SIZE(CO2, LDC, 2)
#endif

/* Store the solved tile back to the packed panel it was loaded from. */
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(BO)
	movapd	%xmm3,   2 * SIZE(BO)
	movapd	%xmm5,   4 * SIZE(BO)
	movapd	%xmm7,   6 * SIZE(BO)
	movapd	%xmm9,   8 * SIZE(BO)
	movapd	%xmm11, 10 * SIZE(BO)
	movapd	%xmm13, 12 * SIZE(BO)
	movapd	%xmm15, 14 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm9,   2 * SIZE(AO)
	movapd	%xmm10,  4 * SIZE(AO)
	movapd	%xmm11,  6 * SIZE(AO)
	movapd	%xmm12,  8 * SIZE(AO)
	movapd	%xmm13, 10 * SIZE(AO)
	movapd	%xmm14, 12 * SIZE(AO)
	movapd	%xmm15, 14 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1		# forward variants advance C instead
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* Advance AO/BO over the remaining (K - KK) x 4 panel tails. */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$4, KK			# LN consumed 4 rows from the end
#endif

#ifdef LT
	addq	$4, KK			# LT consumed 4 rows from the front
#endif

#ifdef RT
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# AORIG += K * 4 elements
#endif

	decq	I			# i --
	jg	.L11
	ALIGN_4
1349
/* M & 2 path: compute a 2-row x 4-column tile.  Accumulators:
   xmm0..xmm3 = the four B columns, each holding the two A rows. */
.L20:
	testq	$2, M
	BRANCH
	je	.L30
	ALIGN_4

.L21:

#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG		# rewind AORIG by K * 2 elements
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO	# AO = AORIG + KK * 2
	leaq	(B,  %rax, 4), BO	# BO = B + KK * 4
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear the four accumulators
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax		# trip count = K - KK
#endif
	sarq	$3, %rax		# main loop is unrolled 8x over k
	je	.L25
	ALIGN_4

/* 8x-unrolled rank-1 update loop: per k step, broadcast four B values
   (movddup) and accumulate against the 2-wide A column in xmm8/xmm10.
   Loads of the next iteration's data are interleaved for latency hiding. */
.L22:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	24 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	17 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm1
	movddup	18 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm2
	movddup	19 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm3
	movddup	20 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	21 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm1
	movddup	22 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm2
	movddup	23 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm3
	movddup	32 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	25 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	26 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	27 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	28 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	29 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	30 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	31 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	40 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO		# 8 k-steps * 2 rows
	addq   $32 * SIZE, BO		# 8 k-steps * 4 cols
	decq   %rax
	jne    .L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L29
	ALIGN_4

/* One k-step at a time for the leftover iterations. */
.L26:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L26
	ALIGN_4
1538
/* Rewind pointers to the 2x4 tile, fold the accumulators into the packed
   data, run the triangular solve for the active variant, then write back. */
.L29:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax		# step back over this 2-row block
#else
	subq	$4, %rax		# step back over this 4-column block
#endif

	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
	/* Transpose accumulators from column pairs to row pairs, then
	   subtract from the packed B panel: xmm1/3 = row 0, xmm5/7 = row 1. */
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm10
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm10

	movapd	 0 * SIZE(BO), %xmm1
	movapd	 2 * SIZE(BO), %xmm3
	movapd	 4 * SIZE(BO), %xmm5
	movapd	 6 * SIZE(BO), %xmm7

	subpd	%xmm0,  %xmm1
	subpd	%xmm2,  %xmm3
	subpd	%xmm8,  %xmm5
	subpd	%xmm10, %xmm7
#else
	/* Subtract accumulators from the packed A panel (no transpose). */

	movapd	 0 * SIZE(AO), %xmm8
	movapd	 2 * SIZE(AO), %xmm10
	movapd	 4 * SIZE(AO), %xmm12
	movapd	 6 * SIZE(AO), %xmm14

	subpd	%xmm0, %xmm8
	subpd	%xmm1, %xmm10
	subpd	%xmm2, %xmm12
	subpd	%xmm3, %xmm14
#endif

#ifdef LN
	/* 2x2 backward substitution with the A factor (diagonal at 3, 0;
	   off-diagonal at 2); diagonals applied with mulpd only, presumably
	   pre-inverted -- TODO confirm. */
	movddup	 3 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5
	mulpd	 %xmm0, %xmm7

	movddup	 2 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm1
	movddup	 2 * SIZE(AO), %xmm2
	mulpd	 %xmm7, %xmm2
	subpd	 %xmm2, %xmm3

	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1
	mulpd	 %xmm0, %xmm3
#endif

#ifdef LT
	/* 2x2 forward substitution (diagonal at 0, 3; off-diagonal at 1). */
	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1
	mulpd	 %xmm0, %xmm3

	movddup	 1 * SIZE(AO), %xmm2
	mulpd	 %xmm1, %xmm2
	subpd	 %xmm2, %xmm5
	movddup	 1 * SIZE(AO), %xmm2
	mulpd	 %xmm3, %xmm2
	subpd	 %xmm2, %xmm7

	movddup	 3 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5
	mulpd	 %xmm0, %xmm7
#endif

#ifdef RN
	/* 4x4 forward substitution with the B factor; one register per
	   column this time (xmm8/10/12/14). */
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8

	movddup	 1 * SIZE(BO), %xmm1
	mulpd	 %xmm8, %xmm1
	subpd	 %xmm1, %xmm10
	movddup	 2 * SIZE(BO), %xmm2
	mulpd	 %xmm8, %xmm2
	subpd	 %xmm2, %xmm12
	movddup	 3 * SIZE(BO), %xmm3
	mulpd	 %xmm8, %xmm3
	subpd	 %xmm3, %xmm14

	movddup	 5 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10
	movddup	 6 * SIZE(BO), %xmm1
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm12
	movddup	 7 * SIZE(BO), %xmm2
	mulpd	 %xmm10, %xmm2
	subpd	 %xmm2, %xmm14

	movddup	10 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm12

	movddup	11 * SIZE(BO), %xmm1
	mulpd	 %xmm12, %xmm1
	subpd	 %xmm1, %xmm14

	movddup	15 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm14
#endif

#ifdef RT
	/* 4x4 backward substitution (mirror of RN above). */
	movddup	15 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm14

	movddup	14 * SIZE(BO), %xmm1
	mulpd	 %xmm14, %xmm1
	subpd	 %xmm1, %xmm12
	movddup	13 * SIZE(BO), %xmm2
	mulpd	 %xmm14, %xmm2
	subpd	 %xmm2, %xmm10
	movddup	12 * SIZE(BO), %xmm3
	mulpd	 %xmm14, %xmm3
	subpd	 %xmm3, %xmm8

	movddup	10 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm12
	movddup	 9 * SIZE(BO), %xmm1
	mulpd	 %xmm12, %xmm1
	subpd	 %xmm1, %xmm10
	movddup	 8 * SIZE(BO), %xmm2
	mulpd	 %xmm12, %xmm2
	subpd	 %xmm2, %xmm8

	movddup	 5 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10
	movddup	 4 * SIZE(BO), %xmm1
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm8

	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8
#endif

#ifdef LN
	subq	$2 * SIZE, CO1		# LN: step C pointers back first
	subq	$2 * SIZE, CO2
#endif

/* Write the 2x4 tile to C: low halves -> CO1 column, high halves -> CO2. */
#if defined(LN) || defined(LT)
	movsd	%xmm1,  0 * SIZE(CO1)
	movsd	%xmm5,  1 * SIZE(CO1)
	movhpd	%xmm1,  0 * SIZE(CO2)
	movhpd	%xmm5,  1 * SIZE(CO2)

	movsd	%xmm3,  0 * SIZE(CO1, LDC, 2)
	movsd	%xmm7,  1 * SIZE(CO1, LDC, 2)
	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm7,  1 * SIZE(CO2, LDC, 2)
#else
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm10,  0 * SIZE(CO2)
	movhpd	%xmm10,  1 * SIZE(CO2)

	movsd	%xmm12,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm12,  1 * SIZE(CO1, LDC, 2)
	movsd	%xmm14,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm14,  1 * SIZE(CO2, LDC, 2)
#endif

/* Store solved tile back into the packed panel. */
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(BO)
	movapd	%xmm3,   2 * SIZE(BO)
	movapd	%xmm5,   4 * SIZE(BO)
	movapd	%xmm7,   6 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm10,  2 * SIZE(AO)
	movapd	%xmm12,  4 * SIZE(AO)
	movapd	%xmm14,  6 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) panel tails. */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# AORIG += K * 2 elements
#endif
	ALIGN_4
1752
/* M & 1 path: compute a 1-row x 4-column tile.  Here the roles flip:
   A values are broadcast (movddup) and B columns are loaded pairwise, so
   xmm0 holds result columns {0,1} and xmm1 holds columns {2,3}. */
.L30:
	testq	$1, M
	je	.L39
	ALIGN_4

.L31:
#ifdef LN
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       subq	%rax, AORIG		# rewind AORIG by K elements
#endif


#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO	# AO = AORIG + KK
	leaq	(B,  %rax, 4), BO	# BO = B + KK * 4
#else
	movq	B, BO
#endif

	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear accumulators
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax		# trip count = K - KK
#endif
	sarq	$3, %rax		# unrolled 8x over k
	je	.L35
	ALIGN_4

/* 8x-unrolled loop: broadcast a(k), multiply against B column pairs. */
.L32:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movapd	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movddup	 3 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movapd	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movapd	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movddup	 8 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movapd	24 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movapd	18 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movapd	20 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movapd	22 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movddup	 6 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movapd	32 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movapd	26 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movddup	 7 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	28 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movapd	30 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movddup	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	40 * SIZE(BO), %xmm11

	addq   $ 8 * SIZE, AO		# 8 k-steps * 1 row
	addq   $32 * SIZE, BO		# 8 k-steps * 4 cols
	decq   %rax
	jne    .L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L38
	ALIGN_4

/* Single k-step remainder loop. */
.L36:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L36
	ALIGN_4
1886
/* Rewind to the 1x4 tile, subtract the accumulators from the packed data,
   solve, write back, and close out the N=4 column loop (.L39). */
.L38:

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax		# step back over this single row
#else
	subq	$4, %rax		# step back over this 4-column block
#endif

	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 4), BO
#endif


/* xmm2 = result cols {0,1}, xmm3 = cols {2,3} after the subtraction;
   both branches perform the same packed-buffer subtraction. */
#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	movapd	 2 * SIZE(BO), %xmm3

	subpd	%xmm0,  %xmm2
	subpd	%xmm1,  %xmm3
#else
	movapd	 0 * SIZE(AO), %xmm2
	movapd	 2 * SIZE(AO), %xmm3

	subpd	%xmm0, %xmm2
	subpd	%xmm1, %xmm3
#endif

#ifdef LN
	/* 1x1 factor: just scale by the (presumably pre-inverted) diagonal. */
	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif

#ifdef LT
	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif

#ifdef RN
	/* Scalar 4x4 forward substitution.  Split the packed pairs:
	   c0 = xmm2(lo), c1 = xmm0, c2 = xmm3(lo), c3 = xmm1. */
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
        unpckhpd %xmm1, %xmm1

	movsd	 0 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(BO), %xmm5
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 2 * SIZE(BO), %xmm6
	mulsd	 %xmm2, %xmm6
	subsd	 %xmm6, %xmm3
	movsd	 3 * SIZE(BO), %xmm7
	mulsd	 %xmm2, %xmm7
	subsd	 %xmm7, %xmm1

	movsd	 5 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 6 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	 7 * SIZE(BO), %xmm6
	mulsd	 %xmm0, %xmm6
	subsd	 %xmm6, %xmm1

	movsd	10 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm3

	movsd	11 * SIZE(BO), %xmm5
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm1

	movsd	15 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm1

	unpcklpd %xmm0, %xmm2		# repack {c0,c1} and {c2,c3}
	unpcklpd %xmm1, %xmm3
#endif

#ifdef RT
	/* Scalar 4x4 backward substitution (mirror of RN above). */
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
        unpckhpd %xmm1, %xmm1

	movsd	15 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm1

	movsd	14 * SIZE(BO), %xmm5
	mulsd	 %xmm1, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	13 * SIZE(BO), %xmm6
	mulsd	 %xmm1, %xmm6
	subsd	 %xmm6, %xmm0
	movsd	12 * SIZE(BO), %xmm7
	mulsd	 %xmm1, %xmm7
	subsd	 %xmm7, %xmm2

	movsd	10 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm3

	movsd	 9 * SIZE(BO), %xmm5
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 8 * SIZE(BO), %xmm6
	mulsd	 %xmm3, %xmm6
	subsd	 %xmm6, %xmm2

	movsd	 5 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 4 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2

	movsd	 0 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2		# repack {c0,c1} and {c2,c3}
	unpcklpd %xmm1, %xmm3

#endif

#ifdef LN
	subq	$1 * SIZE, CO1		# LN: step C pointers back first
	subq	$1 * SIZE, CO2
#endif

/* Write the 1x4 tile to C; both branches are identical for one row. */
#if defined(LN) || defined(LT)
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)
	movsd	%xmm3,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
#else
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)
	movsd	%xmm3,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
#endif

/* Store the solved values back into the packed panel. */
#if defined(LN) || defined(LT)
	movapd	%xmm2,   0 * SIZE(BO)
	movapd	%xmm3,   2 * SIZE(BO)
#else
	movapd	%xmm2,   0 * SIZE(AO)
	movapd	%xmm3,   2 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) panel tails. */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# AORIG += K elements
#endif
	ALIGN_4

/* End of the M loop for this 4-column block: update B/KK and loop on J. */
.L39:
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 4), B		# B += K * 4 elements
#endif
#if defined(LT) || defined(RN)
	movq	BO, B			# BO already points past the panel
#endif

#ifdef RN
	addq	$4, KK
#endif

#ifdef RT
	subq	$4, KK
#endif

	decq	J			# j --
	jg	.L10
	ALIGN_4
2092
/* N & 2 section: handle two remaining columns of C.  Same overall scheme
   as the 4-column section, with 2-wide B panels and CO1/CO2 only. */
.L40:
	testq	$2, N
	je	.L80
	ALIGN_4

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, B			# RT: step B back over a K x 2 panel

       leaq	(, LDC, 2), %rax
       subq	%rax, C			# and C back by two columns
#endif

	movq	C, CO1
	leaq	(C, LDC, 1), CO2
#ifndef RT
	leaq	(C, LDC, 2), C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK		# LN: KK starts at M + OFFSET
#endif

	movq	K, %rax
	salq	$BASE_SHIFT + 1, %rax
	leaq	(B, %rax), BB		# BB = end of the K x 2 B panel (prefetch ptr)

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L60
	ALIGN_4

/* 4-row x 2-column tile.  Accumulators: xmm0/xmm1 = cols 0/1 of rows 0-1,
   xmm4/xmm5 = cols 0/1 of rows 2-3. */
.L51:
#ifdef LN
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       subq	%rax, AORIG		# rewind AORIG by K * 4 elements
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO	# AO = AORIG + KK * 4
	leaq	(B,  %rax, 2), BO	# BO = B + KK * 2
#else
	movq	B, BO
#endif

	prefetcht0	  0 * SIZE(BB)
	subq	   $-4 * SIZE, BB	# advance the B prefetch pointer

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear the four accumulators
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5

#ifdef HAVE_3DNOW
	prefetchw      4 * SIZE(CO1)
	prefetchw      4 * SIZE(CO2)
#else
	prefetchnta     4 * SIZE(CO1)
	prefetchnta     4 * SIZE(CO2)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax		# trip count = K - KK
#endif
	sarq	$3, %rax		# unrolled 8x over k
	je	.L55
	ALIGN_4
2185
/* 8x-unrolled k loop for the 4x2 tile: broadcast b(k,0)/b(k,1) and
   accumulate against the two 2-wide halves of the A column. */
.L52:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm5
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm5
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm5
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm5
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	18 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 8 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	22 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	24 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	26 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	28 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	30 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	24 * SIZE(BO), %xmm11

	addq   $32 * SIZE, AO		# 8 k-steps * 4 rows
	addq   $16 * SIZE, BO		# 8 k-steps * 2 cols
	decq   %rax
	jne    .L52
	ALIGN_4

.L55:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L59
	ALIGN_4

/* Single k-step remainder loop for the 4x2 tile. */
.L56:
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 0 * SIZE(BO), %xmm11
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm4
	movddup	 1 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm5

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L56
	ALIGN_4
2341
/* Rewind to the 4x2 tile, subtract accumulators from the packed data,
   run the variant's triangular solve, write back, and loop over I. */
.L59:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax		# step back over this 4-row block
#else
	subq	$2, %rax		# step back over this 2-column block
#endif
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	/* Transpose column accumulators into rows, subtract from packed B:
	   xmm1/5/9/13 = rows 0..3 (low half = col CO1, high half = CO2). */
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm4, %xmm12
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm12

	movapd	 0 * SIZE(BO), %xmm1
	movapd	 2 * SIZE(BO), %xmm5
	movapd	 4 * SIZE(BO), %xmm9
	movapd	 6 * SIZE(BO), %xmm13

	subpd	%xmm0,  %xmm1
	subpd	%xmm8,  %xmm5
	subpd	%xmm4,  %xmm9
	subpd	%xmm12, %xmm13
#else
	/* Subtract from packed A: xmm8/9 = col 0, xmm10/11 = col 1. */

	movapd	 0 * SIZE(AO), %xmm8
	movapd	 2 * SIZE(AO), %xmm9
	movapd	 4 * SIZE(AO), %xmm10
	movapd	 6 * SIZE(AO), %xmm11

	subpd	%xmm0, %xmm8
	subpd	%xmm4, %xmm9
	subpd	%xmm1, %xmm10
	subpd	%xmm5, %xmm11
#endif


#ifdef LN
	/* 4x4 backward substitution with the A factor (diagonals at
	   15, 10, 5, 0; applied with mulpd only, presumably pre-inverted
	   -- TODO confirm). */
	movddup	15 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm13

	movddup	14 * SIZE(AO), %xmm2
	mulpd	 %xmm13, %xmm2
	subpd	 %xmm2, %xmm9
	movddup	13 * SIZE(AO), %xmm4
	mulpd	 %xmm13, %xmm4
	subpd	 %xmm4, %xmm5
	movddup	12 * SIZE(AO), %xmm6
	mulpd	 %xmm13, %xmm6
	subpd	 %xmm6, %xmm1

	movddup	10 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm9
	movddup	 9 * SIZE(AO), %xmm2
	mulpd	 %xmm9, %xmm2
	subpd	 %xmm2, %xmm5
	movddup	 8 * SIZE(AO), %xmm4
	mulpd	 %xmm9, %xmm4
	subpd	 %xmm4, %xmm1

	movddup	 5 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5
	movddup	 4 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm1

	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1
#endif


#ifdef LT
	/* 4x4 forward substitution with the A factor (mirror of LN). */
	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm1

	movddup	 1 * SIZE(AO), %xmm2
	mulpd	 %xmm1, %xmm2
	subpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(AO), %xmm4
	mulpd	 %xmm1, %xmm4
	subpd	 %xmm4, %xmm9
	movddup	 3 * SIZE(AO), %xmm6
	mulpd	 %xmm1, %xmm6
	subpd	 %xmm6, %xmm13

	movddup	 5 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm5

	movddup	 6 * SIZE(AO), %xmm2
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm9
	movddup	 7 * SIZE(AO), %xmm4
	mulpd	 %xmm5, %xmm4
	subpd	 %xmm4, %xmm13

	movddup	10 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm9

	movddup	11 * SIZE(AO), %xmm2
	mulpd	 %xmm9, %xmm2
	subpd	 %xmm2, %xmm13

	movddup	15 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm13
#endif

#ifdef RN
	/* 2x2 forward substitution with the B factor. */
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8
	mulpd	 %xmm0, %xmm9

	movddup	 1 * SIZE(BO), %xmm1
	mulpd	 %xmm8, %xmm1
	subpd	 %xmm1, %xmm10
	movddup	 1 * SIZE(BO), %xmm1
	mulpd	 %xmm9, %xmm1
	subpd	 %xmm1, %xmm11

	movddup	 3 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10
	mulpd	 %xmm0, %xmm11
#endif

#ifdef RT
	/* 2x2 backward substitution with the B factor. */
	movddup	 3 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10
	mulpd	 %xmm0, %xmm11

	movddup	 2 * SIZE(BO), %xmm1
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm8
	movddup	 2 * SIZE(BO), %xmm1
	mulpd	 %xmm11, %xmm1
	subpd	 %xmm1, %xmm9

	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8
	mulpd	 %xmm0, %xmm9
#endif

#ifdef LN
	subq	$4 * SIZE, CO1		# LN: step C pointers back first
	subq	$4 * SIZE, CO2
#endif

/* Write the 4x2 tile: low halves -> CO1 column, high halves -> CO2. */
#if defined(LN) || defined(LT)
	movsd	%xmm1,  0 * SIZE(CO1)
	movsd	%xmm5,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movsd	%xmm13, 3 * SIZE(CO1)

	movhpd	%xmm1,  0 * SIZE(CO2)
	movhpd	%xmm5,  1 * SIZE(CO2)
	movhpd	%xmm9,  2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)
#else
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	%xmm10,  0 * SIZE(CO2)
	movhpd	%xmm10,  1 * SIZE(CO2)
	movsd	%xmm11,  2 * SIZE(CO2)
	movhpd	%xmm11,  3 * SIZE(CO2)
#endif

/* Store the solved tile back into the packed panel. */
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(BO)
	movapd	%xmm5,   2 * SIZE(BO)
	movapd	%xmm9,   4 * SIZE(BO)
	movapd	%xmm13,  6 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm9,   2 * SIZE(AO)
	movapd	%xmm10,  4 * SIZE(AO)
	movapd	%xmm11,  6 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) panel tails. */
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$4, KK
#endif

#ifdef LT
	addq	$4, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# AORIG += K * 4 elements
#endif

	decq	I			# i --
	jg	.L51
	ALIGN_4
2561
/* N & 2, M & 2 path: 2-row x 2-column tile.
   (Accumulator loop .L62 continues below.) */
.L60:
	testq	$2, M
	je	.L70
	ALIGN_4

.L61:
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG		# rewind AORIG by K * 2 elements
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO	# AO = AORIG + KK * 2
	leaq	(B,  %rax, 2), BO	# BO = B + KK * 2
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear accumulators
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax		# trip count = K - KK
#endif
	sarq	$3, %rax		# unrolled 8x over k
	je	.L65
	ALIGN_4
2602
.L62:
/* 2x2 main loop: 8 k-steps per iteration, two interleaved streams
   (xmm8/xmm9 and xmm10/xmm11) with loads pipelined one step ahead. */
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	16 * SIZE(AO), %xmm8	# preload for next iteration
	addpd	%xmm9, %xmm3
	movddup	16 * SIZE(BO), %xmm9	# preload for next iteration
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	24 * SIZE(AO), %xmm10	# preload for next iteration
	addpd	%xmm11, %xmm3
	movddup	24 * SIZE(BO), %xmm11	# preload for next iteration

	addq   $16 * SIZE, AO		# 8 k-steps * 2 rows
	addq   $16 * SIZE, BO		# 8 k-steps * 2 cols
	decq   %rax
	jne    .L62
	ALIGN_4
2667
.L65:
/* 2x2 remainder: handle the k % 8 leftover steps one at a time. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder count = k & 7
	BRANCH
	je .L69
	ALIGN_4

.L66:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L66
	ALIGN_4
2694
.L69:
/* Fold the two accumulator streams, then (LN/RT) point AO/BO at the
   2x2 block to be solved: both sides step back by 2, so the former
   `#ifdef LN subq $2 #else subq $2` was redundant and is collapsed.  */
	addpd	%xmm2, %xmm0		# xmm0 += xmm2 (column-0 partials)
	addpd	%xmm3, %xmm1		# xmm1 += xmm3 (column-1 partials)

#if defined(LN) || defined(RT)
	movq	KK, %rax
	subq	$2, %rax		# block is 2 wide in both M (LN) and N (RT)
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO	# AO = AORIG + rax * 2 elements
	leaq	(B,  %rax, 2), BO	# BO = B     + rax * 2 elements
#endif
2711
/* Load the stored 2x2 block and subtract the accumulated product.
   LN/LT view it through the packed B buffer (needs a transpose via
   unpack); RN/RT read it from the packed A buffer directly.         */
#if defined(LN) || defined(LT)
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0		# xmm0 = row 0 of the 2x2 product
	unpckhpd %xmm1, %xmm8		# xmm8 = row 1 of the 2x2 product

	movapd	 0 * SIZE(BO), %xmm1
	movapd	 2 * SIZE(BO), %xmm5

	subpd	%xmm0,  %xmm1		# xmm1 = B0 - product row 0
	subpd	%xmm8,  %xmm5		# xmm5 = B1 - product row 1
#else

	movapd	 0 * SIZE(AO), %xmm8
	movapd	 2 * SIZE(AO), %xmm10

	subpd	%xmm0, %xmm8		# xmm8  = A0 - product col 0
	subpd	%xmm1, %xmm10		# xmm10 = A1 - product col 1
#endif
2730
/* 2x2 triangular substitution.  movddup broadcasts one factor entry
   to both lanes; each branch scales by a diagonal entry, then
   eliminates the off-diagonal term against the other row/column.
   NOTE(review): diagonal entries appear to be stored pre-inverted
   (multiply, not divide) -- confirm against the packing routine.     */
#ifdef LN
	movddup	 3 * SIZE(AO), %xmm0	# a(1,1)
	mulpd	 %xmm0, %xmm5
	movddup	 2 * SIZE(AO), %xmm2	# a(1,0)
	mulpd	 %xmm5, %xmm2
	subpd	 %xmm2, %xmm1		# eliminate row 1 from row 0

	movddup	 0 * SIZE(AO), %xmm0	# a(0,0)
	mulpd	 %xmm0, %xmm1
#endif

#ifdef LT
	movddup	 0 * SIZE(AO), %xmm0	# a(0,0)
	mulpd	 %xmm0, %xmm1

	movddup	 1 * SIZE(AO), %xmm2	# a(0,1)
	mulpd	 %xmm1, %xmm2
	subpd	 %xmm2, %xmm5		# eliminate row 0 from row 1

	movddup	 3 * SIZE(AO), %xmm0	# a(1,1)
	mulpd	 %xmm0, %xmm5
#endif

#ifdef RN
	movddup	 0 * SIZE(BO), %xmm0	# b(0,0)
	mulpd	 %xmm0, %xmm8

	movddup	 1 * SIZE(BO), %xmm1	# b(0,1)
	mulpd	 %xmm8, %xmm1
	subpd	 %xmm1, %xmm10		# eliminate col 0 from col 1

	movddup	 3 * SIZE(BO), %xmm0	# b(1,1)
	mulpd	 %xmm0, %xmm10
#endif

#ifdef RT
	movddup	 3 * SIZE(BO), %xmm0	# b(1,1)
	mulpd	 %xmm0, %xmm10

	movddup	 2 * SIZE(BO), %xmm1	# b(1,0)
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm8		# eliminate col 1 from col 0

	movddup	 0 * SIZE(BO), %xmm0	# b(0,0)
	mulpd	 %xmm0, %xmm8
#endif
2777
/* Store the solved 2x2 block to C (two columns CO1/CO2) and write it
   back into the packed buffer, then advance pointers and KK.        */
#ifdef LN
	subq	$2 * SIZE, CO1		# LN walks C backwards
	subq	$2 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm1,  0 * SIZE(CO1)	# low lanes  -> column 1
	movsd	%xmm5,  1 * SIZE(CO1)
	movhpd	%xmm1,  0 * SIZE(CO2)	# high lanes -> column 2
	movhpd	%xmm5,  1 * SIZE(CO2)
#else
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)
#endif

/* Keep the packed panel consistent for later blocks of the solve. */
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(BO)
	movapd	%xmm5,   2 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm10,  2 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO	# skip the unused K-KK tail of A
	leaq	(BO, %rax, 2), BO	# skip the unused K-KK tail of B
#endif

#ifdef LN
	subq	$2, KK			# two rows consumed
#endif

#ifdef LT
	addq	$2, KK			# two rows consumed
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# next 2-row panel of A
#endif
	ALIGN_4
2830
.L70:
/* N=2 panel: dispatch the M % 2 == 1 leftover row, if present. */
	testq	$1, M
	je	.L79			# no single-row remainder -> panel done
	ALIGN_4
2835
.L71:
/* 1x2 micro-kernel setup: one A row against two B columns. */
#ifdef LN
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax	# rax = K * 1 * sizeof(element)
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO	# AO += KK * 1 element
	leaq	(B,  %rax, 2), BO	# BO  = B + KK * 2 elements
#else
	movq	B, BO
#endif

	/* A values are broadcast (movddup), B pairs loaded packed. */
	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# rax = k / 8 (loop unrolled by 8)
	je	.L75
	ALIGN_4
2871
.L72:
/* 1x2 main loop: 8 k-steps per iteration, four packed accumulators
   (xmm0..xmm3) folded together at .L78.                             */
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	movapd	16 * SIZE(BO), %xmm9	# preload for next iteration
	addpd	%xmm8, %xmm1
	movddup	 2 * SIZE(AO), %xmm8
	mulpd	 4 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm2
	movddup	 3 * SIZE(AO), %xmm8
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3
	movddup	 8 * SIZE(AO), %xmm8	# preload for next iteration
	mulpd	%xmm10, %xmm11
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm0
	mulpd	10 * SIZE(BO), %xmm10
	movapd	24 * SIZE(BO), %xmm11	# preload for next iteration
	addpd	%xmm10, %xmm1
	movddup	 6 * SIZE(AO), %xmm10
	mulpd	12 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm2
	movddup	 7 * SIZE(AO), %xmm10
	mulpd	14 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm3
	movddup	12 * SIZE(AO), %xmm10	# preload for next iteration

	addq   $ 8 * SIZE, AO		# 8 k-steps * 1 row
	addq   $16 * SIZE, BO		# 8 k-steps * 2 cols
	decq   %rax
	jne    .L72
	ALIGN_4
2906
.L75:
/* 1x2 remainder: handle the k % 8 leftover steps one at a time. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder count = k & 7
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L76
	ALIGN_4
2930
.L78:
/* Fold accumulators, then (LN/RT) point AO/BO at the block to solve.
   The step-back differs by mode: LN rewinds by the 1 row of this
   block, RT by the 2 columns of the current N panel.                */
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax		# M-block width = 1
#else
	subq	$2, %rax		# N-panel width = 2
#endif
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif
2948
/* Load stored 1x2 block, subtract the product, then substitute.
   Left-side modes scale both C entries by the single diagonal
   value; right-side modes do a scalar 2x2 substitution in the
   low/high lanes of xmm2.                                           */
#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	subpd	%xmm0, %xmm2		# xmm2 = B - product
#else
	movapd	 0 * SIZE(AO), %xmm2
	subpd	%xmm0, %xmm2		# xmm2 = A - product
#endif

#ifdef LN
	movddup	 0 * SIZE(AO), %xmm0	# a(0,0) broadcast
	mulpd	 %xmm0, %xmm2
#endif

#ifdef LT
	movddup	 0 * SIZE(AO), %xmm0	# a(0,0) broadcast
	mulpd	 %xmm0, %xmm2
#endif

#ifdef RN
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0		# xmm0 = high lane (col 1 entry)

	movsd	 0 * SIZE(BO), %xmm4	# b(0,0)
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(BO), %xmm5	# b(0,1)
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0		# eliminate col 0 from col 1

	movsd	 3 * SIZE(BO), %xmm4	# b(1,1)
	mulsd	 %xmm4, %xmm0

	unpcklpd %xmm0, %xmm2		# repack [col0, col1]
#endif

#ifdef RT
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0		# xmm0 = high lane (col 1 entry)

	movsd	 3 * SIZE(BO), %xmm4	# b(1,1)
	mulsd	 %xmm4, %xmm0

	movsd	 2 * SIZE(BO), %xmm5	# b(1,0)
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2		# eliminate col 1 from col 0

	movsd	 0 * SIZE(BO), %xmm4	# b(0,0)
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2		# repack [col0, col1]
#endif
3000
/* Store the solved 1x2 result to C and back to the packed buffer.
   The former `#if defined(LN) || defined(LT)` / `#else` C-store
   branches were byte-identical, so the conditional is collapsed.    */
#ifdef LN
	subq	$1 * SIZE, CO1		# LN walks C backwards
	subq	$1 * SIZE, CO2
#endif

	movsd	%xmm2,  0 * SIZE(CO1)	# low lane  -> column 1
	movhpd	%xmm2,  0 * SIZE(CO2)	# high lane -> column 2

/* Keep the packed panel consistent for later blocks of the solve. */
#if defined(LN) || defined(LT)
	movapd	%xmm2,   0 * SIZE(BO)
#else
	movapd	%xmm2,   0 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif
3024
/* Advance AO/BO past the unused tail and update KK / AORIG. */
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO	# skip K-KK tail of A
	leaq	(BO, %rax, 2), BO	# skip K-KK tail of B
#endif

#ifdef LN
	subq	$1, KK			# one row consumed
#endif

#ifdef LT
	addq	$1, KK			# one row consumed
#endif

#ifdef RT
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# next 1-row panel of A
#endif
	ALIGN_4
3047
.L79:
/* End of the N=2 panel: advance B past the panel and update KK. */
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 2), B		# B += K * 2 elements
#endif

#if defined(LT) || defined(RN)
	movq	BO, B			# BO already points past the panel
#endif

#ifdef RN
	addq	$2, KK			# two columns consumed
#endif

#ifdef RT
	subq	$2, KK			# two columns consumed
#endif
	ALIGN_4
3066
.L80:
/* N % 2 == 1 panel: single column of C.  Set up A/C/KK for the
   panel, then loop over 4-row blocks (.L91).                        */
	testq	$1, N
	je	.L999			# no odd column -> function epilogue
	ALIGN_4

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       subq	%rax, B			# step B back one 1-column panel

       subq	LDC, C			# step C back one column
#endif


	movq	C, CO1			# single output column
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK		# KK = OFFSET + M
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK		# KK = OFFSET
#endif

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L100			# fewer than 4 rows -> remainder cases
	ALIGN_4
3107
.L91:
/* 4x1 micro-kernel setup: four A rows against one B column. */
#ifdef LN
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax	# rax = K * 4 * sizeof(element)
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO	# AO += KK * 4 elements
	leaq	(B,  %rax, 1), BO	# BO  = B + KK * 1 element
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

/* Warm the destination column in cache before the long loop. */
#ifdef HAVE_3DNOW
	prefetchw      4 * SIZE(CO1)
#else
	prefetchnta     4 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# rax = k / 8 (loop unrolled by 8)
	je	.L95
	ALIGN_4
3149
.L92:
/* 4x1 main loop: 8 k-steps per iteration; each step multiplies one
   broadcast B value against two A pairs (4 rows).                   */
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm8
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm2
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	10 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm0
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	14 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm2
	movapd	24 * SIZE(AO), %xmm10
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9	# preload for next iteration
	mulpd	%xmm11, %xmm8
	mulpd	18 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm0
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 5 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm8
	mulpd	22 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm2
	movapd	32 * SIZE(AO), %xmm8	# preload for next iteration
	addpd	%xmm11, %xmm3
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	26 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	movapd	28 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	30 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm2
	movapd	40 * SIZE(AO), %xmm10	# preload for next iteration
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11	# preload for next iteration

	addq   $32 * SIZE, AO		# 8 k-steps * 4 rows
	addq   $8 * SIZE, BO		# 8 k-steps * 1 col
	decq   %rax
	jne    .L92
	ALIGN_4
3207
.L95:
/* 4x1 remainder: handle the k % 8 leftover steps one at a time. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder count = k & 7
	BRANCH
	je .L99
	ALIGN_4

.L96:
	mulpd	%xmm9, %xmm8
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9

	addq	$4 * SIZE, AO		# aoffset += 4
	addq	$1 * SIZE, BO		# boffset += 1
	decq	%rax
	jg	.L96
	ALIGN_4
3233
.L99:
/* Fold accumulators, repoint AO/BO (LN steps back 4 rows, RT one
   column), then load the stored 4x1 block and subtract the product. */
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax		# M-block width = 4
#else
	subq	$1, %rax		# N-panel width = 1
#endif
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	movapd	 2 * SIZE(BO), %xmm3

	subpd	%xmm0,  %xmm2		# xmm2 = rows 0..1 residual
	subpd	%xmm1,  %xmm3		# xmm3 = rows 2..3 residual
#else
	movapd	 0 * SIZE(AO), %xmm2
	movapd	 2 * SIZE(AO), %xmm3

	subpd	%xmm0, %xmm2
	subpd	%xmm1, %xmm3
#endif
3265
/* 4x1 substitution against a 4x4 triangular factor of A (LN: lower,
   back-substitution from row 3; LT: upper, forward from row 0), done
   in scalar lanes.  RN/RT need only the single diagonal scale of the
   1x1 B factor, so their bodies are intentionally identical.
   NOTE(review): diagonal entries appear to be stored pre-inverted --
   confirm against the packing routine.                              */
#ifdef LN
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0		# xmm0 = row 1 entry

	movapd	%xmm3, %xmm1
        unpckhpd %xmm1, %xmm1		# xmm1 = row 3 entry

	movsd	15 * SIZE(AO), %xmm4	# a(3,3)
	mulsd	 %xmm4, %xmm1

	movsd	14 * SIZE(AO), %xmm5	# a(3,2)
	mulsd	 %xmm1, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	13 * SIZE(AO), %xmm6	# a(3,1)
	mulsd	 %xmm1, %xmm6
	subsd	 %xmm6, %xmm0
	movsd	12 * SIZE(AO), %xmm7	# a(3,0)
	mulsd	 %xmm1, %xmm7
	subsd	 %xmm7, %xmm2

	movsd	10 * SIZE(AO), %xmm4	# a(2,2)
	mulsd	 %xmm4, %xmm3

	movsd	 9 * SIZE(AO), %xmm5	# a(2,1)
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 8 * SIZE(AO), %xmm6	# a(2,0)
	mulsd	 %xmm3, %xmm6
	subsd	 %xmm6, %xmm2

	movsd	 5 * SIZE(AO), %xmm4	# a(1,1)
	mulsd	 %xmm4, %xmm0

	movsd	 4 * SIZE(AO), %xmm5	# a(1,0)
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2

	movsd	 0 * SIZE(AO), %xmm4	# a(0,0)
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2		# repack rows 0..1
	unpcklpd %xmm1, %xmm3		# repack rows 2..3
#endif

#ifdef LT
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0		# xmm0 = row 1 entry

	movapd	%xmm3, %xmm1
        unpckhpd %xmm1, %xmm1		# xmm1 = row 3 entry

	movsd	 0 * SIZE(AO), %xmm4	# a(0,0)
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(AO), %xmm5	# a(0,1)
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 2 * SIZE(AO), %xmm6	# a(0,2)
	mulsd	 %xmm2, %xmm6
	subsd	 %xmm6, %xmm3
	movsd	 3 * SIZE(AO), %xmm7	# a(0,3)
	mulsd	 %xmm2, %xmm7
	subsd	 %xmm7, %xmm1

	movsd	 5 * SIZE(AO), %xmm4	# a(1,1)
	mulsd	 %xmm4, %xmm0

	movsd	 6 * SIZE(AO), %xmm5	# a(1,2)
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	 7 * SIZE(AO), %xmm6	# a(1,3)
	mulsd	 %xmm0, %xmm6
	subsd	 %xmm6, %xmm1

	movsd	10 * SIZE(AO), %xmm4	# a(2,2)
	mulsd	 %xmm4, %xmm3

	movsd	11 * SIZE(AO), %xmm5	# a(2,3)
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm1

	movsd	15 * SIZE(AO), %xmm4	# a(3,3)
	mulsd	 %xmm4, %xmm1

	unpcklpd %xmm0, %xmm2		# repack rows 0..1
	unpcklpd %xmm1, %xmm3		# repack rows 2..3
#endif

#ifdef RN
	movddup	 0 * SIZE(BO), %xmm0	# b(0,0) broadcast
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif

#ifdef RT
	movddup	 0 * SIZE(BO), %xmm0	# b(0,0) broadcast
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif
3365
/* Store the solved 4x1 column to C and back to the packed buffer.
   The former `#if defined(LN) || defined(LT)` / `#else` C-store
   branches were byte-identical, so the conditional is collapsed.    */
#ifdef LN
	subq	$4 * SIZE, CO1		# LN walks C backwards
#endif

	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  1 * SIZE(CO1)
	movsd	%xmm3,  2 * SIZE(CO1)
	movhpd	%xmm3,  3 * SIZE(CO1)

/* Keep the packed panel consistent for later blocks of the solve. */
#if defined(LN) || defined(LT)
	movapd	%xmm2,   0 * SIZE(BO)
	movapd	%xmm3,   2 * SIZE(BO)
#else
	movapd	%xmm2,   0 * SIZE(AO)
	movapd	%xmm3,   2 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# skip K-KK tail of A
	leaq	(BO, %rax, 1), BO	# skip K-KK tail of B
#endif
3401
/* Update KK / AORIG for the next 4-row block and loop over I. */
#ifdef LN
	subq	$4, KK			# four rows consumed
#endif

#ifdef LT
	addq	$4, KK			# four rows consumed
#endif

#ifdef RT
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# next 4-row panel of A
#endif

	decq	I			# i --
	jg	.L91
	ALIGN_4
3419
.L100:
/* N=1 panel: dispatch the M % 4 == 2 leftover row pair, if present. */
	testq	$2, M
	je	.L110			# no 2-row remainder -> try 1-row case
	ALIGN_4
3424
.L101:
/* 2x1 micro-kernel setup: two A rows against one B column. */
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax	# rax = K * 2 * sizeof(element)
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO	# AO += KK * 2 elements
	leaq	(B,  %rax, 1), BO	# BO  = B + KK * 1 element
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# rax = k / 8 (loop unrolled by 8)
	je	.L105
	ALIGN_4
3460
.L102:
/* 2x1 main loop: 8 k-steps per iteration, broadcast B against
   packed A row pairs.                                               */
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(AO), %xmm9
	movapd	16 * SIZE(AO), %xmm8	# preload for next iteration
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	 4 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9	# preload for next iteration
	mulpd	%xmm11, %xmm10
	movddup	 5 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	10 * SIZE(AO), %xmm11
	movapd	24 * SIZE(AO), %xmm10	# preload for next iteration
	addpd	%xmm11, %xmm1
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	12 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm2
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	14 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11	# preload for next iteration

	addq   $16 * SIZE, AO		# 8 k-steps * 2 rows
	addq   $ 8 * SIZE, BO		# 8 k-steps * 1 col
	decq   %rax
	jne    .L102
	ALIGN_4
3495
.L105:
/* 2x1 remainder: handle the k % 8 leftover steps one at a time. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder count = k & 7
	BRANCH
	je .L109
	ALIGN_4

.L106:
	mulpd	%xmm9, %xmm8
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 2 * SIZE(AO), %xmm8

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$1 * SIZE, BO		# boffset += 1
	decq	%rax
	jg	.L106
	ALIGN_4
3519
.L109:
/* Fold accumulators, repoint AO/BO (LN steps back 2 rows, RT one
   column), then load the stored 2x1 block and subtract the product. */
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0		# xmm0 = total 2x1 product

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax		# M-block width = 2
#else
	subq	$1, %rax		# N-panel width = 1
#endif
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	subpd	%xmm0,  %xmm2		# xmm2 = B - product
#else
	movapd	 0 * SIZE(AO), %xmm2
	subpd	%xmm0, %xmm2		# xmm2 = A - product
#endif
3546
/* 2x1 substitution against the 2x2 triangular factor of A, done in
   scalar lanes; RN/RT need only the single diagonal scale of the
   1x1 B factor, so their bodies are intentionally identical.        */
#ifdef LN
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0		# xmm0 = row 1 entry

	movsd	 3 * SIZE(AO), %xmm4	# a(1,1)
	mulsd	 %xmm4, %xmm0

	movsd	 2 * SIZE(AO), %xmm5	# a(1,0)
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2		# eliminate row 1 from row 0

	movsd	 0 * SIZE(AO), %xmm4	# a(0,0)
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2		# repack [row0, row1]
#endif

#ifdef LT
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0		# xmm0 = row 1 entry

	movsd	 0 * SIZE(AO), %xmm4	# a(0,0)
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(AO), %xmm5	# a(0,1)
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0		# eliminate row 0 from row 1

	movsd	 3 * SIZE(AO), %xmm4	# a(1,1)
	mulsd	 %xmm4, %xmm0

	unpcklpd %xmm0, %xmm2		# repack [row0, row1]
#endif

#ifdef RN
	movddup	 0 * SIZE(BO), %xmm0	# b(0,0) broadcast
	mulpd	 %xmm0, %xmm2
#endif

#ifdef RT
	movddup	 0 * SIZE(BO), %xmm0	# b(0,0) broadcast
	mulpd	 %xmm0, %xmm2
#endif
3590
/* Store the solved 2x1 result to C and back to the packed buffer.
   The former `#if defined(LN) || defined(LT)` / `#else` C-store
   branches were byte-identical, so the conditional is collapsed.    */
#ifdef LN
	subq	$2 * SIZE, CO1		# LN walks C backwards
#endif

	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  1 * SIZE(CO1)

/* Keep the packed panel consistent for later blocks of the solve. */
#if defined(LN) || defined(LT)
	movapd	%xmm2,   0 * SIZE(BO)
#else
	movapd	%xmm2,   0 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif
3612
/* Advance AO/BO past the unused tail and update KK / AORIG. */
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO	# skip K-KK tail of A
	leaq	(BO, %rax, 1), BO	# skip K-KK tail of B
#endif

#ifdef LN
	subq	$2, KK			# two rows consumed
#endif

#ifdef LT
	addq	$2, KK			# two rows consumed
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# next 2-row panel of A
#endif
	ALIGN_4
3635
.L110:
/* N=1 panel: dispatch the M % 2 == 1 leftover row, if present. */
	testq	$1, M
	je	.L119			# no single-row remainder -> panel done
	ALIGN_4
3640
.L111:
/* 1x1 micro-kernel setup: a dot product of one A row with one B
   column, computed two elements at a time in packed registers.      */
#ifdef LN
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax	# rax = K * 1 * sizeof(element)
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO	# AO += KK * 1 element
	leaq	(B,  %rax, 1), BO	# BO  = B + KK * 1 element
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm9
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm8
	pxor	%xmm1, %xmm1
	movapd	 4 * SIZE(AO), %xmm11
	pxor	%xmm2, %xmm2		# note: xmm2/xmm3 are not used by this loop
	movapd	 4 * SIZE(BO), %xmm10
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# rax = k / 8 (loop unrolled by 8)
	je	.L115
	ALIGN_4
3676
.L112:
/* 1x1 main loop: 8 k-steps per iteration as four packed
   multiply-adds into xmm0/xmm1.                                     */
	mulpd	%xmm9, %xmm8
	movapd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(BO), %xmm9
	movapd	 8 * SIZE(BO), %xmm8	# preload for next iteration
	addpd	%xmm9, %xmm1
	movapd	 8 * SIZE(AO), %xmm9	# preload for next iteration
	mulpd	%xmm11, %xmm10
	movapd	 6 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	 6 * SIZE(BO), %xmm11
	movapd	12 * SIZE(BO), %xmm10	# preload for next iteration
	addpd	%xmm11, %xmm1
	movapd	12 * SIZE(AO), %xmm11	# preload for next iteration

	addq   $8 * SIZE, AO
	addq   $8 * SIZE, BO
	decq   %rax
	jne    .L112
	ALIGN_4
3698
.L115:
/* 1x1 remainder: scalar multiply-add for the k % 8 leftover steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remainder count = k & 7
	BRANCH
	je .L118
	ALIGN_4

.L116:
	mulsd	 0 * SIZE(BO), %xmm9
	addsd	%xmm9, %xmm0
	movsd	 1 * SIZE(AO), %xmm9

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$1 * SIZE, BO		# boffset += 1
	decq	%rax
	jg	.L116
	ALIGN_4
3721
.L118:
/* Reduce the packed dot product to a scalar, repoint AO/BO at the
   1x1 block, and form the residual.  The former `#ifdef LN`/`#else`
   both subtracted $1 (block is 1 wide in both M and N), so the
   redundant conditional is collapsed.                               */
	addpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0		# sum the two lanes into the low lane

#if defined(LN) || defined(RT)
	movq	KK, %rax
	subq	$1, %rax		# 1 wide in both M (LN) and N (RT)
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movsd	 0 * SIZE(BO), %xmm2
	subsd	%xmm0,  %xmm2		# xmm2 = B - product
#else
	movsd	 0 * SIZE(AO), %xmm2
	subsd	%xmm0, %xmm2		# xmm2 = A - product
#endif
3747
/* 1x1 substitution: a single diagonal scale.  All four mode bodies
   are intentionally identical up to which packed buffer holds the
   diagonal entry (A for left modes, B for right modes).             */
#ifdef LN
	movsd	 0 * SIZE(AO), %xmm4	# a(0,0)
	mulsd	 %xmm4, %xmm2
#endif

#ifdef LT
	movsd	 0 * SIZE(AO), %xmm4	# a(0,0)
	mulsd	 %xmm4, %xmm2
#endif

#ifdef RN
	movsd	 0 * SIZE(BO), %xmm0	# b(0,0)
	mulsd	 %xmm0, %xmm2
#endif

#ifdef RT
	movsd	 0 * SIZE(BO), %xmm0	# b(0,0)
	mulsd	 %xmm0, %xmm2
#endif
3767
/* Store the solved 1x1 result to C and back to the packed buffer.
   The former `#if defined(LN) || defined(LT)` / `#else` C-store
   branches were byte-identical, so the conditional is collapsed.    */
#ifdef LN
	subq	$1 * SIZE, CO1		# LN walks C backwards
#endif

	movsd	%xmm2,  0 * SIZE(CO1)

/* Keep the packed panel consistent for later blocks of the solve. */
#if defined(LN) || defined(LT)
	movsd	%xmm2,   0 * SIZE(BO)
#else
	movsd	%xmm2,   0 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
#endif
3787
/* Advance AO/BO past the unused tail and update KK / AORIG. */
#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO	# skip K-KK tail of A
	leaq	(BO, %rax, 1), BO	# skip K-KK tail of B
#endif

#ifdef LN
	subq	$1, KK			# one row consumed
#endif

#ifdef LT
	addq	$1, KK			# one row consumed
#endif

#ifdef RT
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       addq	%rax, AORIG		# next 1-row panel of A
#endif
	ALIGN_4
3810
.L119:
/* End of the N=1 panel: advance B past the panel and update KK. */
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 1), B		# B += K * 1 element
#endif

#if defined(LT) || defined(RN)
	movq	BO, B			# BO already points past the panel
#endif

#ifdef RN
	addq	$1, KK			# one column consumed
#endif

#ifdef RT
	subq	$1, KK			# one column consumed
#endif
	ALIGN_2
3829
.L999:
/* Function epilogue: restore the callee-saved GPRs spilled by the
   prologue (rbx/rbp/r12-r15; plus rdi/rsi/xmm6-xmm15 on Windows,
   which the Microsoft x64 ABI marks callee-saved), release the
   stack frame and return.                                           */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
3857