/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx
#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define J	%r12
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%rbp

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#endif

#define POSINV	  0(%rsp)
#define ALPHA_R	 16(%rsp)
#define ALPHA_I	 32(%rsp)
#define OFFSET	 40(%rsp)
#define KK	 48(%rsp)
#define KKK	 56(%rsp)
#define AORIG    64(%rsp)
#define BORIG	 72(%rsp)
#define BUFFER	128(%rsp)

#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHNTA  prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHNTA  prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif
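
/* KERNEL1..KERNEL8 below unroll one 2x2 complex block update: each
   macro multiplies a register holding two doubles of A (xmm8/10/12/14)
   against four column pairs of B, accumulates into xmm0-xmm7, and
   reloads operands early for the next macro in the pipeline. */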

#define KERNEL1(xx) \
	mulpd	%xmm8, %xmm9 ;\
	addpd	%xmm9, %xmm0 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm8, %xmm11 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm8, %xmm13 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm8, %xmm3 ;\
	movapd	 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8

#define KERNEL2(xx) \
	mulpd	%xmm10, %xmm9 ;\
	addpd	%xmm9, %xmm4 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm10, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm10, %xmm13 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm10, %xmm7 ;\
	movapd	10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10

#define KERNEL3(xx) \
	mulpd	%xmm12, %xmm15 ;\
	addpd	%xmm15, %xmm0 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm12, %xmm11 ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm12, %xmm13 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm12, %xmm3 ;\
	movapd	12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12

#define KERNEL4(xx) \
	mulpd	%xmm14, %xmm15 ;\
	addpd	%xmm15, %xmm4 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm14, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm14, %xmm13 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm14, %xmm7 ;\
	movapd	14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14

#define KERNEL5(xx) \
	mulpd	%xmm8, %xmm9 ;\
	addpd	%xmm9, %xmm0 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm8, %xmm11 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm8, %xmm13 ;\
	mulpd	22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm8, %xmm3 ;\
	movapd	16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8

#define KERNEL6(xx) \
	mulpd	%xmm10, %xmm9 ;\
	addpd	%xmm9, %xmm4 ;\
	movapd	32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm10, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm10, %xmm13 ;\
	mulpd	22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm10, %xmm7 ;\
	movapd	18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10

#define KERNEL7(xx) \
	mulpd	%xmm12, %xmm15 ;\
	addpd	%xmm15, %xmm0 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm12, %xmm11 ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm12, %xmm13 ;\
	mulpd	30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm12, %xmm3 ;\
	movapd	20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12

#define KERNEL8(xx) \
	mulpd	%xmm14, %xmm15 ;\
	addpd	%xmm15, %xmm4 ;\
	movapd	40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm14, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm14, %xmm13 ;\
	mulpd	30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm14, %xmm7 ;\
	movapd	22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14


#ifndef CONJ
#define NN
#else
#if defined(LN) || defined(LT)
#define CN
#else
#define NC
#endif
#endif
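
/* Fold the CONJ build flag and the side/transpose flags into the
   two-letter conjugation cases (NN/CN/NC) tested after the multiply
   loops. */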

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm4

	movaps	%xmm3, %xmm0

#else
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm4

#endif

	movq	%rsp, %rbx	# save old stack
	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

	pcmpeqb	%xmm15, %xmm15
	psllq	$63, %xmm15	# Generate mask
	pxor	%xmm2, %xmm2

	movlpd	  %xmm2,  0 + POSINV
	movlpd	  %xmm15, 8 + POSINV

	movlpd	%xmm4, OFFSET
	movlpd	%xmm4, KK

	salq	$ZBASE_SHIFT, LDC

#ifdef LN
       movq	M, %rax
       salq	$ZBASE_SHIFT, %rax
       addq	%rax, C
       imulq	K, %rax
       addq	%rax, A
#endif

#ifdef RT
       movq	N, %rax
       salq	$ZBASE_SHIFT, %rax
       imulq	K, %rax
       addq	%rax, B

       movq	N, %rax
       imulq	LDC, %rax
       addq	%rax, C
#endif

#ifdef RN
	negq	KK
#endif

#ifdef RT
       movq	N, %rax
       subq	OFFSET, %rax
       movq	%rax, KK
#endif

	movq	N,  J
	sarq	$1, J		# j = (n >> 1)
	jle	.L100
	ALIGN_4
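
/* Main loop: each .L01 iteration processes a panel of two columns of
   B and C (J = N >> 1). */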

.L01:
#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

	leaq	BUFFER, BO

#ifdef RT
       movq	K, %rax
       salq	$1 + ZBASE_SHIFT, %rax
       subq	%rax, B
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	B, BORIG
	salq	$ZBASE_SHIFT, %rax
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

#if defined(LT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	jle	.L03

	addq	%rax, %rax
	ALIGN_4
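
/* Copy the B panel into BUFFER, storing every double twice in adjacent
   slots so the kernel can use aligned movapd/mulpd against duplicated
   real and imaginary parts. */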

.L02:
	PREFETCHNTA	 56 * SIZE(B)

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3
	movlpd	 4 * SIZE(B), %xmm4
	movlpd	 5 * SIZE(B), %xmm5
	movlpd	 6 * SIZE(B), %xmm6
	movlpd	 7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BO)
	movlpd	%xmm0,  1 * SIZE(BO)
	movlpd	%xmm1,  2 * SIZE(BO)
	movlpd	%xmm1,  3 * SIZE(BO)
	movlpd	%xmm2,  4 * SIZE(BO)
	movlpd	%xmm2,  5 * SIZE(BO)
	movlpd	%xmm3,  6 * SIZE(BO)
	movlpd	%xmm3,  7 * SIZE(BO)
	movlpd	%xmm4,  8 * SIZE(BO)
	movlpd	%xmm4,  9 * SIZE(BO)
	movlpd	%xmm5, 10 * SIZE(BO)
	movlpd	%xmm5, 11 * SIZE(BO)
	movlpd	%xmm6, 12 * SIZE(BO)
	movlpd	%xmm6, 13 * SIZE(BO)
	movlpd	%xmm7, 14 * SIZE(BO)
	movlpd	%xmm7, 15 * SIZE(BO)

	subq	$-16 * SIZE, BO
	addq	$  8 * SIZE, B
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	BRANCH
	jle	.L05
	ALIGN_4

.L04:
	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3

	movlpd	%xmm0,  0 * SIZE(BO)
	movlpd	%xmm0,  1 * SIZE(BO)
	movlpd	%xmm1,  2 * SIZE(BO)
	movlpd	%xmm1,  3 * SIZE(BO)
	movlpd	%xmm2,  4 * SIZE(BO)
	movlpd	%xmm2,  5 * SIZE(BO)
	movlpd	%xmm3,  6 * SIZE(BO)
	movlpd	%xmm3,  7 * SIZE(BO)

	addq	$ 4 * SIZE, B
	addq	$ 8 * SIZE, BO

	decq	%rax
	jne	.L04
	ALIGN_4

.L05:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       leaq	(, LDC, 2), %rax
       subq	%rax, C
#endif

	movq	C, CO1
	leaq	(C, LDC, 1), CO2

#ifndef RT
	leaq	(C, LDC, 2), C
#endif

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L30
	ALIGN_4
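
/* .L10: multiply-and-solve for one 2x2 complex block.  xmm0-xmm7
   accumulate the eight partial products while AO and BO walk the
   packed A panel and BUFFER. */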

.L10:
#ifdef LN
       movq	K, %rax
       salq	$1 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movapd	 2 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movapd	 4 * SIZE(AO), %xmm12
	pxor	%xmm2, %xmm2
	movapd	 6 * SIZE(AO), %xmm14
	pxor	%xmm3, %xmm3

	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm4, %xmm4
	movapd	 2 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5
	movapd	 4 * SIZE(BO), %xmm13
	movapd	 8 * SIZE(BO), %xmm15

	PREFETCHW      4 * SIZE(CO1)
	pxor	%xmm6, %xmm6
	PREFETCHW      4 * SIZE(CO2)
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-8, %rax
	salq	$4, %rax
	je	.L15
.L1X:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)
	cmpq	$64 *  2, %rax
	jle	.L12
	KERNEL1(16 *  2)
	KERNEL2(16 *  2)
	KERNEL3(16 *  2)
	KERNEL4(16 *  2)
	KERNEL5(16 *  2)
	KERNEL6(16 *  2)
	KERNEL7(16 *  2)
	KERNEL8(16 *  2)
	KERNEL1(16 *  3)
	KERNEL2(16 *  3)
	KERNEL3(16 *  3)
	KERNEL4(16 *  3)
	KERNEL5(16 *  3)
	KERNEL6(16 *  3)
	KERNEL7(16 *  3)
	KERNEL8(16 *  3)
	cmpq	$64 *  4, %rax
	jle	.L12
	KERNEL1(16 *  4)
	KERNEL2(16 *  4)
	KERNEL3(16 *  4)
	KERNEL4(16 *  4)
	KERNEL5(16 *  4)
	KERNEL6(16 *  4)
	KERNEL7(16 *  4)
	KERNEL8(16 *  4)
	KERNEL1(16 *  5)
	KERNEL2(16 *  5)
	KERNEL3(16 *  5)
	KERNEL4(16 *  5)
	KERNEL5(16 *  5)
	KERNEL6(16 *  5)
	KERNEL7(16 *  5)
	KERNEL8(16 *  5)
	cmpq	$64 *  6, %rax
	jle	.L12
	KERNEL1(16 *  6)
	KERNEL2(16 *  6)
	KERNEL3(16 *  6)
	KERNEL4(16 *  6)
	KERNEL5(16 *  6)
	KERNEL6(16 *  6)
	KERNEL7(16 *  6)
	KERNEL8(16 *  6)
	KERNEL1(16 *  7)
	KERNEL2(16 *  7)
	KERNEL3(16 *  7)
	KERNEL4(16 *  7)
	KERNEL5(16 *  7)
	KERNEL6(16 *  7)
	KERNEL7(16 *  7)
	KERNEL8(16 *  7)

	addq	$16 * 8  * SIZE, AO
	addq	$32 * 8  * SIZE, BO
	subq	$64 * 8, %rax
	jg	.L1X

.L12:
	leaq	(AO, %rax, 2), AO	# * 16
	leaq	(BO, %rax, 4), BO	# * 64
	ALIGN_4
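
/* .L15/.L16: handle the k & 7 leftover iterations one at a time. */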

.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV,  %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L19
	ALIGN_4

.L16:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm2
	movapd	 0 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm3
	movapd	 4 * SIZE(AO), %xmm8
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm5
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	mulpd	 6 * SIZE(BO), %xmm10
	addpd	%xmm9, %xmm6
	movapd	 8 * SIZE(BO), %xmm9
	addpd	%xmm10, %xmm7
	movapd	 6 * SIZE(AO), %xmm10

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L16
	ALIGN_4

.L19:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif
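
/* Reduce the accumulators to complex numbers: xmm1/3/5/7 hold the
   products against the duplicated imaginary parts, so swap their
   halves, flip one sign lane with the POSINV mask, then add or
   subtract according to the conjugation case. */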

	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm3, %xmm3
	SHUFPD_1 %xmm5, %xmm5
	SHUFPD_1 %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm15, %xmm1
	xorpd	%xmm15, %xmm3
	xorpd	%xmm15, %xmm5
	xorpd	%xmm15, %xmm7
#else
	xorpd	%xmm15, %xmm0
	xorpd	%xmm15, %xmm2
	xorpd	%xmm15, %xmm4
	xorpd	%xmm15, %xmm6
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6
#else
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(B), %xmm1
	movapd	 2 * SIZE(B), %xmm3
	movapd	 4 * SIZE(B), %xmm5
	movapd	 6 * SIZE(B), %xmm7

	subpd	%xmm0,  %xmm1
	subpd	%xmm2,  %xmm3
	subpd	%xmm4,  %xmm5
	subpd	%xmm6,  %xmm7
#else
	movapd	 0 * SIZE(AO), %xmm1
	movapd	 2 * SIZE(AO), %xmm5
	movapd	 4 * SIZE(AO), %xmm3
	movapd	 6 * SIZE(AO), %xmm7

	subpd	%xmm0,  %xmm1
	subpd	%xmm2,  %xmm3
	subpd	%xmm4,  %xmm5
	subpd	%xmm6,  %xmm7
#endif

#ifndef CONJ
	SHUFPD_1 %xmm15, %xmm15
#endif
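
/* Triangular solve for this block.  Each complex multiply below uses
   the same pattern: pshufd $0x4e swaps the real/imaginary halves,
   xorpd with the sign mask negates one lane, and mulpd/addpd combine
   the partial products.  (The diagonal entries of the packed panel
   are expected to arrive pre-inverted, so the divide is done with
   multiplies.) */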

#ifdef LN
	movlpd	 6 * SIZE(AO), %xmm8
	movhpd	 6 * SIZE(AO), %xmm8
	movlpd	 7 * SIZE(AO), %xmm9
	movhpd	 7 * SIZE(AO), %xmm9
	movlpd	 4 * SIZE(AO), %xmm10
	movhpd	 4 * SIZE(AO), %xmm10
	movlpd	 5 * SIZE(AO), %xmm11
	movhpd	 5 * SIZE(AO), %xmm11
	movlpd	 0 * SIZE(AO), %xmm12
	movhpd	 0 * SIZE(AO), %xmm12
	movlpd	 1 * SIZE(AO), %xmm13
	movhpd	 1 * SIZE(AO), %xmm13

	pshufd	$0x4e, %xmm5, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm15, %xmm4
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm9, %xmm4
	mulpd	 %xmm8, %xmm7
	mulpd	 %xmm9, %xmm6

	addpd	 %xmm4, %xmm5
	addpd	 %xmm6, %xmm7

	movapd	 %xmm5, %xmm0
	movapd	 %xmm7, %xmm2
	pshufd	 $0x4e, %xmm5, %xmm4
	pshufd	 $0x4e, %xmm7, %xmm6

	xorpd	 %xmm15, %xmm4
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm10, %xmm2
	mulpd	 %xmm11, %xmm4
	mulpd	 %xmm11, %xmm6

	subpd	 %xmm0, %xmm1
	subpd	 %xmm2, %xmm3
	subpd	 %xmm4, %xmm1
	subpd	 %xmm6, %xmm3

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm3, %xmm2

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm2

	mulpd	 %xmm12, %xmm1
	mulpd	 %xmm13, %xmm0
	mulpd	 %xmm12, %xmm3
	mulpd	 %xmm13, %xmm2

	addpd	 %xmm0, %xmm1
	addpd	 %xmm2, %xmm3
#endif

#ifdef LT
	movlpd	 0 * SIZE(AO), %xmm8
	movhpd	 0 * SIZE(AO), %xmm8
	movlpd	 1 * SIZE(AO), %xmm9
	movhpd	 1 * SIZE(AO), %xmm9
	movlpd	 2 * SIZE(AO), %xmm10
	movhpd	 2 * SIZE(AO), %xmm10
	movlpd	 3 * SIZE(AO), %xmm11
	movhpd	 3 * SIZE(AO), %xmm11
	movlpd	 6 * SIZE(AO), %xmm12
	movhpd	 6 * SIZE(AO), %xmm12
	movlpd	 7 * SIZE(AO), %xmm13
	movhpd	 7 * SIZE(AO), %xmm13

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm3, %xmm2

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm2

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0
	mulpd	 %xmm8, %xmm3
	mulpd	 %xmm9, %xmm2

	addpd	 %xmm0, %xmm1
	addpd	 %xmm2, %xmm3

	movapd	 %xmm1, %xmm0
	movapd	 %xmm3, %xmm2
	pshufd	 $0x4e, %xmm1, %xmm4
	pshufd	 $0x4e, %xmm3, %xmm6

	xorpd	 %xmm15, %xmm4
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm10, %xmm2
	mulpd	 %xmm11, %xmm4
	mulpd	 %xmm11, %xmm6

	subpd	 %xmm0, %xmm5
	subpd	 %xmm2, %xmm7
	subpd	 %xmm4, %xmm5
	subpd	 %xmm6, %xmm7

	pshufd	$0x4e, %xmm5, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm15, %xmm4
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm12, %xmm5
	mulpd	 %xmm13, %xmm4
	mulpd	 %xmm12, %xmm7
	mulpd	 %xmm13, %xmm6

	addpd	 %xmm4, %xmm5
	addpd	 %xmm6, %xmm7
#endif

#ifdef RN
	movlpd	 0 * SIZE(B), %xmm8
	movhpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movhpd	 1 * SIZE(B), %xmm9
	movlpd	 2 * SIZE(B), %xmm10
	movhpd	 2 * SIZE(B), %xmm10
	movlpd	 3 * SIZE(B), %xmm11
	movhpd	 3 * SIZE(B), %xmm11
	movlpd	 6 * SIZE(B), %xmm12
	movhpd	 6 * SIZE(B), %xmm12
	movlpd	 7 * SIZE(B), %xmm13
	movhpd	 7 * SIZE(B), %xmm13

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm9, %xmm4

	addpd	 %xmm0, %xmm1
	addpd	 %xmm4, %xmm5

	movapd	 %xmm1, %xmm0
	movapd	 %xmm5, %xmm2
	pshufd	 $0x4e, %xmm1, %xmm4
	pshufd	 $0x4e, %xmm5, %xmm6

	xorpd	 %xmm15, %xmm4
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm10, %xmm2
	mulpd	 %xmm11, %xmm4
	mulpd	 %xmm11, %xmm6

	subpd	 %xmm0, %xmm3
	subpd	 %xmm2, %xmm7
	subpd	 %xmm4, %xmm3
	subpd	 %xmm6, %xmm7

	pshufd	$0x4e, %xmm3, %xmm2
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm15, %xmm2
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm12, %xmm3
	mulpd	 %xmm13, %xmm2
	mulpd	 %xmm12, %xmm7
	mulpd	 %xmm13, %xmm6

	addpd	 %xmm2, %xmm3
	addpd	 %xmm6, %xmm7
#endif

#ifdef RT
	movlpd	 6 * SIZE(B), %xmm8
	movhpd	 6 * SIZE(B), %xmm8
	movlpd	 7 * SIZE(B), %xmm9
	movhpd	 7 * SIZE(B), %xmm9
	movlpd	 4 * SIZE(B), %xmm10
	movhpd	 4 * SIZE(B), %xmm10
	movlpd	 5 * SIZE(B), %xmm11
	movhpd	 5 * SIZE(B), %xmm11
	movlpd	 0 * SIZE(B), %xmm12
	movhpd	 0 * SIZE(B), %xmm12
	movlpd	 1 * SIZE(B), %xmm13
	movhpd	 1 * SIZE(B), %xmm13

	pshufd	$0x4e, %xmm3, %xmm2
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm15, %xmm2
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm8, %xmm3
	mulpd	 %xmm9, %xmm2
	mulpd	 %xmm8, %xmm7
	mulpd	 %xmm9, %xmm6

	addpd	 %xmm2, %xmm3
	addpd	 %xmm6, %xmm7

	movapd	 %xmm3, %xmm0
	movapd	 %xmm7, %xmm2
	pshufd	 $0x4e, %xmm3, %xmm4
	pshufd	 $0x4e, %xmm7, %xmm6

	xorpd	 %xmm15, %xmm4
	xorpd	 %xmm15, %xmm6

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm10, %xmm2
	mulpd	 %xmm11, %xmm4
	mulpd	 %xmm11, %xmm6

	subpd	 %xmm0, %xmm1
	subpd	 %xmm2, %xmm5
	subpd	 %xmm4, %xmm1
	subpd	 %xmm6, %xmm5

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm12, %xmm1
	mulpd	 %xmm13, %xmm0
	mulpd	 %xmm12, %xmm5
	mulpd	 %xmm13, %xmm4

	addpd	 %xmm0, %xmm1
	addpd	 %xmm4, %xmm5
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
	subq	$4 * SIZE, CO2
#endif
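
/* Store the solved 2x2 block to C and write it back into the packed
   buffers so subsequent panels see the updated values. */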

	movsd	%xmm1,   0 * SIZE(CO1)
	movhpd	%xmm1,   1 * SIZE(CO1)
	movsd	%xmm5,   2 * SIZE(CO1)
	movhpd	%xmm5,   3 * SIZE(CO1)

	movsd	%xmm3,   0 * SIZE(CO2)
	movhpd	%xmm3,   1 * SIZE(CO2)
	movsd	%xmm7,   2 * SIZE(CO2)
	movhpd	%xmm7,   3 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(B)
	movapd	%xmm3,   2 * SIZE(B)
	movapd	%xmm5,   4 * SIZE(B)
	movapd	%xmm7,   6 * SIZE(B)

	movlpd	%xmm1,   0 * SIZE(BO)
	movlpd	%xmm1,   1 * SIZE(BO)
	movhpd	%xmm1,   2 * SIZE(BO)
	movhpd	%xmm1,   3 * SIZE(BO)
	movlpd	%xmm3,   4 * SIZE(BO)
	movlpd	%xmm3,   5 * SIZE(BO)
	movhpd	%xmm3,   6 * SIZE(BO)
	movhpd	%xmm3,   7 * SIZE(BO)
	movlpd	%xmm5,   8 * SIZE(BO)
	movlpd	%xmm5,   9 * SIZE(BO)
	movhpd	%xmm5,  10 * SIZE(BO)
	movhpd	%xmm5,  11 * SIZE(BO)
	movlpd	%xmm7,  12 * SIZE(BO)
	movlpd	%xmm7,  13 * SIZE(BO)
	movhpd	%xmm7,  14 * SIZE(BO)
	movhpd	%xmm7,  15 * SIZE(BO)
#else
	movapd	%xmm1,   0 * SIZE(AO)
	movapd	%xmm5,   2 * SIZE(AO)
	movapd	%xmm3,   4 * SIZE(AO)
	movapd	%xmm7,   6 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#ifdef LT
	addq	$8 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$2, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L10
	ALIGN_4

.L30:
	testq	$1, M
	jle	.L99
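
/* M is odd: process the remaining single complex row against both
   columns of the panel. */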

#ifdef LN
       movq	K, %rax
       salq	$0 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	addq	%rax, AO
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L42

.L41:
	movapd	 0 * SIZE(AO), %xmm8

	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0

	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1

	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2

	movapd	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm3

	movapd	 2 * SIZE(AO), %xmm8

	movapd	 8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0

	movapd	10 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1

	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2

	movapd	14 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm3

	movapd	 4 * SIZE(AO), %xmm8

	movapd	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0

	movapd	18 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1

	movapd	20 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2

	movapd	22 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm3

	movapd	 6 * SIZE(AO), %xmm8

	movapd	24 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0

	movapd	26 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1

	movapd	28 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2

	movapd	30 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm3

	addq   $ 8 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L41

.L42:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV,  %xmm15
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L44

.L43:
	movapd	 0 * SIZE(AO), %xmm8

	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0

	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1

	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2

	movapd	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm3

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$8 * SIZE, BO		# boffset1 += 8

	decq	%rax
	jg	.L43
	ALIGN_4

.L44:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), B
	leaq	(BO, %rax, 4), BO
#endif

	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm15, %xmm1
	xorpd	%xmm15, %xmm3
#else
	xorpd	%xmm15, %xmm0
	xorpd	%xmm15, %xmm2
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
#else
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(B), %xmm1
	movapd	 2 * SIZE(B), %xmm3

	subpd	%xmm0,  %xmm1
	subpd	%xmm2,  %xmm3
#else
	movapd	 0 * SIZE(AO), %xmm1
	movapd	 2 * SIZE(AO), %xmm3

	subpd	%xmm0,  %xmm1
	subpd	%xmm2,  %xmm3
#endif

#ifndef CONJ
	SHUFPD_1 %xmm15, %xmm15
#endif

#if defined(LN) || defined(LT)
	movlpd	 0 * SIZE(AO), %xmm8
	movhpd	 0 * SIZE(AO), %xmm8
	movlpd	 1 * SIZE(AO), %xmm9
	movhpd	 1 * SIZE(AO), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm3, %xmm2

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm2

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0
	mulpd	 %xmm8, %xmm3
	mulpd	 %xmm9, %xmm2

	addpd	 %xmm0, %xmm1
	addpd	 %xmm2, %xmm3
#endif

#ifdef RN
	movlpd	 0 * SIZE(B), %xmm8
	movhpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movhpd	 1 * SIZE(B), %xmm9
	movlpd	 2 * SIZE(B), %xmm10
	movhpd	 2 * SIZE(B), %xmm10
	movlpd	 3 * SIZE(B), %xmm11
	movhpd	 3 * SIZE(B), %xmm11
	movlpd	 6 * SIZE(B), %xmm12
	movhpd	 6 * SIZE(B), %xmm12
	movlpd	 7 * SIZE(B), %xmm13
	movhpd	 7 * SIZE(B), %xmm13

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0

	addpd	 %xmm0, %xmm1

	movapd	 %xmm1, %xmm0
	pshufd	 $0x4e, %xmm1, %xmm4

	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm11, %xmm4

	subpd	 %xmm0, %xmm3
	subpd	 %xmm4, %xmm3

	pshufd	$0x4e, %xmm3, %xmm2

	xorpd	 %xmm15, %xmm2

	mulpd	 %xmm12, %xmm3
	mulpd	 %xmm13, %xmm2

	addpd	 %xmm2, %xmm3
#endif

#ifdef RT
	movlpd	 6 * SIZE(B), %xmm8
	movhpd	 6 * SIZE(B), %xmm8
	movlpd	 7 * SIZE(B), %xmm9
	movhpd	 7 * SIZE(B), %xmm9
	movlpd	 4 * SIZE(B), %xmm10
	movhpd	 4 * SIZE(B), %xmm10
	movlpd	 5 * SIZE(B), %xmm11
	movhpd	 5 * SIZE(B), %xmm11
	movlpd	 0 * SIZE(B), %xmm12
	movhpd	 0 * SIZE(B), %xmm12
	movlpd	 1 * SIZE(B), %xmm13
	movhpd	 1 * SIZE(B), %xmm13

	pshufd	$0x4e, %xmm3, %xmm2

	xorpd	 %xmm15, %xmm2

	mulpd	 %xmm8, %xmm3
	mulpd	 %xmm9, %xmm2

	addpd	 %xmm2, %xmm3

	movapd	 %xmm3, %xmm0
	pshufd	 $0x4e, %xmm3, %xmm4

	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm11, %xmm4

	subpd	 %xmm0, %xmm1
	subpd	 %xmm4, %xmm1

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm12, %xmm1
	mulpd	 %xmm13, %xmm0

	addpd	 %xmm0, %xmm1
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

	movsd	%xmm1,   0 * SIZE(CO1)
	movhpd	%xmm1,   1 * SIZE(CO1)

	movsd	%xmm3,   0 * SIZE(CO2)
	movhpd	%xmm3,   1 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(B)
	movapd	%xmm3,   2 * SIZE(B)

	movlpd	%xmm1,   0 * SIZE(BO)
	movlpd	%xmm1,   1 * SIZE(BO)
	movhpd	%xmm1,   2 * SIZE(BO)
	movhpd	%xmm1,   3 * SIZE(BO)
	movlpd	%xmm3,   4 * SIZE(BO)
	movlpd	%xmm3,   5 * SIZE(BO)
	movhpd	%xmm3,   6 * SIZE(BO)
	movhpd	%xmm3,   7 * SIZE(BO)
#else
	movapd	%xmm1,   0 * SIZE(AO)
	movapd	%xmm3,   2 * SIZE(AO)

#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#ifdef LT
	addq	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L99:
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 4), B
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B,  %rax, 2 * COMPSIZE), B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J			# j --
	jg	.L01

.L100:
	testq	$1, N
	jle	.L999
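
/* N is odd: the remaining single column repeats the pack / multiply /
   solve sequence with one column of accumulators. */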

.L101:
#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

#ifdef RT
       movq	K, %rax
       salq	$0 + ZBASE_SHIFT, %rax
       subq	%rax, B
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	B, BORIG
	salq	$ZBASE_SHIFT, %rax
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

#if defined(LT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	jle	.L103
	ALIGN_4

.L102:
	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3
	movlpd	 4 * SIZE(B), %xmm4
	movlpd	 5 * SIZE(B), %xmm5
	movlpd	 6 * SIZE(B), %xmm6
	movlpd	 7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BO)
	movlpd	%xmm0,  1 * SIZE(BO)
	movlpd	%xmm1,  2 * SIZE(BO)
	movlpd	%xmm1,  3 * SIZE(BO)
	movlpd	%xmm2,  4 * SIZE(BO)
	movlpd	%xmm2,  5 * SIZE(BO)
	movlpd	%xmm3,  6 * SIZE(BO)
	movlpd	%xmm3,  7 * SIZE(BO)
	movlpd	%xmm4,  8 * SIZE(BO)
	movlpd	%xmm4,  9 * SIZE(BO)
	movlpd	%xmm5, 10 * SIZE(BO)
	movlpd	%xmm5, 11 * SIZE(BO)
	movlpd	%xmm6, 12 * SIZE(BO)
	movlpd	%xmm6, 13 * SIZE(BO)
	movlpd	%xmm7, 14 * SIZE(BO)
	movlpd	%xmm7, 15 * SIZE(BO)

	subq	$-16 * SIZE, BO
	addq	$ 8 * SIZE, B
	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1

	movlpd	%xmm0,  0 * SIZE(BO)
	movlpd	%xmm0,  1 * SIZE(BO)
	movlpd	%xmm1,  2 * SIZE(BO)
	movlpd	%xmm1,  3 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       subq	LDC, C
#endif

	movq	C, CO1
#ifndef RT
	addq	LDC, C
#endif

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L130
	ALIGN_4

.L110:
#ifdef LN
       movq	K, %rax
       salq	$1 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	PREFETCHW      4 * SIZE(CO1)

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L112

.L111:
	movapd	 0 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	 2 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	movapd	 4 * SIZE(AO), %xmm8
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	 6 * SIZE(AO), %xmm8
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	movapd	 8 * SIZE(AO), %xmm8
	movapd	 8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	10 * SIZE(AO), %xmm8
	movapd	 8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	movapd	12 * SIZE(AO), %xmm8
	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	14 * SIZE(AO), %xmm8
	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	addq   $16 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV,  %xmm15
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L114

.L113:
	movapd	 0 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	 2 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L113
	ALIGN_4

.L114:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm15, %xmm1
	xorpd	%xmm15, %xmm5
#else
	xorpd	%xmm15, %xmm0
	xorpd	%xmm15, %xmm4
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm1, %xmm0
	subpd	%xmm5, %xmm4
#else
	addpd	%xmm1, %xmm0
	addpd	%xmm5, %xmm4
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(B), %xmm1
	movapd	 2 * SIZE(B), %xmm5

	subpd	%xmm0,  %xmm1
	subpd	%xmm4,  %xmm5
#else
	movapd	 0 * SIZE(AO), %xmm1
	movapd	 2 * SIZE(AO), %xmm5

	subpd	%xmm0,  %xmm1
	subpd	%xmm4,  %xmm5
#endif

#ifndef CONJ
	SHUFPD_1 %xmm15, %xmm15
#endif

#ifdef LN
	movlpd	 6 * SIZE(AO), %xmm8
	movhpd	 6 * SIZE(AO), %xmm8
	movlpd	 7 * SIZE(AO), %xmm9
	movhpd	 7 * SIZE(AO), %xmm9
	movlpd	 4 * SIZE(AO), %xmm10
	movhpd	 4 * SIZE(AO), %xmm10
	movlpd	 5 * SIZE(AO), %xmm11
	movhpd	 5 * SIZE(AO), %xmm11
	movlpd	 0 * SIZE(AO), %xmm12
	movhpd	 0 * SIZE(AO), %xmm12
	movlpd	 1 * SIZE(AO), %xmm13
	movhpd	 1 * SIZE(AO), %xmm13

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm9, %xmm4

	addpd	 %xmm4, %xmm5

	movapd	 %xmm5, %xmm0
	pshufd	 $0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm11, %xmm4

	subpd	 %xmm0, %xmm1
	subpd	 %xmm4, %xmm1

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm12, %xmm1
	mulpd	 %xmm13, %xmm0

	addpd	 %xmm0, %xmm1
#endif

#ifdef LT
	movlpd	 0 * SIZE(AO), %xmm8
	movhpd	 0 * SIZE(AO), %xmm8
	movlpd	 1 * SIZE(AO), %xmm9
	movhpd	 1 * SIZE(AO), %xmm9
	movlpd	 2 * SIZE(AO), %xmm10
	movhpd	 2 * SIZE(AO), %xmm10
	movlpd	 3 * SIZE(AO), %xmm11
	movhpd	 3 * SIZE(AO), %xmm11
	movlpd	 6 * SIZE(AO), %xmm12
	movhpd	 6 * SIZE(AO), %xmm12
	movlpd	 7 * SIZE(AO), %xmm13
	movhpd	 7 * SIZE(AO), %xmm13

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0

	addpd	 %xmm0, %xmm1

	movapd	 %xmm1, %xmm0
	pshufd	 $0x4e, %xmm1, %xmm4

	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm10, %xmm0
	mulpd	 %xmm11, %xmm4

	subpd	 %xmm0, %xmm5
	subpd	 %xmm4, %xmm5

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm12, %xmm5
	mulpd	 %xmm13, %xmm4

	addpd	 %xmm4, %xmm5
#endif

#ifdef RN
	movlpd	 0 * SIZE(B), %xmm8
	movhpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movhpd	 1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm9, %xmm4

	addpd	 %xmm0, %xmm1
	addpd	 %xmm4, %xmm5
#endif

#ifdef RT
	movlpd	 0 * SIZE(B), %xmm8
	movhpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movhpd	 1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm15, %xmm0
	xorpd	 %xmm15, %xmm4

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm9, %xmm4

	addpd	 %xmm0, %xmm1
	addpd	 %xmm4, %xmm5
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
#endif

	movsd	%xmm1,   0 * SIZE(CO1)
	movhpd	%xmm1,   1 * SIZE(CO1)
	movsd	%xmm5,   2 * SIZE(CO1)
	movhpd	%xmm5,   3 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(B)
	movapd	%xmm5,   2 * SIZE(B)

	movlpd	%xmm1,   0 * SIZE(BO)
	movlpd	%xmm1,   1 * SIZE(BO)
	movhpd	%xmm1,   2 * SIZE(BO)
	movhpd	%xmm1,   3 * SIZE(BO)
	movlpd	%xmm5,   4 * SIZE(BO)
	movlpd	%xmm5,   5 * SIZE(BO)
	movhpd	%xmm5,   6 * SIZE(BO)
	movhpd	%xmm5,   7 * SIZE(BO)
#else
	movapd	%xmm1,   0 * SIZE(AO)
	movapd	%xmm5,   2 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#ifdef LT
	addq	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$2, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L110
	ALIGN_4

.L130:
	testq	$1, M
	jle	.L199
	ALIGN_4
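
/* m and n remainders: a single 1x1 complex element is left. */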

.L140:
#ifdef LN
       movq	K, %rax
       salq	$0 + ZBASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L142

.L141:
	movapd	 0 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	 2 * SIZE(AO), %xmm8
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3

	movapd	 4 * SIZE(AO), %xmm8
	movapd	 8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	 6 * SIZE(AO), %xmm8
	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3

	addq   $8  * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L141

.L142:
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	movapd	POSINV, %xmm15

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L144

.L143:
	movapd	 0 * SIZE(AO), %xmm8
	movapd	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L143
	ALIGN_4

.L144:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm1, %xmm1

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm15, %xmm1
#else
	xorpd	%xmm15, %xmm0
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm1, %xmm0
#else
	addpd	%xmm1, %xmm0
#endif


#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(B), %xmm1

	subpd	%xmm0,  %xmm1
#else
	movapd	 0 * SIZE(AO), %xmm1

	subpd	%xmm0,  %xmm1
#endif

#ifndef CONJ
	SHUFPD_1 %xmm15, %xmm15
#endif

#ifdef LN
	movlpd	 0 * SIZE(AO), %xmm8
	movhpd	 0 * SIZE(AO), %xmm8
	movlpd	 1 * SIZE(AO), %xmm9
	movhpd	 1 * SIZE(AO), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0

	addpd	 %xmm0, %xmm1
#endif

#ifdef LT
	movlpd	 0 * SIZE(AO), %xmm8
	movhpd	 0 * SIZE(AO), %xmm8
	movlpd	 1 * SIZE(AO), %xmm9
	movhpd	 1 * SIZE(AO), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0

	addpd	 %xmm0, %xmm1
#endif

#ifdef RN
	movlpd	 0 * SIZE(B), %xmm8
	movhpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movhpd	 1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0

	addpd	 %xmm0, %xmm1
#endif

#ifdef RT
	movlpd	 0 * SIZE(B), %xmm8
	movhpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movhpd	 1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	 %xmm15, %xmm0

	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm9, %xmm0

	addpd	 %xmm0, %xmm1
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
#endif

	movsd	%xmm1,   0 * SIZE(CO1)
	movhpd	%xmm1,   1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(B)

	movlpd	%xmm1,   0 * SIZE(BO)
	movlpd	%xmm1,   1 * SIZE(BO)
	movhpd	%xmm1,   2 * SIZE(BO)
	movhpd	%xmm1,   3 * SIZE(BO)
#else
	movapd	%xmm1,   0 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#ifdef LT
	addq	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L199:
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B,  %rax, 1 * COMPSIZE), B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

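/* Restore callee-saved registers (plus rdi/rsi and xmm6-xmm15 on
   Windows) and return. */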
.L999:
	movq	%rbx, %rsp
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE