/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/
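/* CTRSM-style kernel: single-precision complex triangular solve,   */
/* 2x2 unrolling, x86-32 SSE3.  The LN/LT/RN/RT solve variant and   */
/* optional conjugation (CONJ) are selected at build time.          */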

#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

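/* Incoming stack arguments (4(%esp) is the return address; the gap */
/* at offsets 16-20 presumably holds the unused complex alpha).     */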
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define ARG_B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define ARG_LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)

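/* Locals kept in the ARGS scratch area of the frame. */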
#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

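/* Per-core prefetch instruction and look-ahead distance. */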
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHSIZE 84
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE (16 * 2)
#endif

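/* Symbolic names for the working registers. */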
#define B	%edi
#define	LDC	%ebp
#define AA	%edx
#define BB	%ecx
#define CO1	%esi

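/* Both accumulate ops are plain addps here; the complex sign       */
/* corrections are applied after the loops via pxor with a mask.    */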
#define ADD1	  addps
#define ADD2	  addps

	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC
	movl	OFFSET, %eax
#ifdef RN
	negl	%eax
#endif
	movl	%eax, KK

	movl	M,    %ebx
	testl	%ebx, %ebx
	jle	.L999

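/* Bias A and B by 32 elements so the inner loops can address with  */
/* small negative displacements (subl $-32 * SIZE adds 32 * SIZE).  */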
	subl	$-32 * SIZE, A
	subl	$-32 * SIZE, B

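/* Scale LDC from complex elements to bytes. */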
	sall	$ZBASE_SHIFT, LDC

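/* LN starts at the bottom edge of A and C; RT starts at the right  */
/* edge of B and C, so bias the pointers to the far end.            */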
#ifdef LN
       movl	M, %eax
       sall	$ZBASE_SHIFT, %eax
       addl	%eax, C
       imull	K, %eax
       addl	%eax, A
#endif

#ifdef RT
       movl	N, %eax
       sall	$ZBASE_SHIFT, %eax
       imull	K, %eax
       addl	%eax, B

       movl	N, %eax
       imull	LDC, %eax
       addl	%eax, C
#endif

#ifdef RN
	negl	KK
#endif

#ifdef RT
       movl	N, %eax
       subl	OFFSET, %eax
       movl	%eax, KK
#endif

	movl	N, %eax
	movl	%eax, J
	sarl	$1, J
	jle	.L100
	ALIGN_4

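/* Outer loop: two columns of C per iteration (j = n / 2). */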
.L01:
#if defined(LT) || defined(RN)
	movl	A, %eax
	movl	%eax, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

       leal	(, LDC, 2), %eax

#ifdef RT
       subl	%eax, C
#endif
	movl	C,  CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	sarl	$1, %ebx
	jle	.L30
	ALIGN_4

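/* Middle loop: two rows of C per iteration (i = m / 2). */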
.L10:
#ifdef LN
       movl	K, %eax
       sall	$1 + ZBASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

#ifdef LN
	pxor	%xmm4, %xmm4
	prefetcht0     -4 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0     -4 * SIZE(CO1, LDC)
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
#else
	pxor	%xmm4, %xmm4
	prefetcht0	3 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	3 * SIZE(CO1, LDC)
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4

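/* Accumulation loop, eight k iterations unrolled: xmm4-xmm7 collect */
/* the partial products of the 2x2 complex block, with pshufd        */
/* reordering the B values for each real/imaginary combination.      */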
.L11:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AA)

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-24 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-20 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	 -8 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	 -8 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	 -4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	 -4 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	subl   $-32 * SIZE, BB
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	subl   $-32 * SIZE, AA
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-32 * SIZE(AA), %xmm0

	decl   %eax
	jne    .L11
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L14
	ALIGN_4

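/* Remainder loop: one k iteration at a time. */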
.L13:
	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2

	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L13
	ALIGN_4

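/* End of accumulation: reposition AA/BB for the solve, build the   */
/* sign mask in xmm0, fold the partial products, and solve the 2x2  */
/* triangular system for this block.                                */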
.L14:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 2), BB
#endif

	ADD2	%xmm2, %xmm7
	pcmpeqb	%xmm0, %xmm0
	ADD1	%xmm3, %xmm6
	psllq	$63,   %xmm0

#ifndef CONJ
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6

	shufps	$0xb1, %xmm0, %xmm0
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#else
	pshufd	$0xb1, %xmm0, %xmm1

	pxor	%xmm1, %xmm5
	pxor	%xmm1, %xmm7
#endif
#endif

	haddps	%xmm5, %xmm4
	haddps	%xmm7, %xmm6

	shufps	$0xd8, %xmm4, %xmm4
	shufps	$0xd8, %xmm6, %xmm6

	movaps	%xmm4, %xmm5
	shufps	$0xe4, %xmm6, %xmm4
	shufps	$0xe4, %xmm5, %xmm6

#if defined(LN) || defined(LT)
	movaps	%xmm4,  %xmm5
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5

	movaps	-32 * SIZE(BB), %xmm2
	movaps	-28 * SIZE(BB), %xmm3

	subps	%xmm4,  %xmm2
	subps	%xmm5,  %xmm3
#else
	movaps	-32 * SIZE(AA), %xmm1
	movaps	-28 * SIZE(AA), %xmm5

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm5
#endif

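/* LN: solve against the 2x2 diagonal block of A, second row first. */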
#ifdef LN
	movaps	-28 * SIZE(AA), %xmm5

	pshufd	 $0xee, %xmm5, %xmm6
	pshufd	 $0xbb, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm3, %xmm4
	pshufd	 $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	 %xmm0, %xmm3
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4,  %xmm3

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm3, %xmm4
	pshufd	 $0xf5, %xmm3, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4,  %xmm2
	subps	%xmm1,  %xmm2

	movaps	-32 * SIZE(AA), %xmm5

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4,  %xmm2
#endif

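/* LT: solve against the 2x2 diagonal block of A, first row first. */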
#ifdef LT
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4,  %xmm2

	pshufd	 $0xee, %xmm5, %xmm6
	pshufd	 $0xbb, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4,  %xmm3
	subps	%xmm1,  %xmm3

	movaps	-28 * SIZE(AA), %xmm5

	pshufd	 $0xee, %xmm5, %xmm6
	pshufd	 $0xbb, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm3, %xmm4
	pshufd	 $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	 %xmm0, %xmm3
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7,  %xmm3
	addps	%xmm4,  %xmm3
#endif

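/* RN: solve against the 2x2 diagonal block of B, first column first. */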
#ifdef RN
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm1

	addps	%xmm3,  %xmm1

	pshufd	 $0xee, %xmm4, %xmm6
	pshufd	 $0xbb, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm2

	subps	%xmm3,  %xmm5
	subps	%xmm2,  %xmm5

	movaps	-28 * SIZE(BB), %xmm4

	pshufd	 $0xee, %xmm4, %xmm6
	pshufd	 $0xbb, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm5, %xmm3
	pshufd	 $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	 %xmm0, %xmm5
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm5

	addps	%xmm3,  %xmm5
#endif

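/* RT: solve against the 2x2 diagonal block of B, second column first. */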
#ifdef RT
	movaps	-28 * SIZE(BB), %xmm4

	pshufd	 $0xee, %xmm4, %xmm6
	pshufd	 $0xbb, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm5, %xmm3
	pshufd	 $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	 %xmm0, %xmm5
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm5

	addps	%xmm3,  %xmm5

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm5, %xmm3
	pshufd	 $0xf5, %xmm5, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm2

	subps	%xmm3,  %xmm1
	subps	%xmm2,  %xmm1

	movaps	-32 * SIZE(BB), %xmm4

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm1

	addps	%xmm3,  %xmm1
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

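/* Store the solved block back into the packed buffer and into C. */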
#if defined(LN) || defined(LT)
	movaps	%xmm2,  -32 * SIZE(BB)
	movaps	%xmm3,  -28 * SIZE(BB)

	movlps	%xmm2,   0 * SIZE(CO1)
	movlps	%xmm3,   2 * SIZE(CO1)
	movhps	%xmm2,   0 * SIZE(CO1, LDC)
	movhps	%xmm3,   2 * SIZE(CO1, LDC)
#else
	movaps	%xmm1,  -32 * SIZE(AA)
	movaps	%xmm5,  -28 * SIZE(AA)

	movlps	%xmm1,   0 * SIZE(CO1)
	movhps	%xmm1,   2 * SIZE(CO1)

	movlps	%xmm5,   0 * SIZE(CO1, LDC)
	movhps	%xmm5,   2 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx
	jg	.L10
	ALIGN_4

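/* m-odd tail: one leftover row against two columns. */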
.L30:
	movl	M,  %ebx
	andl	$1, %ebx
	jle	.L99

#ifdef LN
       movl	K, %eax
       sall	$ZBASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L42
	ALIGN_4

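/* Unrolled k loop for the single row: pshufd splats each B value   */
/* across the multiply; xmm4-xmm7 accumulate.                       */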
.L41:
	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	PREFETCH (PREFETCHSIZE +  0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	 -8 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	 -4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	  0 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl   $-16 * SIZE, AA
	subl   $-32 * SIZE, BB
	decl	%eax
	jne	.L41
	ALIGN_4

.L42:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L44
	ALIGN_4

.L43:
	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L43
	ALIGN_4

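/* Fold the partial products and solve the 1x2 system. */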
.L44:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), BB
#endif

	addps	%xmm2, %xmm6
	addps	%xmm3, %xmm7

	pshufd	 $0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	pshufd	 $0xb1, %xmm7, %xmm7
	psllq	$63,   %xmm0

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0

	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6
#else
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#endif
#endif

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

#if defined(LN) || defined(LT)
	unpcklpd %xmm6, %xmm4

	movaps	-32 * SIZE(BB), %xmm2

	subps	%xmm4,  %xmm2
#else
	movsd	-32 * SIZE(AA), %xmm1
	movsd	-30 * SIZE(AA), %xmm5

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm5
#endif

#if defined(LN) || defined(LT)
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7,  %xmm2
	addps	%xmm4,  %xmm2
#endif

#ifdef RN
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm1

	addps	%xmm3,  %xmm1

	pshufd	 $0xee, %xmm4, %xmm6
	pshufd	 $0xbb, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm2

	subps	%xmm3,  %xmm5
	subps	%xmm2,  %xmm5

	movaps	-28 * SIZE(BB), %xmm4

	pshufd	 $0xee, %xmm4, %xmm6
	pshufd	 $0xbb, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm5, %xmm3
	pshufd	 $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	 %xmm0, %xmm5
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm5

	addps	%xmm3,  %xmm5
#endif

#ifdef RT
	movaps	-28 * SIZE(BB), %xmm4

	pshufd	 $0xee, %xmm4, %xmm6
	pshufd	 $0xbb, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm5, %xmm3
	pshufd	 $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	 %xmm0, %xmm5
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm5

	addps	%xmm3,  %xmm5

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm5, %xmm3
	pshufd	 $0xf5, %xmm5, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm2

	subps	%xmm3,  %xmm1
	subps	%xmm2,  %xmm1

	movaps	-32 * SIZE(BB), %xmm4

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm1

	addps	%xmm3,  %xmm1
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm2, -32 * SIZE(BB)

	movlps	%xmm2,   0 * SIZE(CO1)
	movhps	%xmm2,   0 * SIZE(CO1, LDC)
#else
	movlps	%xmm1, -32 * SIZE(AA)
	movlps	%xmm5, -30 * SIZE(AA)

	movlps	%xmm1,   0 * SIZE(CO1)
	movlps	%xmm5,   0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

.L99:
#ifdef LN
       movl	K, %eax
       sall	$1 + ZBASE_SHIFT, %eax
       addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4

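/* Tail column: handle the last column when n is odd. */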
.L100:
	movl	N, %eax
	andl	$1, %eax
	jle	.L999

#if defined(LT) || defined(RN)
	movl	A, %eax
	movl	%eax, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
       subl	LDC, C
#endif
	movl	C,  CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	sarl	$1, %ebx
	jle	.L130
	ALIGN_4

.L110:
#ifdef LN
       movl	K, %eax
       sall	$1 + ZBASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3
	movhps	-30 * SIZE(BB), %xmm1
	pxor	%xmm4, %xmm4
#ifdef LN
	prefetcht0	-4 * SIZE(CO1)
#else
	prefetcht0	 3 * SIZE(CO1)
#endif
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4

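/* Unrolled k loop: two rows against the single column. */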
.L111:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AA)
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-12 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	 -8 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	 -4 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	  0 * SIZE(AA), %xmm0

	subl   $-32 * SIZE, AA
	subl   $-16 * SIZE, BB

	decl   %eax
	jne	.L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L114
	ALIGN_4

.L113:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L113
	ALIGN_4

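/* Fold the partial products and solve the 2x1 system. */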
.L114:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 1), BB
#endif

	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

	pshufd	 $0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	psllq	$63,   %xmm0

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0

	pxor	%xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
#else
	pxor	%xmm0, %xmm5
#endif
#endif

	addps	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm5
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5

	movsd	-32 * SIZE(BB), %xmm2
	movsd	-30 * SIZE(BB), %xmm3

	subps	%xmm4,  %xmm2
	subps	%xmm5,  %xmm3
#else
	movaps	-32 * SIZE(AA), %xmm1

	subps	%xmm4,  %xmm1
#endif

#ifdef LN
	movaps	-28 * SIZE(AA), %xmm5

	pshufd	 $0xee, %xmm5, %xmm6
	pshufd	 $0xbb, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm3, %xmm4
	pshufd	 $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	 %xmm0, %xmm3
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4,  %xmm3

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm3, %xmm4
	pshufd	 $0xf5, %xmm3, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4,  %xmm2
	subps	%xmm1,  %xmm2

	movaps	-32 * SIZE(AA), %xmm5

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4,  %xmm2
#endif

#ifdef LT
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4,  %xmm2

	pshufd	 $0xee, %xmm5, %xmm6
	pshufd	 $0xbb, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4,  %xmm3
	subps	%xmm1,  %xmm3

	movaps	-28 * SIZE(AA), %xmm5

	pshufd	 $0xee, %xmm5, %xmm6
	pshufd	 $0xbb, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm3, %xmm4
	pshufd	 $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	 %xmm0, %xmm3
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7,  %xmm3
	addps	%xmm4,  %xmm3
#endif

#if defined(RN) || defined(RT)
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm1

	addps	%xmm3,  %xmm1
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlps	%xmm2, -32 * SIZE(BB)
	movlps	%xmm3, -30 * SIZE(BB)

	movlps	%xmm2,   0 * SIZE(CO1)
	movlps	%xmm3,   2 * SIZE(CO1)
#else
	movaps	%xmm1, -32 * SIZE(AA)

	movlps	%xmm1,   0 * SIZE(CO1)
	movhps	%xmm1,   2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L110
	ALIGN_4

.L130:
	movl	M,  %ebx
	andl	$1, %ebx
	jle	.L149

#ifdef LN
       movl	K, %eax
       sall	$ZBASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L142
	ALIGN_4

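/* Unrolled k loop for the 1x1 tail (m and n both odd). */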
.L141:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE +  0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-26 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-22 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-18 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl   $-16 * SIZE, AA
	subl   $-16 * SIZE, BB

	decl	%eax
	jne	.L141
	ALIGN_4

.L142:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L144
	ALIGN_4

.L143:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L143
	ALIGN_4

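/* Fold the partial products and solve the 1x1 system. */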
.L144:
#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), BB
#endif

	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

	pshufd	 $0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	psllq	$63,   %xmm0

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0

	pxor	%xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
#else
	pxor	%xmm0, %xmm5
#endif
#endif

	addps	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	movsd	-32 * SIZE(BB), %xmm2

	subps	%xmm4,  %xmm2
#else
	movsd	-32 * SIZE(AA), %xmm1

	subps	%xmm4,  %xmm1
#endif

#if defined(LN) || defined(LT)
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	 $0x44, %xmm5, %xmm6
	pshufd	 $0x11, %xmm5, %xmm7

	pshufd	 $0xa0, %xmm2, %xmm4
	pshufd	 $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	 %xmm0, %xmm2
#else
	xorps	 %xmm0, %xmm4
#endif

	mulps	%xmm6,  %xmm4
	mulps	%xmm7,  %xmm2
	addps	%xmm4,  %xmm2
#endif

#if defined(RN) || defined(RT)
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	 $0x44, %xmm4, %xmm6
	pshufd	 $0x11, %xmm4, %xmm7

	pshufd	 $0xa0, %xmm1, %xmm3
	pshufd	 $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	 %xmm0, %xmm1
#else
	xorps	 %xmm0, %xmm3
#endif

	mulps	%xmm6,  %xmm3
	mulps	%xmm7,  %xmm1

	addps	%xmm3,  %xmm1
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlps	%xmm2,  -32 * SIZE(BB)

	movlps	%xmm2,   0 * SIZE(CO1)
#else
	movlps	%xmm1,  -32 * SIZE(AA)

	movlps	%xmm1,   0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

.L149:
#ifdef LN
       movl	K, %eax
       sall	$ZBASE_SHIFT, %eax
       addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

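/* Restore callee-saved registers and return. */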
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE
